Page MenuHomeFreeBSD

No OneTemporary

This file is larger than 256 KB, so syntax highlighting was skipped.
Index: stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c
===================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 332525)
@@ -1,4186 +1,4756 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
*/
#include <stdio.h>
#include <unistd.h>
#include <stdio_ext.h>
#include <stdlib.h>
#include <ctype.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_sa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_pool.h>
#include <sys/dbuf.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/dmu_traverse.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/zfs_fuid.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/blkptr.h>
#include <zfs_comutil.h>
#include <libcmdutils.h>
#undef verify
#include <libzfs.h>
#include "zdb.h"
#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \
zio_compress_table[(idx)].ci_name : "UNKNOWN")
#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
zio_checksum_table[(idx)].ci_name : "UNKNOWN")
#define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \
dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \
- (((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ? \
- DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
+ (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \
+ DMU_OT_ZAP_OTHER : \
+ (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
+ DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)
#ifndef lint
extern int reference_tracking_enable;
extern boolean_t zfs_recover;
extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
extern int zfs_vdev_async_read_max_active;
#else
int reference_tracking_enable;
boolean_t zfs_recover;
uint64_t zfs_arc_max, zfs_arc_meta_limit;
int zfs_vdev_async_read_max_active;
#endif
static const char cmdname[] = "zdb";
uint8_t dump_opt[256];
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
static uint64_t *zopt_object = NULL;
static unsigned zopt_objects = 0;
static libzfs_handle_t *g_zfs;
static uint64_t max_inflight = 1000;
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
/*
* These libumem hooks provide a reasonable set of defaults for the allocator's
* debugging facilities.
*/
const char *
_umem_debug_init()
{
return ("default,verbose"); /* $UMEM_DEBUG setting */
}
const char *
_umem_logging_init(void)
{
return ("fail,contents"); /* $UMEM_LOGGING setting */
}
static void
usage(void)
{
(void) fprintf(stderr,
"Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] "
"[-I <inflight I/Os>]\n"
"\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
"\t\t[<poolname> [<object> ...]]\n"
"\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] <dataset> "
"[<object> ...]\n"
"\t%s -C [-A] [-U <cache>]\n"
"\t%s -l [-Aqu] <device>\n"
"\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
"[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
"\t%s -O <dataset> <path>\n"
"\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
"\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
"\t%s -E [-A] word0:word1:...:word15\n"
"\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
"<poolname>\n\n",
cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
cmdname, cmdname);
(void) fprintf(stderr, " Dataset name must include at least one "
"separator character '/' or '@'\n");
(void) fprintf(stderr, " If dataset name is specified, only that "
"dataset is dumped\n");
(void) fprintf(stderr, " If object numbers are specified, only "
"those objects are dumped\n\n");
(void) fprintf(stderr, " Options to control amount of output:\n");
(void) fprintf(stderr, " -b block statistics\n");
(void) fprintf(stderr, " -c checksum all metadata (twice for "
"all data) blocks\n");
(void) fprintf(stderr, " -C config (or cachefile if alone)\n");
(void) fprintf(stderr, " -d dataset(s)\n");
(void) fprintf(stderr, " -D dedup statistics\n");
(void) fprintf(stderr, " -E decode and display block from an "
"embedded block pointer\n");
(void) fprintf(stderr, " -h pool history\n");
(void) fprintf(stderr, " -i intent logs\n");
(void) fprintf(stderr, " -l read label contents\n");
(void) fprintf(stderr, " -L disable leak tracking (do not "
"load spacemaps)\n");
(void) fprintf(stderr, " -m metaslabs\n");
(void) fprintf(stderr, " -M metaslab groups\n");
(void) fprintf(stderr, " -O perform object lookups by path\n");
(void) fprintf(stderr, " -R read and display block from a "
"device\n");
(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
(void) fprintf(stderr, " -S simulate dedup to measure effect\n");
(void) fprintf(stderr, " -v verbose (applies to all "
"others)\n\n");
(void) fprintf(stderr, " Below options are intended for use "
"with other options:\n");
(void) fprintf(stderr, " -A ignore assertions (-A), enable "
"panic recovery (-AA) or both (-AAA)\n");
(void) fprintf(stderr, " -e pool is exported/destroyed/"
"has altroot/not in a cachefile\n");
(void) fprintf(stderr, " -F attempt automatic rewind within "
"safe range of transaction groups\n");
(void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before "
"exiting\n");
(void) fprintf(stderr, " -I <number of inflight I/Os> -- "
"specify the maximum number of "
"checksumming I/Os [default is 200]\n");
(void) fprintf(stderr, " -o <variable>=<value> set global "
"variable to an unsigned 32-bit integer value\n");
(void) fprintf(stderr, " -p <path> -- use one or more with "
"-e to specify path to vdev dir\n");
(void) fprintf(stderr, " -P print numbers in parseable form\n");
(void) fprintf(stderr, " -q don't print label contents\n");
(void) fprintf(stderr, " -t <txg> -- highest txg to use when "
"searching for uberblocks\n");
(void) fprintf(stderr, " -u uberblock\n");
(void) fprintf(stderr, " -U <cachefile_path> -- use alternate "
"cachefile\n");
(void) fprintf(stderr, " -V do verbatim import\n");
(void) fprintf(stderr, " -x <dumpdir> -- "
"dump all read blocks into specified directory\n");
(void) fprintf(stderr, " -X attempt extreme rewind (does not "
"work with dataset)\n\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
"to make only that option verbose\n");
(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
exit(1);
}
static void
dump_debug_buffer()
{
if (dump_opt['G']) {
(void) printf("\n");
zfs_dbgmsg_print("zdb");
}
}
/*
* Called for usage errors that are discovered after a call to spa_open(),
* dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
*/
static void
fatal(const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
(void) fprintf(stderr, "%s: ", cmdname);
(void) vfprintf(stderr, fmt, ap);
va_end(ap);
(void) fprintf(stderr, "\n");
dump_debug_buffer();
exit(1);
}
/* ARGSUSED */
static void
dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
{
nvlist_t *nv;
size_t nvsize = *(uint64_t *)data;
char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
umem_free(packed, nvsize);
dump_nvlist(nv, 8);
nvlist_free(nv);
}
/* ARGSUSED */
static void
dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
{
spa_history_phys_t *shp = data;
if (shp == NULL)
return;
(void) printf("\t\tpool_create_len = %llu\n",
(u_longlong_t)shp->sh_pool_create_len);
(void) printf("\t\tphys_max_off = %llu\n",
(u_longlong_t)shp->sh_phys_max_off);
(void) printf("\t\tbof = %llu\n",
(u_longlong_t)shp->sh_bof);
(void) printf("\t\teof = %llu\n",
(u_longlong_t)shp->sh_eof);
(void) printf("\t\trecords_lost = %llu\n",
(u_longlong_t)shp->sh_records_lost);
}
static void
zdb_nicenum(uint64_t num, char *buf, size_t buflen)
{
if (dump_opt['P'])
(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
else
nicenum(num, buf, sizeof (buf));
}
static const char histo_stars[] = "****************************************";
static const uint64_t histo_width = sizeof (histo_stars) - 1;
static void
dump_histogram(const uint64_t *histo, int size, int offset)
{
int i;
int minidx = size - 1;
int maxidx = 0;
uint64_t max = 0;
for (i = 0; i < size; i++) {
if (histo[i] > max)
max = histo[i];
if (histo[i] > 0 && i > maxidx)
maxidx = i;
if (histo[i] > 0 && i < minidx)
minidx = i;
}
if (max < histo_width)
max = histo_width;
for (i = minidx; i <= maxidx; i++) {
(void) printf("\t\t\t%3u: %6llu %s\n",
i + offset, (u_longlong_t)histo[i],
&histo_stars[(max - histo[i]) * histo_width / max]);
}
}
static void
dump_zap_stats(objset_t *os, uint64_t object)
{
int error;
zap_stats_t zs;
error = zap_get_stats(os, object, &zs);
if (error)
return;
if (zs.zs_ptrtbl_len == 0) {
ASSERT(zs.zs_num_blocks == 1);
(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
(u_longlong_t)zs.zs_blocksize,
(u_longlong_t)zs.zs_num_entries);
return;
}
(void) printf("\tFat ZAP stats:\n");
(void) printf("\t\tPointer table:\n");
(void) printf("\t\t\t%llu elements\n",
(u_longlong_t)zs.zs_ptrtbl_len);
(void) printf("\t\t\tzt_blk: %llu\n",
(u_longlong_t)zs.zs_ptrtbl_zt_blk);
(void) printf("\t\t\tzt_numblks: %llu\n",
(u_longlong_t)zs.zs_ptrtbl_zt_numblks);
(void) printf("\t\t\tzt_shift: %llu\n",
(u_longlong_t)zs.zs_ptrtbl_zt_shift);
(void) printf("\t\t\tzt_blks_copied: %llu\n",
(u_longlong_t)zs.zs_ptrtbl_blks_copied);
(void) printf("\t\t\tzt_nextblk: %llu\n",
(u_longlong_t)zs.zs_ptrtbl_nextblk);
(void) printf("\t\tZAP entries: %llu\n",
(u_longlong_t)zs.zs_num_entries);
(void) printf("\t\tLeaf blocks: %llu\n",
(u_longlong_t)zs.zs_num_leafs);
(void) printf("\t\tTotal blocks: %llu\n",
(u_longlong_t)zs.zs_num_blocks);
(void) printf("\t\tzap_block_type: 0x%llx\n",
(u_longlong_t)zs.zs_block_type);
(void) printf("\t\tzap_magic: 0x%llx\n",
(u_longlong_t)zs.zs_magic);
(void) printf("\t\tzap_salt: 0x%llx\n",
(u_longlong_t)zs.zs_salt);
(void) printf("\t\tLeafs with 2^n pointers:\n");
dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
(void) printf("\t\tBlocks with n*5 entries:\n");
dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
(void) printf("\t\tBlocks n/10 full:\n");
dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
(void) printf("\t\tEntries with n chunks:\n");
dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
(void) printf("\t\tBuckets with n entries:\n");
dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
}
/*ARGSUSED*/
static void
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
{
}
/*ARGSUSED*/
static void
dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
{
(void) printf("\tUNKNOWN OBJECT TYPE\n");
}
/*ARGSUSED*/
static void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
}
/*ARGSUSED*/
static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
}
/*ARGSUSED*/
static void
dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
zap_cursor_t zc;
zap_attribute_t attr;
void *prop;
unsigned i;
dump_zap_stats(os, object);
(void) printf("\n");
for (zap_cursor_init(&zc, os, object);
zap_cursor_retrieve(&zc, &attr) == 0;
zap_cursor_advance(&zc)) {
(void) printf("\t\t%s = ", attr.za_name);
if (attr.za_num_integers == 0) {
(void) printf("\n");
continue;
}
prop = umem_zalloc(attr.za_num_integers *
attr.za_integer_length, UMEM_NOFAIL);
(void) zap_lookup(os, object, attr.za_name,
attr.za_integer_length, attr.za_num_integers, prop);
if (attr.za_integer_length == 1) {
(void) printf("%s", (char *)prop);
} else {
for (i = 0; i < attr.za_num_integers; i++) {
switch (attr.za_integer_length) {
case 2:
(void) printf("%u ",
((uint16_t *)prop)[i]);
break;
case 4:
(void) printf("%u ",
((uint32_t *)prop)[i]);
break;
case 8:
(void) printf("%lld ",
(u_longlong_t)((int64_t *)prop)[i]);
break;
}
}
}
(void) printf("\n");
umem_free(prop, attr.za_num_integers * attr.za_integer_length);
}
zap_cursor_fini(&zc);
}
static void
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
{
bpobj_phys_t *bpop = data;
char bytes[32], comp[32], uncomp[32];
/* make sure the output won't get truncated */
CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
if (bpop == NULL)
return;
zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
(void) printf("\t\tnum_blkptrs = %llu\n",
(u_longlong_t)bpop->bpo_num_blkptrs);
(void) printf("\t\tbytes = %s\n", bytes);
if (size >= BPOBJ_SIZE_V1) {
(void) printf("\t\tcomp = %s\n", comp);
(void) printf("\t\tuncomp = %s\n", uncomp);
}
if (size >= sizeof (*bpop)) {
(void) printf("\t\tsubobjs = %llu\n",
(u_longlong_t)bpop->bpo_subobjs);
(void) printf("\t\tnum_subobjs = %llu\n",
(u_longlong_t)bpop->bpo_num_subobjs);
}
if (dump_opt['d'] < 5)
return;
for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) {
char blkbuf[BP_SPRINTF_LEN];
blkptr_t bp;
int err = dmu_read(os, object,
i * sizeof (bp), sizeof (bp), &bp, 0);
if (err != 0) {
(void) printf("got error %u from dmu_read\n", err);
break;
}
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
(void) printf("\t%s\n", blkbuf);
}
}
/* ARGSUSED */
static void
dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
{
dmu_object_info_t doi;
VERIFY0(dmu_object_info(os, object, &doi));
uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
if (err != 0) {
(void) printf("got error %u from dmu_read\n", err);
kmem_free(subobjs, doi.doi_max_offset);
return;
}
int64_t last_nonzero = -1;
for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) {
if (subobjs[i] != 0)
last_nonzero = i;
}
for (int64_t i = 0; i <= last_nonzero; i++) {
(void) printf("\t%llu\n", (longlong_t)subobjs[i]);
}
kmem_free(subobjs, doi.doi_max_offset);
}
/*ARGSUSED*/
static void
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
dump_zap_stats(os, object);
/* contents are printed elsewhere, properly decoded */
}
/*ARGSUSED*/
static void
dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
{
zap_cursor_t zc;
zap_attribute_t attr;
dump_zap_stats(os, object);
(void) printf("\n");
for (zap_cursor_init(&zc, os, object);
zap_cursor_retrieve(&zc, &attr) == 0;
zap_cursor_advance(&zc)) {
(void) printf("\t\t%s = ", attr.za_name);
if (attr.za_num_integers == 0) {
(void) printf("\n");
continue;
}
(void) printf(" %llx : [%d:%d:%d]\n",
(u_longlong_t)attr.za_first_integer,
(int)ATTR_LENGTH(attr.za_first_integer),
(int)ATTR_BSWAP(attr.za_first_integer),
(int)ATTR_NUM(attr.za_first_integer));
}
zap_cursor_fini(&zc);
}
/*ARGSUSED*/
static void
dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
{
zap_cursor_t zc;
zap_attribute_t attr;
uint16_t *layout_attrs;
unsigned i;
dump_zap_stats(os, object);
(void) printf("\n");
for (zap_cursor_init(&zc, os, object);
zap_cursor_retrieve(&zc, &attr) == 0;
zap_cursor_advance(&zc)) {
(void) printf("\t\t%s = [", attr.za_name);
if (attr.za_num_integers == 0) {
(void) printf("\n");
continue;
}
VERIFY(attr.za_integer_length == 2);
layout_attrs = umem_zalloc(attr.za_num_integers *
attr.za_integer_length, UMEM_NOFAIL);
VERIFY(zap_lookup(os, object, attr.za_name,
attr.za_integer_length,
attr.za_num_integers, layout_attrs) == 0);
for (i = 0; i != attr.za_num_integers; i++)
(void) printf(" %d ", (int)layout_attrs[i]);
(void) printf("]\n");
umem_free(layout_attrs,
attr.za_num_integers * attr.za_integer_length);
}
zap_cursor_fini(&zc);
}
/*ARGSUSED*/
static void
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
{
zap_cursor_t zc;
zap_attribute_t attr;
const char *typenames[] = {
/* 0 */ "not specified",
/* 1 */ "FIFO",
/* 2 */ "Character Device",
/* 3 */ "3 (invalid)",
/* 4 */ "Directory",
/* 5 */ "5 (invalid)",
/* 6 */ "Block Device",
/* 7 */ "7 (invalid)",
/* 8 */ "Regular File",
/* 9 */ "9 (invalid)",
/* 10 */ "Symbolic Link",
/* 11 */ "11 (invalid)",
/* 12 */ "Socket",
/* 13 */ "Door",
/* 14 */ "Event Port",
/* 15 */ "15 (invalid)",
};
dump_zap_stats(os, object);
(void) printf("\n");
for (zap_cursor_init(&zc, os, object);
zap_cursor_retrieve(&zc, &attr) == 0;
zap_cursor_advance(&zc)) {
(void) printf("\t\t%s = %lld (type: %s)\n",
attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
}
zap_cursor_fini(&zc);
}
static int
get_dtl_refcount(vdev_t *vd)
{
int refcount = 0;
if (vd->vdev_ops->vdev_op_leaf) {
space_map_t *sm = vd->vdev_dtl_sm;
if (sm != NULL &&
sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
return (1);
return (0);
}
for (unsigned c = 0; c < vd->vdev_children; c++)
refcount += get_dtl_refcount(vd->vdev_child[c]);
return (refcount);
}
static int
get_metaslab_refcount(vdev_t *vd)
{
int refcount = 0;
- if (vd->vdev_top == vd && !vd->vdev_removing) {
- for (unsigned m = 0; m < vd->vdev_ms_count; m++) {
+ if (vd->vdev_top == vd) {
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
space_map_t *sm = vd->vdev_ms[m]->ms_sm;
if (sm != NULL &&
sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
refcount++;
}
}
for (unsigned c = 0; c < vd->vdev_children; c++)
refcount += get_metaslab_refcount(vd->vdev_child[c]);
return (refcount);
}
static int
+get_obsolete_refcount(vdev_t *vd)
+{
+ int refcount = 0;
+
+ uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
+ if (vd->vdev_top == vd && obsolete_sm_obj != 0) {
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
+ obsolete_sm_obj, &doi));
+ if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
+ refcount++;
+ }
+ } else {
+ ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
+ ASSERT3U(obsolete_sm_obj, ==, 0);
+ }
+ for (unsigned c = 0; c < vd->vdev_children; c++) {
+ refcount += get_obsolete_refcount(vd->vdev_child[c]);
+ }
+
+ return (refcount);
+}
+
+static int
+get_prev_obsolete_spacemap_refcount(spa_t *spa)
+{
+ uint64_t prev_obj =
+ spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
+ if (prev_obj != 0) {
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
+ if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
+ return (1);
+ }
+ }
+ return (0);
+}
+
+static int
verify_spacemap_refcounts(spa_t *spa)
{
uint64_t expected_refcount = 0;
uint64_t actual_refcount;
(void) feature_get_refcount(spa,
&spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
&expected_refcount);
actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
+ actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
+ actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
if (expected_refcount != actual_refcount) {
(void) printf("space map refcount mismatch: expected %lld != "
"actual %lld\n",
(longlong_t)expected_refcount,
(longlong_t)actual_refcount);
return (2);
}
return (0);
}
static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
uint64_t alloc, offset, entry;
- const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
- "INVALID", "INVALID", "INVALID", "INVALID" };
+ char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
+ "INVALID", "INVALID", "INVALID", "INVALID" };
if (sm == NULL)
return;
+ (void) printf("space map object %llu:\n",
+ (longlong_t)sm->sm_phys->smp_object);
+ (void) printf(" smp_objsize = 0x%llx\n",
+ (longlong_t)sm->sm_phys->smp_objsize);
+ (void) printf(" smp_alloc = 0x%llx\n",
+ (longlong_t)sm->sm_phys->smp_alloc);
+
/*
* Print out the freelist entries in both encoded and decoded form.
*/
alloc = 0;
for (offset = 0; offset < space_map_length(sm);
offset += sizeof (entry)) {
uint8_t mapshift = sm->sm_shift;
VERIFY0(dmu_read(os, space_map_object(sm), offset,
sizeof (entry), &entry, DMU_READ_PREFETCH));
if (SM_DEBUG_DECODE(entry)) {
(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
(u_longlong_t)(offset / sizeof (entry)),
ddata[SM_DEBUG_ACTION_DECODE(entry)],
(u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
} else {
(void) printf("\t [%6llu] %c range:"
" %010llx-%010llx size: %06llx\n",
(u_longlong_t)(offset / sizeof (entry)),
SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
(u_longlong_t)((SM_OFFSET_DECODE(entry) <<
mapshift) + sm->sm_start),
(u_longlong_t)((SM_OFFSET_DECODE(entry) <<
mapshift) + sm->sm_start +
(SM_RUN_DECODE(entry) << mapshift)),
(u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
if (SM_TYPE_DECODE(entry) == SM_ALLOC)
alloc += SM_RUN_DECODE(entry) << mapshift;
else
alloc -= SM_RUN_DECODE(entry) << mapshift;
}
}
if (alloc != space_map_allocated(sm)) {
(void) printf("space_map_object alloc (%llu) INCONSISTENT "
"with space map summary (%llu)\n",
(u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
}
}
static void
dump_metaslab_stats(metaslab_t *msp)
{
char maxbuf[32];
range_tree_t *rt = msp->ms_tree;
avl_tree_t *t = &msp->ms_size_tree;
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
/* max sure nicenum has enough space */
CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);
zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf));
(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
"segments", avl_numnodes(t), "maxsize", maxbuf,
"freepct", free_pct);
(void) printf("\tIn-memory histogram:\n");
dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
static void
dump_metaslab(metaslab_t *msp)
{
vdev_t *vd = msp->ms_group->mg_vd;
spa_t *spa = vd->vdev_spa;
space_map_t *sm = msp->ms_sm;
char freebuf[32];
zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
sizeof (freebuf));
(void) printf(
"\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
(u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
(u_longlong_t)space_map_object(sm), freebuf);
if (dump_opt['m'] > 2 && !dump_opt['L']) {
mutex_enter(&msp->ms_lock);
metaslab_load_wait(msp);
if (!msp->ms_loaded) {
VERIFY0(metaslab_load(msp));
range_tree_stat_verify(msp->ms_tree);
}
dump_metaslab_stats(msp);
metaslab_unload(msp);
mutex_exit(&msp->ms_lock);
}
if (dump_opt['m'] > 1 && sm != NULL &&
spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
/*
* The space map histogram represents free space in chunks
* of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
*/
(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
(u_longlong_t)msp->ms_fragmentation);
dump_histogram(sm->sm_phys->smp_histogram,
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
- mutex_enter(&msp->ms_lock);
dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
- mutex_exit(&msp->ms_lock);
}
}
static void
print_vdev_metaslab_header(vdev_t *vd)
{
(void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n",
(u_longlong_t)vd->vdev_id,
"metaslabs", (u_longlong_t)vd->vdev_ms_count,
"offset", "spacemap", "free");
(void) printf("\t%15s %19s %15s %10s\n",
"---------------", "-------------------",
"---------------", "-------------");
}
static void
dump_metaslab_groups(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
metaslab_class_t *mc = spa_normal_class(spa);
uint64_t fragmentation;
metaslab_class_histogram_verify(mc);
for (unsigned c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
if (mg->mg_class != mc)
continue;
metaslab_group_histogram_verify(mg);
mg->mg_fragmentation = metaslab_group_fragmentation(mg);
(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
"fragmentation",
(u_longlong_t)tvd->vdev_id,
(u_longlong_t)tvd->vdev_ms_count);
if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
(void) printf("%3s\n", "-");
} else {
(void) printf("%3llu%%\n",
(u_longlong_t)mg->mg_fragmentation);
}
dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
(void) printf("\tpool %s\tfragmentation", spa_name(spa));
fragmentation = metaslab_class_fragmentation(mc);
if (fragmentation == ZFS_FRAG_INVALID)
(void) printf("\t%3s\n", "-");
else
(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
static void
+print_vdev_indirect(vdev_t *vd)
+{
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
+
+ if (vim == NULL) {
+ ASSERT3P(vib, ==, NULL);
+ return;
+ }
+
+ ASSERT3U(vdev_indirect_mapping_object(vim), ==,
+ vic->vic_mapping_object);
+ ASSERT3U(vdev_indirect_births_object(vib), ==,
+ vic->vic_births_object);
+
+ (void) printf("indirect births obj %llu:\n",
+ (longlong_t)vic->vic_births_object);
+ (void) printf(" vib_count = %llu\n",
+ (longlong_t)vdev_indirect_births_count(vib));
+ for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
+ vdev_indirect_birth_entry_phys_t *cur_vibe =
+ &vib->vib_entries[i];
+ (void) printf("\toffset %llx -> txg %llu\n",
+ (longlong_t)cur_vibe->vibe_offset,
+ (longlong_t)cur_vibe->vibe_phys_birth_txg);
+ }
+ (void) printf("\n");
+
+ (void) printf("indirect mapping obj %llu:\n",
+ (longlong_t)vic->vic_mapping_object);
+ (void) printf(" vim_max_offset = 0x%llx\n",
+ (longlong_t)vdev_indirect_mapping_max_offset(vim));
+ (void) printf(" vim_bytes_mapped = 0x%llx\n",
+ (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
+ (void) printf(" vim_count = %llu\n",
+ (longlong_t)vdev_indirect_mapping_num_entries(vim));
+
+ if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
+ return;
+
+ uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
+
+ for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
+ vdev_indirect_mapping_entry_phys_t *vimep =
+ &vim->vim_entries[i];
+ (void) printf("\t<%llx:%llx:%llx> -> "
+ "<%llx:%llx:%llx> (%x obsolete)\n",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
+ (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+ (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
+ (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
+ (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+ counts[i]);
+ }
+ (void) printf("\n");
+
+ uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
+ if (obsolete_sm_object != 0) {
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ (void) printf("obsolete space map object %llu:\n",
+ (u_longlong_t)obsolete_sm_object);
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
+ obsolete_sm_object);
+ dump_spacemap(mos, vd->vdev_obsolete_sm);
+ (void) printf("\n");
+ }
+}
+
+static void
dump_metaslabs(spa_t *spa)
{
vdev_t *vd, *rvd = spa->spa_root_vdev;
uint64_t m, c = 0, children = rvd->vdev_children;
(void) printf("\nMetaslabs:\n");
if (!dump_opt['d'] && zopt_objects > 0) {
c = zopt_object[0];
if (c >= children)
(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
if (zopt_objects > 1) {
vd = rvd->vdev_child[c];
print_vdev_metaslab_header(vd);
for (m = 1; m < zopt_objects; m++) {
if (zopt_object[m] < vd->vdev_ms_count)
dump_metaslab(
vd->vdev_ms[zopt_object[m]]);
else
(void) fprintf(stderr, "bad metaslab "
"number %llu\n",
(u_longlong_t)zopt_object[m]);
}
(void) printf("\n");
return;
}
children = c + 1;
}
for (; c < children; c++) {
vd = rvd->vdev_child[c];
print_vdev_metaslab_header(vd);
+ print_vdev_indirect(vd);
+
for (m = 0; m < vd->vdev_ms_count; m++)
dump_metaslab(vd->vdev_ms[m]);
(void) printf("\n");
}
}
static void
dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
{
const ddt_phys_t *ddp = dde->dde_phys;
const ddt_key_t *ddk = &dde->dde_key;
const char *types[4] = { "ditto", "single", "double", "triple" };
char blkbuf[BP_SPRINTF_LEN];
blkptr_t blk;
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
(void) printf("index %llx refcnt %llu %s %s\n",
(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
types[p], blkbuf);
}
}
static void
dump_dedup_ratio(const ddt_stat_t *dds)
{
double rL, rP, rD, D, dedup, compress, copies;
if (dds->dds_blocks == 0)
return;
rL = (double)dds->dds_ref_lsize;
rP = (double)dds->dds_ref_psize;
rD = (double)dds->dds_ref_dsize;
D = (double)dds->dds_dsize;
dedup = rD / D;
compress = rL / rP;
copies = rD / rP;
(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
"dedup * compress / copies = %.2f\n\n",
dedup, compress, copies, dedup * compress / copies);
}
static void
dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
char name[DDT_NAMELEN];
ddt_entry_t dde;
uint64_t walk = 0;
dmu_object_info_t doi;
uint64_t count, dspace, mspace;
int error;
error = ddt_object_info(ddt, type, class, &doi);
if (error == ENOENT)
return;
ASSERT(error == 0);
error = ddt_object_count(ddt, type, class, &count);
ASSERT(error == 0);
if (count == 0)
return;
dspace = doi.doi_physical_blocks_512 << 9;
mspace = doi.doi_fill_count * doi.doi_data_block_size;
ddt_object_name(ddt, type, class, name);
(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
name,
(u_longlong_t)count,
(u_longlong_t)(dspace / count),
(u_longlong_t)(mspace / count));
if (dump_opt['D'] < 3)
return;
zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
if (dump_opt['D'] < 4)
return;
if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
return;
(void) printf("%s contents:\n\n", name);
while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
dump_dde(ddt, &dde, walk);
ASSERT(error == ENOENT);
(void) printf("\n");
}
static void
dump_all_ddts(spa_t *spa)
{
ddt_histogram_t ddh_total;
ddt_stat_t dds_total;
bzero(&ddh_total, sizeof (ddh_total));
bzero(&dds_total, sizeof (dds_total));
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
for (enum ddt_class class = 0; class < DDT_CLASSES;
class++) {
dump_ddt(ddt, type, class);
}
}
}
ddt_get_dedup_stats(spa, &dds_total);
if (dds_total.dds_blocks == 0) {
(void) printf("All DDTs are empty\n");
return;
}
(void) printf("\n");
if (dump_opt['D'] > 1) {
(void) printf("DDT histogram (aggregated over all DDTs):\n");
ddt_get_dedup_histogram(spa, &ddh_total);
zpool_dump_ddt(&dds_total, &ddh_total);
}
dump_dedup_ratio(&dds_total);
}
static void
dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
{
char *prefix = arg;
(void) printf("%s [%llu,%llu) length %llu\n",
prefix,
(u_longlong_t)start,
(u_longlong_t)(start + size),
(u_longlong_t)(size));
}
static void
dump_dtl(vdev_t *vd, int indent)
{
spa_t *spa = vd->vdev_spa;
boolean_t required;
const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
"outage" };
char prefix[256];
spa_vdev_state_enter(spa, SCL_NONE);
required = vdev_dtl_required(vd);
(void) spa_vdev_state_exit(spa, NULL, 0);
if (indent == 0)
(void) printf("\nDirty time logs:\n\n");
(void) printf("\t%*s%s [%s]\n", indent, "",
vd->vdev_path ? vd->vdev_path :
vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
required ? "DTL-required" : "DTL-expendable");
for (int t = 0; t < DTL_TYPES; t++) {
range_tree_t *rt = vd->vdev_dtl[t];
if (range_tree_space(rt) == 0)
continue;
(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
indent + 2, "", name[t]);
- mutex_enter(rt->rt_lock);
range_tree_walk(rt, dump_dtl_seg, prefix);
- mutex_exit(rt->rt_lock);
if (dump_opt['d'] > 5 && vd->vdev_children == 0)
dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
}
for (unsigned c = 0; c < vd->vdev_children; c++)
dump_dtl(vd->vdev_child[c], indent + 4);
}
/* from spa_history.c: spa_history_create_obj() */
#define HIS_BUF_LEN_DEF (128 << 10)
#define HIS_BUF_LEN_MAX (1 << 30)
static void
dump_history(spa_t *spa)
{
nvlist_t **events = NULL;
char *buf = NULL;
uint64_t bufsize = HIS_BUF_LEN_DEF;
uint64_t resid, len, off = 0;
uint_t num = 0;
int error;
time_t tsec;
struct tm t;
char tbuf[30];
char internalstr[MAXPATHLEN];
if ((buf = malloc(bufsize)) == NULL)
(void) fprintf(stderr, "Unable to read history: "
"out of memory\n");
do {
len = bufsize;
if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
(void) fprintf(stderr, "Unable to read history: "
"error %d\n", error);
return;
}
if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
break;
off -= resid;
/*
* If the history block is too big, double the buffer
* size and try again.
*/
if (resid == len) {
free(buf);
buf = NULL;
bufsize <<= 1;
if ((bufsize >= HIS_BUF_LEN_MAX) ||
((buf = malloc(bufsize)) == NULL)) {
(void) fprintf(stderr, "Unable to read history: "
"out of memory\n");
return;
}
}
} while (len != 0);
free(buf);
(void) printf("\nHistory:\n");
for (unsigned i = 0; i < num; i++) {
uint64_t time, txg, ievent;
char *cmd, *intstr;
boolean_t printed = B_FALSE;
if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
&time) != 0)
goto next;
if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
&cmd) != 0) {
if (nvlist_lookup_uint64(events[i],
ZPOOL_HIST_INT_EVENT, &ievent) != 0)
goto next;
verify(nvlist_lookup_uint64(events[i],
ZPOOL_HIST_TXG, &txg) == 0);
verify(nvlist_lookup_string(events[i],
ZPOOL_HIST_INT_STR, &intstr) == 0);
if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
goto next;
(void) snprintf(internalstr,
sizeof (internalstr),
"[internal %s txg:%ju] %s",
zfs_history_event_names[ievent], (uintmax_t)txg,
intstr);
cmd = internalstr;
}
tsec = time;
(void) localtime_r(&tsec, &t);
(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
(void) printf("%s %s\n", tbuf, cmd);
printed = B_TRUE;
next:
if (dump_opt['h'] > 1) {
if (!printed)
(void) printf("unrecognized record:\n");
dump_nvlist(events[i], 2);
}
}
}
/*ARGSUSED*/
static void
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
{
}
static uint64_t
blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
const zbookmark_phys_t *zb)
{
if (dnp == NULL) {
ASSERT(zb->zb_level < 0);
if (zb->zb_object == 0)
return (zb->zb_blkid);
return (zb->zb_blkid * BP_GET_LSIZE(bp));
}
ASSERT(zb->zb_level >= 0);
return ((zb->zb_blkid <<
(zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
}
static void
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
{
const dva_t *dva = bp->blk_dva;
int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
if (dump_opt['b'] >= 6) {
snprintf_blkptr(blkbuf, buflen, bp);
return;
}
if (BP_IS_EMBEDDED(bp)) {
(void) sprintf(blkbuf,
"EMBEDDED et=%u %llxL/%llxP B=%llu",
(int)BPE_GET_ETYPE(bp),
(u_longlong_t)BPE_GET_LSIZE(bp),
(u_longlong_t)BPE_GET_PSIZE(bp),
(u_longlong_t)bp->blk_birth);
return;
}
blkbuf[0] = '\0';
for (int i = 0; i < ndvas; i++)
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), "%llu:%llx:%llx ",
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
(u_longlong_t)DVA_GET_ASIZE(&dva[i]));
if (BP_IS_HOLE(bp)) {
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf),
"%llxL B=%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)bp->blk_birth);
} else {
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf),
"%llxL/%llxP F=%llu B=%llu/%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp),
(u_longlong_t)BP_GET_FILL(bp),
(u_longlong_t)bp->blk_birth,
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
}
}
static void
print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
const dnode_phys_t *dnp)
{
char blkbuf[BP_SPRINTF_LEN];
int l;
if (!BP_IS_EMBEDDED(bp)) {
ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
}
(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
ASSERT(zb->zb_level >= 0);
for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
if (l == zb->zb_level) {
(void) printf("L%llx", (u_longlong_t)zb->zb_level);
} else {
(void) printf(" ");
}
}
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
(void) printf("%s\n", blkbuf);
}
static int
visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
blkptr_t *bp, const zbookmark_phys_t *zb)
{
int err = 0;
if (bp->blk_birth == 0)
return (0);
print_indirect(bp, zb, dnp);
if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
arc_flags_t flags = ARC_FLAG_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
arc_buf_t *buf;
uint64_t fill = 0;
err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
return (err);
ASSERT(buf->b_data);
/* recursively visit blocks below this */
cbp = buf->b_data;
for (i = 0; i < epb; i++, cbp++) {
zbookmark_phys_t czb;
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
err = visit_indirect(spa, dnp, cbp, &czb);
if (err)
break;
fill += BP_GET_FILL(cbp);
}
if (!err)
ASSERT3U(fill, ==, BP_GET_FILL(bp));
arc_buf_destroy(buf, &buf);
}
return (err);
}
/*ARGSUSED*/
static void
dump_indirect(dnode_t *dn)
{
dnode_phys_t *dnp = dn->dn_phys;
int j;
zbookmark_phys_t czb;
(void) printf("Indirect blocks:\n");
SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
dn->dn_object, dnp->dn_nlevels - 1, 0);
for (j = 0; j < dnp->dn_nblkptr; j++) {
czb.zb_blkid = j;
(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
&dnp->dn_blkptr[j], &czb);
}
(void) printf("\n");
}
/*ARGSUSED*/
static void
dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
{
dsl_dir_phys_t *dd = data;
time_t crtime;
char nice[32];
/* make sure nicenum has enough space */
CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ);
if (dd == NULL)
return;
ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
crtime = dd->dd_creation_time;
(void) printf("\t\tcreation_time = %s", ctime(&crtime));
(void) printf("\t\thead_dataset_obj = %llu\n",
(u_longlong_t)dd->dd_head_dataset_obj);
(void) printf("\t\tparent_dir_obj = %llu\n",
(u_longlong_t)dd->dd_parent_obj);
(void) printf("\t\torigin_obj = %llu\n",
(u_longlong_t)dd->dd_origin_obj);
(void) printf("\t\tchild_dir_zapobj = %llu\n",
(u_longlong_t)dd->dd_child_dir_zapobj);
zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
(void) printf("\t\tused_bytes = %s\n", nice);
zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
(void) printf("\t\tcompressed_bytes = %s\n", nice);
zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
(void) printf("\t\tuncompressed_bytes = %s\n", nice);
zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
(void) printf("\t\tquota = %s\n", nice);
zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
(void) printf("\t\treserved = %s\n", nice);
(void) printf("\t\tprops_zapobj = %llu\n",
(u_longlong_t)dd->dd_props_zapobj);
(void) printf("\t\tdeleg_zapobj = %llu\n",
(u_longlong_t)dd->dd_deleg_zapobj);
(void) printf("\t\tflags = %llx\n",
(u_longlong_t)dd->dd_flags);
#define DO(which) \
zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
sizeof (nice)); \
(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
DO(HEAD);
DO(SNAP);
DO(CHILD);
DO(CHILD_RSRV);
DO(REFRSRV);
#undef DO
}
/*ARGSUSED*/
static void
dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
{
dsl_dataset_phys_t *ds = data;
time_t crtime;
char used[32], compressed[32], uncompressed[32], unique[32];
char blkbuf[BP_SPRINTF_LEN];
/* make sure nicenum has enough space */
CTASSERT(sizeof (used) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ);
if (ds == NULL)
return;
ASSERT(size == sizeof (*ds));
crtime = ds->ds_creation_time;
zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
sizeof (uncompressed));
zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
(void) printf("\t\tdir_obj = %llu\n",
(u_longlong_t)ds->ds_dir_obj);
(void) printf("\t\tprev_snap_obj = %llu\n",
(u_longlong_t)ds->ds_prev_snap_obj);
(void) printf("\t\tprev_snap_txg = %llu\n",
(u_longlong_t)ds->ds_prev_snap_txg);
(void) printf("\t\tnext_snap_obj = %llu\n",
(u_longlong_t)ds->ds_next_snap_obj);
(void) printf("\t\tsnapnames_zapobj = %llu\n",
(u_longlong_t)ds->ds_snapnames_zapobj);
(void) printf("\t\tnum_children = %llu\n",
(u_longlong_t)ds->ds_num_children);
(void) printf("\t\tuserrefs_obj = %llu\n",
(u_longlong_t)ds->ds_userrefs_obj);
(void) printf("\t\tcreation_time = %s", ctime(&crtime));
(void) printf("\t\tcreation_txg = %llu\n",
(u_longlong_t)ds->ds_creation_txg);
(void) printf("\t\tdeadlist_obj = %llu\n",
(u_longlong_t)ds->ds_deadlist_obj);
(void) printf("\t\tused_bytes = %s\n", used);
(void) printf("\t\tcompressed_bytes = %s\n", compressed);
(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
(void) printf("\t\tunique = %s\n", unique);
(void) printf("\t\tfsid_guid = %llu\n",
(u_longlong_t)ds->ds_fsid_guid);
(void) printf("\t\tguid = %llu\n",
(u_longlong_t)ds->ds_guid);
(void) printf("\t\tflags = %llx\n",
(u_longlong_t)ds->ds_flags);
(void) printf("\t\tnext_clones_obj = %llu\n",
(u_longlong_t)ds->ds_next_clones_obj);
(void) printf("\t\tprops_obj = %llu\n",
(u_longlong_t)ds->ds_props_obj);
(void) printf("\t\tbp = %s\n", blkbuf);
}
/* ARGSUSED */
static int
dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
char blkbuf[BP_SPRINTF_LEN];
if (bp->blk_birth != 0) {
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("\t%s\n", blkbuf);
}
return (0);
}
static void
dump_bptree(objset_t *os, uint64_t obj, const char *name)
{
char bytes[32];
bptree_phys_t *bt;
dmu_buf_t *db;
/* make sure nicenum has enough space */
CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
if (dump_opt['d'] < 3)
return;
VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
bt = db->db_data;
zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
(void) printf("\n %s: %llu datasets, %s\n",
name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
dmu_buf_rele(db, FTAG);
if (dump_opt['d'] < 5)
return;
(void) printf("\n");
(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
}
/* ARGSUSED */
static int
dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
char blkbuf[BP_SPRINTF_LEN];
ASSERT(bp->blk_birth != 0);
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
(void) printf("\t%s\n", blkbuf);
return (0);
}
static void
dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
{
char bytes[32];
char comp[32];
char uncomp[32];
/* make sure nicenum has enough space */
CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
if (dump_opt['d'] < 3)
return;
zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
(void) printf(" %*s: object %llu, %llu local blkptrs, "
"%llu subobjs in object %llu, %s (%s/%s comp)\n",
indent * 8, name,
(u_longlong_t)bpo->bpo_object,
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
(u_longlong_t)bpo->bpo_phys->bpo_subobjs,
bytes, comp, uncomp);
for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
uint64_t subobj;
bpobj_t subbpo;
int error;
VERIFY0(dmu_read(bpo->bpo_os,
bpo->bpo_phys->bpo_subobjs,
i * sizeof (subobj), sizeof (subobj), &subobj, 0));
error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
if (error != 0) {
(void) printf("ERROR %u while trying to open "
"subobj id %llu\n",
error, (u_longlong_t)subobj);
continue;
}
dump_full_bpobj(&subbpo, "subobj", indent + 1);
bpobj_close(&subbpo);
}
} else {
(void) printf(" %*s: object %llu, %llu blkptrs, %s\n",
indent * 8, name,
(u_longlong_t)bpo->bpo_object,
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
bytes);
}
if (dump_opt['d'] < 5)
return;
if (indent == 0) {
(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
(void) printf("\n");
}
}
static void
dump_deadlist(dsl_deadlist_t *dl)
{
dsl_deadlist_entry_t *dle;
uint64_t unused;
char bytes[32];
char comp[32];
char uncomp[32];
/* make sure nicenum has enough space */
CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
if (dump_opt['d'] < 3)
return;
if (dl->dl_oldfmt) {
dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
return;
}
zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
(void) printf("\n Deadlist: %s (%s/%s comp)\n",
bytes, comp, uncomp);
if (dump_opt['d'] < 4)
return;
(void) printf("\n");
/* force the tree to be loaded */
dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
for (dle = avl_first(&dl->dl_tree); dle;
dle = AVL_NEXT(&dl->dl_tree, dle)) {
if (dump_opt['d'] >= 5) {
char buf[128];
(void) snprintf(buf, sizeof (buf),
"mintxg %llu -> obj %llu",
(longlong_t)dle->dle_mintxg,
(longlong_t)dle->dle_bpobj.bpo_object);
dump_full_bpobj(&dle->dle_bpobj, buf, 0);
} else {
(void) printf("mintxg %llu -> obj %llu\n",
(longlong_t)dle->dle_mintxg,
(longlong_t)dle->dle_bpobj.bpo_object);
}
}
}
static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
static boolean_t fuid_table_loaded;
static objset_t *sa_os = NULL;
static sa_attr_type_t *sa_attr_table = NULL;
static int
open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp)
{
int err;
uint64_t sa_attrs = 0;
uint64_t version = 0;
VERIFY3P(sa_os, ==, NULL);
err = dmu_objset_own(path, type, B_TRUE, tag, osp);
if (err != 0) {
(void) fprintf(stderr, "failed to own dataset '%s': %s\n", path,
strerror(err));
return (err);
}
if (dmu_objset_type(*osp) == DMU_OST_ZFS) {
(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
8, 1, &version);
if (version >= ZPL_VERSION_SA) {
(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
8, 1, &sa_attrs);
}
err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
&sa_attr_table);
if (err != 0) {
(void) fprintf(stderr, "sa_setup failed: %s\n",
strerror(err));
dmu_objset_disown(*osp, tag);
*osp = NULL;
}
}
sa_os = *osp;
return (0);
}
static void
close_objset(objset_t *os, void *tag)
{
VERIFY3P(os, ==, sa_os);
if (os->os_sa != NULL)
sa_tear_down(os);
dmu_objset_disown(os, tag);
sa_attr_table = NULL;
sa_os = NULL;
}
static void
fuid_table_destroy()
{
if (fuid_table_loaded) {
zfs_fuid_table_destroy(&idx_tree, &domain_tree);
fuid_table_loaded = B_FALSE;
}
}
/*
* print uid or gid information.
* For normal POSIX id just the id is printed in decimal format.
* For CIFS files with FUID the fuid is printed in hex followed by
* the domain-rid string.
*/
static void
print_idstr(uint64_t id, const char *id_type)
{
if (FUID_INDEX(id)) {
char *domain;
domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
(void) printf("\t%s %llx [%s-%d]\n", id_type,
(u_longlong_t)id, domain, (int)FUID_RID(id));
} else {
(void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
}
}
static void
dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
{
uint32_t uid_idx, gid_idx;
uid_idx = FUID_INDEX(uid);
gid_idx = FUID_INDEX(gid);
/* Load domain table, if not already loaded */
if (!fuid_table_loaded && (uid_idx || gid_idx)) {
uint64_t fuid_obj;
/* first find the fuid object. It lives in the master node */
VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
8, 1, &fuid_obj) == 0);
zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
(void) zfs_fuid_table_load(os, fuid_obj,
&idx_tree, &domain_tree);
fuid_table_loaded = B_TRUE;
}
print_idstr(uid, "uid");
print_idstr(gid, "gid");
}
/*ARGSUSED*/
static void
dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
{
char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
sa_handle_t *hdl;
uint64_t xattr, rdev, gen;
uint64_t uid, gid, mode, fsize, parent, links;
uint64_t pflags;
uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
time_t z_crtime, z_atime, z_mtime, z_ctime;
sa_bulk_attr_t bulk[12];
int idx = 0;
int error;
VERIFY3P(os, ==, sa_os);
if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
(void) printf("Failed to get handle for SA znode\n");
return;
}
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
&links, 8);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
&mode, 8);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
NULL, &parent, 8);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
&fsize, 8);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
acctm, 16);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
modtm, 16);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
crtm, 16);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
chgtm, 16);
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
&pflags, 8);
if (sa_bulk_lookup(hdl, bulk, idx)) {
(void) sa_handle_destroy(hdl);
return;
}
z_crtime = (time_t)crtm[0];
z_atime = (time_t)acctm[0];
z_mtime = (time_t)modtm[0];
z_ctime = (time_t)chgtm[0];
if (dump_opt['d'] > 4) {
error = zfs_obj_to_path(os, object, path, sizeof (path));
if (error != 0) {
(void) snprintf(path, sizeof (path),
"\?\?\?<object#%llu>", (u_longlong_t)object);
}
(void) printf("\tpath %s\n", path);
}
dump_uidgid(os, uid, gid);
(void) printf("\tatime %s", ctime(&z_atime));
(void) printf("\tmtime %s", ctime(&z_mtime));
(void) printf("\tctime %s", ctime(&z_ctime));
(void) printf("\tcrtime %s", ctime(&z_crtime));
(void) printf("\tgen %llu\n", (u_longlong_t)gen);
(void) printf("\tmode %llo\n", (u_longlong_t)mode);
(void) printf("\tsize %llu\n", (u_longlong_t)fsize);
(void) printf("\tparent %llu\n", (u_longlong_t)parent);
(void) printf("\tlinks %llu\n", (u_longlong_t)links);
(void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
sizeof (uint64_t)) == 0)
(void) printf("\txattr %llu\n", (u_longlong_t)xattr);
if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
sizeof (uint64_t)) == 0)
(void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
sa_handle_destroy(hdl);
}
/*ARGSUSED*/
static void
dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
{
}
/*ARGSUSED*/
static void
dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
{
}
static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
dump_none, /* unallocated */
dump_zap, /* object directory */
dump_uint64, /* object array */
dump_none, /* packed nvlist */
dump_packed_nvlist, /* packed nvlist size */
dump_none, /* bpobj */
dump_bpobj, /* bpobj header */
dump_none, /* SPA space map header */
dump_none, /* SPA space map */
dump_none, /* ZIL intent log */
dump_dnode, /* DMU dnode */
dump_dmu_objset, /* DMU objset */
dump_dsl_dir, /* DSL directory */
dump_zap, /* DSL directory child map */
dump_zap, /* DSL dataset snap map */
dump_zap, /* DSL props */
dump_dsl_dataset, /* DSL dataset */
dump_znode, /* ZFS znode */
dump_acl, /* ZFS V0 ACL */
dump_uint8, /* ZFS plain file */
dump_zpldir, /* ZFS directory */
dump_zap, /* ZFS master node */
dump_zap, /* ZFS delete queue */
dump_uint8, /* zvol object */
dump_zap, /* zvol prop */
dump_uint8, /* other uint8[] */
dump_uint64, /* other uint64[] */
dump_zap, /* other ZAP */
dump_zap, /* persistent error log */
dump_uint8, /* SPA history */
dump_history_offsets, /* SPA history offsets */
dump_zap, /* Pool properties */
dump_zap, /* DSL permissions */
dump_acl, /* ZFS ACL */
dump_uint8, /* ZFS SYSACL */
dump_none, /* FUID nvlist */
dump_packed_nvlist, /* FUID nvlist size */
dump_zap, /* DSL dataset next clones */
dump_zap, /* DSL scrub queue */
dump_zap, /* ZFS user/group used */
dump_zap, /* ZFS user/group quota */
dump_zap, /* snapshot refcount tags */
dump_ddt_zap, /* DDT ZAP object */
dump_zap, /* DDT statistics */
dump_znode, /* SA object */
dump_zap, /* SA Master Node */
dump_sa_attrs, /* SA attribute registration */
dump_sa_layouts, /* SA attribute layouts */
dump_zap, /* DSL scrub translations */
dump_none, /* fake dedup BP */
dump_zap, /* deadlist */
dump_none, /* deadlist hdr */
dump_zap, /* dsl clones */
dump_bpobj_subobjs, /* bpobj subobjs */
dump_unknown, /* Unknown type, must be last */
};
static void
dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
{
dmu_buf_t *db = NULL;
dmu_object_info_t doi;
dnode_t *dn;
void *bonus = NULL;
size_t bsize = 0;
char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
char bonus_size[32];
char aux[50];
int error;
/* make sure nicenum has enough space */
CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
if (*print_header) {
(void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n",
"Object", "lvl", "iblk", "dblk", "dsize", "lsize",
"%full", "type");
*print_header = 0;
}
if (object == 0) {
dn = DMU_META_DNODE(os);
} else {
error = dmu_bonus_hold(os, object, FTAG, &db);
if (error)
fatal("dmu_bonus_hold(%llu) failed, errno %u",
object, error);
bonus = db->db_data;
bsize = db->db_size;
dn = DB_DNODE((dmu_buf_impl_t *)db);
}
dmu_object_info_from_dnode(dn, &doi);
zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
doi.doi_max_offset);
aux[0] = '\0';
if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
ZDB_CHECKSUM_NAME(doi.doi_checksum));
}
if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
ZDB_COMPRESS_NAME(doi.doi_compress));
}
(void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n",
(u_longlong_t)object, doi.doi_indirection, iblk, dblk,
asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
(void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n",
"", "", "", "", "", bonus_size, "bonus",
ZDB_OT_NAME(doi.doi_bonus_type));
}
if (verbosity >= 4) {
(void) printf("\tdnode flags: %s%s%s\n",
(dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
"USED_BYTES " : "",
(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
"USERUSED_ACCOUNTED " : "",
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
"SPILL_BLKPTR" : "");
(void) printf("\tdnode maxblkid: %llu\n",
(longlong_t)dn->dn_phys->dn_maxblkid);
object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
bonus, bsize);
object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
*print_header = 1;
}
if (verbosity >= 5)
dump_indirect(dn);
if (verbosity >= 5) {
/*
* Report the list of segments that comprise the object.
*/
uint64_t start = 0;
uint64_t end;
uint64_t blkfill = 1;
int minlvl = 1;
if (dn->dn_type == DMU_OT_DNODE) {
minlvl = 0;
blkfill = DNODES_PER_BLOCK;
}
for (;;) {
char segsize[32];
/* make sure nicenum has enough space */
CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
error = dnode_next_offset(dn,
0, &start, minlvl, blkfill, 0);
if (error)
break;
end = start;
error = dnode_next_offset(dn,
DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
zdb_nicenum(end - start, segsize, sizeof (segsize));
(void) printf("\t\tsegment [%016llx, %016llx)"
" size %5s\n", (u_longlong_t)start,
(u_longlong_t)end, segsize);
if (error)
break;
start = end;
}
}
if (db != NULL)
dmu_buf_rele(db, FTAG);
}
static const char *objset_types[DMU_OST_NUMTYPES] = {
"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
static void
dump_dir(objset_t *os)
{
dmu_objset_stats_t dds;
uint64_t object, object_count;
uint64_t refdbytes, usedobjs, scratch;
char numbuf[32];
char blkbuf[BP_SPRINTF_LEN + 20];
char osname[ZFS_MAX_DATASET_NAME_LEN];
const char *type = "UNKNOWN";
int verbosity = dump_opt['d'];
int print_header = 1;
unsigned i;
int error;
/* make sure nicenum has enough space */
CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
dmu_objset_fast_stat(os, &dds);
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (dds.dds_type < DMU_OST_NUMTYPES)
type = objset_types[dds.dds_type];
if (dds.dds_type == DMU_OST_META) {
dds.dds_creation_txg = TXG_INITIAL;
usedobjs = BP_GET_FILL(os->os_rootbp);
refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
dd_used_bytes;
} else {
dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
}
ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
if (verbosity >= 4) {
(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
} else {
blkbuf[0] = '\0';
}
dmu_objset_name(os, osname);
(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
"%s, %llu objects%s\n",
osname, type, (u_longlong_t)dmu_objset_id(os),
(u_longlong_t)dds.dds_creation_txg,
numbuf, (u_longlong_t)usedobjs, blkbuf);
if (zopt_objects != 0) {
for (i = 0; i < zopt_objects; i++)
dump_object(os, zopt_object[i], verbosity,
&print_header);
(void) printf("\n");
return;
}
if (dump_opt['i'] != 0 || verbosity >= 2)
dump_intent_log(dmu_objset_zil(os));
- if (dmu_objset_ds(os) != NULL)
- dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
+ if (dmu_objset_ds(os) != NULL) {
+ dsl_dataset_t *ds = dmu_objset_ds(os);
+ dump_deadlist(&ds->ds_deadlist);
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ (void) printf("ds_remap_deadlist:\n");
+ dump_deadlist(&ds->ds_remap_deadlist);
+ }
+ }
+
if (verbosity < 2)
return;
if (BP_IS_HOLE(os->os_rootbp))
return;
dump_object(os, 0, verbosity, &print_header);
object_count = 0;
if (DMU_USERUSED_DNODE(os) != NULL &&
DMU_USERUSED_DNODE(os)->dn_type != 0) {
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
}
object = 0;
while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
dump_object(os, object, verbosity, &print_header);
object_count++;
}
ASSERT3U(object_count, ==, usedobjs);
(void) printf("\n");
if (error != ESRCH) {
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
abort();
}
}
static void
dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
{
time_t timestamp = ub->ub_timestamp;
(void) printf("%s", header ? header : "");
(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
(void) printf("\ttimestamp = %llu UTC = %s",
(u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
if (dump_opt['u'] >= 3) {
char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
(void) printf("\trootbp = %s\n", blkbuf);
}
(void) printf("%s", footer ? footer : "");
}
static void
dump_config(spa_t *spa)
{
dmu_buf_t *db;
size_t nvsize = 0;
int error = 0;
error = dmu_bonus_hold(spa->spa_meta_objset,
spa->spa_config_object, FTAG, &db);
if (error == 0) {
nvsize = *(uint64_t *)db->db_data;
dmu_buf_rele(db, FTAG);
(void) printf("\nMOS Configuration:\n");
dump_packed_nvlist(spa->spa_meta_objset,
spa->spa_config_object, (void *)&nvsize, 1);
} else {
(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
(u_longlong_t)spa->spa_config_object, error);
}
}
static void
dump_cachefile(const char *cachefile)
{
int fd;
struct stat64 statbuf;
char *buf;
nvlist_t *config;
if ((fd = open64(cachefile, O_RDONLY)) < 0) {
(void) fprintf(stderr, "cannot open '%s': %s\n", cachefile,
strerror(errno));
exit(1);
}
if (fstat64(fd, &statbuf) != 0) {
(void) fprintf(stderr, "failed to stat '%s': %s\n", cachefile,
strerror(errno));
exit(1);
}
if ((buf = malloc(statbuf.st_size)) == NULL) {
(void) fprintf(stderr, "failed to allocate %llu bytes\n",
(u_longlong_t)statbuf.st_size);
exit(1);
}
if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
(void) fprintf(stderr, "failed to read %llu bytes\n",
(u_longlong_t)statbuf.st_size);
exit(1);
}
(void) close(fd);
if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
(void) fprintf(stderr, "failed to unpack nvlist\n");
exit(1);
}
free(buf);
dump_nvlist(config, 0);
nvlist_free(config);
}
#define ZDB_MAX_UB_HEADER_SIZE 32
static void
dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
{
vdev_t vd;
vdev_t *vdp = &vd;
char header[ZDB_MAX_UB_HEADER_SIZE];
vd.vdev_ashift = ashift;
vdp->vdev_top = vdp;
for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
uberblock_t *ub = (void *)((char *)lbl + uoff);
if (uberblock_verify(ub))
continue;
(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
"Uberblock[%d]\n", i);
dump_uberblock(ub, header, "");
}
}
static char curpath[PATH_MAX];
/*
* Iterate through the path components, recursively passing
* current one's obj and remaining path until we find the obj
* for the last one.
*/
static int
dump_path_impl(objset_t *os, uint64_t obj, char *name)
{
int err;
int header = 1;
uint64_t child_obj;
char *s;
dmu_buf_t *db;
dmu_object_info_t doi;
if ((s = strchr(name, '/')) != NULL)
*s = '\0';
err = zap_lookup(os, obj, name, 8, 1, &child_obj);
(void) strlcat(curpath, name, sizeof (curpath));
if (err != 0) {
(void) fprintf(stderr, "failed to lookup %s: %s\n",
curpath, strerror(err));
return (err);
}
child_obj = ZFS_DIRENT_OBJ(child_obj);
err = sa_buf_hold(os, child_obj, FTAG, &db);
if (err != 0) {
(void) fprintf(stderr,
"failed to get SA dbuf for obj %llu: %s\n",
(u_longlong_t)child_obj, strerror(err));
return (EINVAL);
}
dmu_object_info_from_db(db, &doi);
sa_buf_rele(db, FTAG);
if (doi.doi_bonus_type != DMU_OT_SA &&
doi.doi_bonus_type != DMU_OT_ZNODE) {
(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
doi.doi_bonus_type, (u_longlong_t)child_obj);
return (EINVAL);
}
if (dump_opt['v'] > 6) {
(void) printf("obj=%llu %s type=%d bonustype=%d\n",
(u_longlong_t)child_obj, curpath, doi.doi_type,
doi.doi_bonus_type);
}
(void) strlcat(curpath, "/", sizeof (curpath));
switch (doi.doi_type) {
case DMU_OT_DIRECTORY_CONTENTS:
if (s != NULL && *(s + 1) != '\0')
return (dump_path_impl(os, child_obj, s + 1));
/*FALLTHROUGH*/
case DMU_OT_PLAIN_FILE_CONTENTS:
dump_object(os, child_obj, dump_opt['v'], &header);
return (0);
default:
(void) fprintf(stderr, "object %llu has non-file/directory "
"type %d\n", (u_longlong_t)obj, doi.doi_type);
break;
}
return (EINVAL);
}
/*
* Dump the blocks for the object specified by path inside the dataset.
*/
static int
dump_path(char *ds, char *path)
{
int err;
objset_t *os;
uint64_t root_obj;
err = open_objset(ds, DMU_OST_ZFS, FTAG, &os);
if (err != 0)
return (err);
err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
if (err != 0) {
(void) fprintf(stderr, "can't lookup root znode: %s\n",
strerror(err));
dmu_objset_disown(os, FTAG);
return (EINVAL);
}
(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
err = dump_path_impl(os, root_obj, path);
close_objset(os, FTAG);
return (err);
}
static int
dump_label(const char *dev)
{
int fd;
vdev_label_t label;
char path[MAXPATHLEN];
char *buf = label.vl_vdev_phys.vp_nvlist;
size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
struct stat64 statbuf;
uint64_t psize, ashift;
boolean_t label_found = B_FALSE;
(void) strlcpy(path, dev, sizeof (path));
if (dev[0] == '/') {
if (strncmp(dev, ZFS_DISK_ROOTD,
strlen(ZFS_DISK_ROOTD)) == 0) {
(void) snprintf(path, sizeof (path), "%s%s",
ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD));
}
} else if (stat64(path, &statbuf) != 0) {
char *s;
(void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD,
dev);
if (((s = strrchr(dev, 's')) == NULL &&
(s = strchr(dev, 'p')) == NULL) ||
!isdigit(*(s + 1)))
(void) strlcat(path, "s0", sizeof (path));
}
if ((fd = open64(path, O_RDONLY)) < 0) {
(void) fprintf(stderr, "cannot open '%s': %s\n", path,
strerror(errno));
exit(1);
}
if (fstat64(fd, &statbuf) != 0) {
(void) fprintf(stderr, "failed to stat '%s': %s\n", path,
strerror(errno));
(void) close(fd);
exit(1);
}
if (S_ISBLK(statbuf.st_mode)) {
(void) fprintf(stderr,
"cannot use '%s': character device required\n", path);
(void) close(fd);
exit(1);
}
psize = statbuf.st_size;
psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
for (int l = 0; l < VDEV_LABELS; l++) {
nvlist_t *config = NULL;
if (!dump_opt['q']) {
(void) printf("------------------------------------\n");
(void) printf("LABEL %d\n", l);
(void) printf("------------------------------------\n");
}
if (pread64(fd, &label, sizeof (label),
vdev_label_offset(psize, l, 0)) != sizeof (label)) {
if (!dump_opt['q'])
(void) printf("failed to read label %d\n", l);
continue;
}
if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
if (!dump_opt['q'])
(void) printf("failed to unpack label %d\n", l);
ashift = SPA_MINBLOCKSHIFT;
} else {
nvlist_t *vdev_tree = NULL;
if (!dump_opt['q'])
dump_nvlist(config, 4);
if ((nvlist_lookup_nvlist(config,
ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
(nvlist_lookup_uint64(vdev_tree,
ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
ashift = SPA_MINBLOCKSHIFT;
nvlist_free(config);
label_found = B_TRUE;
}
if (dump_opt['u'])
dump_label_uberblocks(&label, ashift);
}
(void) close(fd);
return (label_found ? 0 : 2);
}
static uint64_t dataset_feature_count[SPA_FEATURES];
+static uint64_t remap_deadlist_count = 0;
/*ARGSUSED*/
static int
dump_one_dir(const char *dsname, void *arg)
{
int error;
objset_t *os;
error = open_objset(dsname, DMU_OST_ANY, FTAG, &os);
if (error != 0)
return (0);
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
if (!dmu_objset_ds(os)->ds_feature_inuse[f])
continue;
ASSERT(spa_feature_table[f].fi_flags &
ZFEATURE_FLAG_PER_DATASET);
dataset_feature_count[f]++;
}
+ if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
+ remap_deadlist_count++;
+ }
+
dump_dir(os);
close_objset(os, FTAG);
fuid_table_destroy();
return (0);
}
/*
* Block statistics.
*/
#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
typedef struct zdb_blkstats {
uint64_t zb_asize;
uint64_t zb_lsize;
uint64_t zb_psize;
uint64_t zb_count;
uint64_t zb_gangs;
uint64_t zb_ditto_samevdev;
uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
} zdb_blkstats_t;
/*
* Extended object types to report deferred frees and dedup auto-ditto blocks.
*/
#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2)
#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3)
static const char *zdb_ot_extname[] = {
"deferred free",
"dedup ditto",
"other",
"Total",
};
#define ZB_TOTAL DN_MAX_LEVELS
typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
+ uint64_t zcb_removing_size;
uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks;
uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
[BPE_PAYLOAD_SIZE];
uint64_t zcb_start;
hrtime_t zcb_lastprint;
uint64_t zcb_totalasize;
uint64_t zcb_errors[256];
int zcb_readfails;
int zcb_haderrors;
spa_t *zcb_spa;
+ uint32_t **zcb_vd_obsolete_counts;
} zdb_cb_t;
static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
dmu_object_type_t type)
{
uint64_t refcnt = 0;
ASSERT(type < ZDB_OT_TOTAL);
if (zilog && zil_bp_tree_add(zilog, bp) != 0)
return;
for (int i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
int t = (i & 1) ? type : ZDB_OT_TOTAL;
int equal;
zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
zb->zb_asize += BP_GET_ASIZE(bp);
zb->zb_lsize += BP_GET_LSIZE(bp);
zb->zb_psize += BP_GET_PSIZE(bp);
zb->zb_count++;
/*
* The histogram is only big enough to record blocks up to
* SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
* "other", bucket.
*/
unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
zb->zb_psize_histogram[idx]++;
zb->zb_gangs += BP_COUNT_GANG(bp);
switch (BP_GET_NDVAS(bp)) {
case 2:
if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
DVA_GET_VDEV(&bp->blk_dva[1]))
zb->zb_ditto_samevdev++;
break;
case 3:
equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
DVA_GET_VDEV(&bp->blk_dva[1])) +
(DVA_GET_VDEV(&bp->blk_dva[0]) ==
DVA_GET_VDEV(&bp->blk_dva[2])) +
(DVA_GET_VDEV(&bp->blk_dva[1]) ==
DVA_GET_VDEV(&bp->blk_dva[2]));
if (equal != 0)
zb->zb_ditto_samevdev++;
break;
}
}
if (BP_IS_EMBEDDED(bp)) {
zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
[BPE_GET_PSIZE(bp)]++;
return;
}
if (dump_opt['L'])
return;
if (BP_GET_DEDUP(bp)) {
ddt_t *ddt;
ddt_entry_t *dde;
ddt = ddt_select(zcb->zcb_spa, bp);
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_FALSE);
if (dde == NULL) {
refcnt = 0;
} else {
ddt_phys_t *ddp = ddt_phys_select(dde, bp);
ddt_phys_decref(ddp);
refcnt = ddp->ddp_refcnt;
if (ddt_phys_total_refcnt(dde) == 0)
ddt_remove(ddt, dde);
}
ddt_exit(ddt);
}
VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
}
/* ARGSUSED */
static void
zdb_blkptr_done(zio_t *zio)
{
spa_t *spa = zio->io_spa;
blkptr_t *bp = zio->io_bp;
int ioerr = zio->io_error;
zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark;
abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
cv_broadcast(&spa->spa_scrub_io_cv);
if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
char blkbuf[BP_SPRINTF_LEN];
zcb->zcb_haderrors = 1;
zcb->zcb_errors[ioerr]++;
if (dump_opt['b'] >= 2)
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
else
blkbuf[0] = '\0';
(void) printf("zdb_blkptr_cb: "
"Got error %d reading "
"<%llu, %llu, %lld, %llx> %s -- skipping\n",
ioerr,
(u_longlong_t)zb->zb_objset,
(u_longlong_t)zb->zb_object,
(u_longlong_t)zb->zb_level,
(u_longlong_t)zb->zb_blkid,
blkbuf);
}
mutex_exit(&spa->spa_scrub_lock);
}
/* ARGSUSED */
static int
zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
zdb_cb_t *zcb = arg;
dmu_object_type_t type;
boolean_t is_metadata;
if (bp == NULL)
return (0);
if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("objset %llu object %llu "
"level %lld offset 0x%llx %s\n",
(u_longlong_t)zb->zb_objset,
(u_longlong_t)zb->zb_object,
(longlong_t)zb->zb_level,
(u_longlong_t)blkid2offset(dnp, bp, zb),
blkbuf);
}
if (BP_IS_HOLE(bp))
return (0);
type = BP_GET_TYPE(bp);
zdb_count_block(zcb, zilog, bp,
(type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp);
abd_t *abd = abd_alloc(size, B_FALSE);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
/* If it's an intent log block, failure is expected. */
if (zb->zb_level == ZB_ZIL_LEVEL)
flags |= ZIO_FLAG_SPECULATIVE;
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight > max_inflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(NULL, spa, bp, abd, size,
zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
}
zcb->zcb_readfails = 0;
/* only call gethrtime() every 100 blocks */
static int iters;
if (++iters > 100)
iters = 0;
else
return (0);
if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
uint64_t now = gethrtime();
char buf[10];
uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
int kb_per_sec =
1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
int sec_remaining =
(zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
/* make sure nicenum has enough space */
CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);
zfs_nicenum(bytes, buf, sizeof (buf));
(void) fprintf(stderr,
"\r%5s completed (%4dMB/s) "
"estimated time remaining: %uhr %02umin %02usec ",
buf, kb_per_sec / 1024,
sec_remaining / 60 / 60,
sec_remaining / 60 % 60,
sec_remaining % 60);
zcb->zcb_lastprint = now;
}
return (0);
}
static void
zdb_leak(void *arg, uint64_t start, uint64_t size)
{
vdev_t *vd = arg;
(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
(u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
}
static metaslab_ops_t zdb_metaslab_ops = {
NULL /* alloc */
};
static void
zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
ddt_bookmark_t ddb;
ddt_entry_t dde;
int error;
bzero(&ddb, sizeof (ddb));
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
ddt_phys_t *ddp = dde.dde_phys;
if (ddb.ddb_class == DDT_CLASS_UNIQUE)
return;
ASSERT(ddt_phys_total_refcnt(&dde) > 1);
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
continue;
ddt_bp_create(ddb.ddb_checksum,
&dde.dde_key, ddp, &blk);
if (p == DDT_PHYS_DITTO) {
zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
} else {
zcb->zcb_dedup_asize +=
BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
zcb->zcb_dedup_blocks++;
}
}
if (!dump_opt['L']) {
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
}
}
ASSERT(error == ENOENT);
}
+/* ARGSUSED */
static void
+claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ /*
+ * This callback was called through a remap from
+ * a device being removed. Therefore, the vdev that
+ * this callback is applied to is a concrete
+ * vdev.
+ */
+ ASSERT(vdev_is_concrete(vd));
+
+ VERIFY0(metaslab_claim_impl(vd, offset, size,
+ spa_first_txg(vd->vdev_spa)));
+}
+
+static void
+claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
+{
+ vdev_t *vd = arg;
+
+ vdev_indirect_ops.vdev_op_remap(vd, offset, size,
+ claim_segment_impl_cb, NULL);
+}
+
+/*
+ * After accounting for all allocated blocks that are directly referenced,
+ * we might have missed a reference to a block from a partially complete
+ * (and thus unused) indirect mapping object. We perform a secondary pass
+ * through the metaslabs we have already mapped and claim the destination
+ * blocks.
+ */
+static void
+zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
+{
+ if (spa->spa_vdev_removal == NULL)
+ return;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_t *vd = svr->svr_vdev;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+
+ if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
+ break;
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
+
+ /*
+ * Clear everything past what has been synced,
+ * because we have not allocated mappings for it yet.
+ */
+ range_tree_clear(svr->svr_allocd_segs,
+ vdev_indirect_mapping_max_offset(vim),
+ msp->ms_sm->sm_start + msp->ms_sm->sm_size -
+ vdev_indirect_mapping_max_offset(vim));
+ }
+
+ zcb->zcb_removing_size +=
+ range_tree_space(svr->svr_allocd_segs);
+ range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+/*
+ * vm_idxp is an in-out parameter which (for indirect vdevs) is the
+ * index in vim_entries that has the first entry in this metaslab. On
+ * return, it will be set to the first entry after this metaslab.
+ */
+static void
+zdb_leak_init_ms(metaslab_t *msp, uint64_t *vim_idxp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_unload(msp);
+
+ /*
+ * We don't want to spend the CPU manipulating the size-ordered
+ * tree, so clear the range_tree ops.
+ */
+ msp->ms_tree->rt_ops = NULL;
+
+ (void) fprintf(stderr,
+ "\rloading vdev %llu of %llu, metaslab %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)rvd->vdev_children,
+ (longlong_t)msp->ms_id,
+ (longlong_t)vd->vdev_ms_count);
+
+ /*
+ * For leak detection, we overload the metaslab ms_tree to
+ * contain allocated segments instead of free segments. As a
+ * result, we can't use the normal metaslab_load/unload
+ * interfaces.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
+ (*vim_idxp)++) {
+ vdev_indirect_mapping_entry_phys_t *vimep =
+ &vim->vim_entries[*vim_idxp];
+ uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+ uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
+ ASSERT3U(ent_offset, >=, msp->ms_start);
+ if (ent_offset >= msp->ms_start + msp->ms_size)
+ break;
+
+ /*
+ * Mappings do not cross metaslab boundaries,
+ * because we create them by walking the metaslabs.
+ */
+ ASSERT3U(ent_offset + ent_len, <=,
+ msp->ms_start + msp->ms_size);
+ range_tree_add(msp->ms_tree, ent_offset, ent_len);
+ }
+ } else if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC));
+ }
+
+ if (!msp->ms_loaded) {
+ msp->ms_loaded = B_TRUE;
+ }
+ mutex_exit(&msp->ms_lock);
+}
+
+/* ARGSUSED */
+static int
+increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ zdb_cb_t *zcb = arg;
+ spa_t *spa = zcb->zcb_spa;
+ vdev_t *vd;
+ const dva_t *dva = &bp->blk_dva[0];
+
+ ASSERT(!dump_opt['L']);
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
+ ASSERT3P(vd, !=, NULL);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
+
+ vdev_indirect_mapping_increment_obsolete_count(
+ vd->vdev_indirect_mapping,
+ DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
+
+ return (0);
+}
+
+static uint32_t *
+zdb_load_obsolete_counts(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ spa_t *spa = vd->vdev_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ uint32_t *counts;
+
+ EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL);
+ counts = vdev_indirect_mapping_load_obsolete_counts(vim);
+ if (vd->vdev_obsolete_sm != NULL) {
+ vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
+ vd->vdev_obsolete_sm);
+ }
+ if (scip->scip_vdev == vd->vdev_id &&
+ scip->scip_prev_obsolete_sm_object != 0) {
+ space_map_t *prev_obsolete_sm = NULL;
+ VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
+ scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
+ space_map_update(prev_obsolete_sm);
+ vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
+ prev_obsolete_sm);
+ space_map_close(prev_obsolete_sm);
+ }
+ return (counts);
+}
+
+static void
zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
zcb->zcb_spa = spa;
if (!dump_opt['L']) {
+ dsl_pool_t *dp = spa->spa_dsl_pool;
vdev_t *rvd = spa->spa_root_vdev;
/*
* We are going to be changing the meaning of the metaslab's
* ms_tree. Ensure that the allocator doesn't try to
* use the tree.
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+ zcb->zcb_vd_obsolete_counts =
+ umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
+ UMEM_NOFAIL);
+
+
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
- metaslab_group_t *mg = vd->vdev_mg;
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- ASSERT3P(msp->ms_group, ==, mg);
- mutex_enter(&msp->ms_lock);
- metaslab_unload(msp);
+ uint64_t vim_idx = 0;
+ ASSERT3U(c, ==, vd->vdev_id);
+
+ /*
+ * Note: we don't check for mapping leaks on
+ * removing vdevs because their ms_tree's are
+ * used to look for leaks in allocated space.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ zcb->zcb_vd_obsolete_counts[c] =
+ zdb_load_obsolete_counts(vd);
+
/*
- * For leak detection, we overload the metaslab
- * ms_tree to contain allocated segments
- * instead of free segments. As a result,
- * we can't use the normal metaslab_load/unload
- * interfaces.
+ * Normally, indirect vdevs don't have any
+ * metaslabs. We want to set them up for
+ * zio_claim().
*/
- if (msp->ms_sm != NULL) {
- (void) fprintf(stderr,
- "\rloading space map for "
- "vdev %llu of %llu, "
- "metaslab %llu of %llu ...",
- (longlong_t)c,
- (longlong_t)rvd->vdev_children,
- (longlong_t)m,
- (longlong_t)vd->vdev_ms_count);
+ VERIFY0(vdev_metaslab_init(vd, 0));
+ }
- /*
- * We don't want to spend the CPU
- * manipulating the size-ordered
- * tree, so clear the range_tree
- * ops.
- */
- msp->ms_tree->rt_ops = NULL;
- VERIFY0(space_map_load(msp->ms_sm,
- msp->ms_tree, SM_ALLOC));
-
- if (!msp->ms_loaded) {
- msp->ms_loaded = B_TRUE;
- }
- }
- mutex_exit(&msp->ms_lock);
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ zdb_leak_init_ms(vd->vdev_ms[m], &vim_idx);
}
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ ASSERT3U(vim_idx, ==,
+ vdev_indirect_mapping_num_entries(
+ vd->vdev_indirect_mapping));
+ }
}
(void) fprintf(stderr, "\n");
+
+ if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_DEVICE_REMOVAL));
+ (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
+ increment_indirect_mapping_cb, zcb, NULL);
+ }
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
zdb_ddt_leak_init(spa, zcb);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
-static void
-zdb_leak_fini(spa_t *spa)
+static boolean_t
+zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
{
+ boolean_t leaks = B_FALSE;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t total_leaked = 0;
+
+ ASSERT(vim != NULL);
+
+ for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
+ vdev_indirect_mapping_entry_phys_t *vimep =
+ &vim->vim_entries[i];
+ uint64_t obsolete_bytes = 0;
+ uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+ metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ /*
+ * This is not very efficient but it's easy to
+ * verify correctness.
+ */
+ for (uint64_t inner_offset = 0;
+ inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
+ inner_offset += 1 << vd->vdev_ashift) {
+ if (range_tree_contains(msp->ms_tree,
+ offset + inner_offset, 1 << vd->vdev_ashift)) {
+ obsolete_bytes += 1 << vd->vdev_ashift;
+ }
+ }
+
+ int64_t bytes_leaked = obsolete_bytes -
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
+ ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
+ if (bytes_leaked != 0 &&
+ (vdev_obsolete_counts_are_precise(vd) ||
+ dump_opt['d'] >= 5)) {
+ (void) printf("obsolete indirect mapping count "
+ "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
+ (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+ (u_longlong_t)bytes_leaked);
+ }
+ total_leaked += ABS(bytes_leaked);
+ }
+
+ if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) {
+ int pct_leaked = total_leaked * 100 /
+ vdev_indirect_mapping_bytes_mapped(vim);
+ (void) printf("cannot verify obsolete indirect mapping "
+ "counts of vdev %llu because precise feature was not "
+ "enabled when it was removed: %d%% (%llx bytes) of mapping"
+ "unreferenced\n",
+ (u_longlong_t)vd->vdev_id, pct_leaked,
+ (u_longlong_t)total_leaked);
+ } else if (total_leaked > 0) {
+ (void) printf("obsolete indirect mapping count mismatch "
+ "for vdev %llu -- %llx total bytes mismatched\n",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)total_leaked);
+ leaks |= B_TRUE;
+ }
+
+ vdev_indirect_mapping_free_obsolete_counts(vim,
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
+
+ return (leaks);
+}
+
+static boolean_t
+zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
+{
+ boolean_t leaks = B_FALSE;
if (!dump_opt['L']) {
vdev_t *rvd = spa->spa_root_vdev;
for (unsigned c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
metaslab_group_t *mg = vd->vdev_mg;
- for (unsigned m = 0; m < vd->vdev_ms_count; m++) {
+
+ if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
+ leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
+ }
+
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT3P(mg, ==, msp->ms_group);
- mutex_enter(&msp->ms_lock);
/*
* The ms_tree has been overloaded to
* contain allocated segments. Now that we
* finished traversing all blocks, any
* block that remains in the ms_tree
* represents an allocated block that we
* did not claim during the traversal.
* Claimed blocks would have been removed
- * from the ms_tree.
+ * from the ms_tree. For indirect vdevs,
+ * space remaining in the tree represents
+ * parts of the mapping that are not
+ * referenced, which is not a bug.
*/
- range_tree_vacate(msp->ms_tree, zdb_leak, vd);
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ range_tree_vacate(msp->ms_tree,
+ NULL, NULL);
+ } else {
+ range_tree_vacate(msp->ms_tree,
+ zdb_leak, vd);
+ }
if (msp->ms_loaded) {
msp->ms_loaded = B_FALSE;
}
-
- mutex_exit(&msp->ms_lock);
}
}
+
+ umem_free(zcb->zcb_vd_obsolete_counts,
+ rvd->vdev_children * sizeof (uint32_t *));
+ zcb->zcb_vd_obsolete_counts = NULL;
}
+ return (leaks);
}
/* ARGSUSED */
static int
count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
zdb_cb_t *zcb = arg;
if (dump_opt['b'] >= 5) {
char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("[%s] %s\n",
"deferred free", blkbuf);
}
zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
return (0);
}
static int
dump_block_stats(spa_t *spa)
{
zdb_cb_t zcb;
zdb_blkstats_t *zb, *tzb;
uint64_t norm_alloc, norm_space, total_alloc, total_found;
int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
boolean_t leaks = B_FALSE;
bzero(&zcb, sizeof (zcb));
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
(dump_opt['c'] == 1) ? "metadata " : "",
dump_opt['c'] ? "checksums " : "",
(dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
!dump_opt['L'] ? "nothing leaked " : "");
/*
* Load all space maps as SM_ALLOC maps, then traverse the pool
* claiming each block we discover. If the pool is perfectly
* consistent, the space maps will be empty when we're done.
* Anything left over is a leak; any block we can't claim (because
* it's not part of any space map) is a double allocation,
* reference to a freed block, or an unclaimed log block.
*/
zdb_leak_init(spa, &zcb);
/*
* If there's a deferred-free bplist, process that first.
*/
(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
count_block_cb, &zcb, NULL);
+
if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
count_block_cb, &zcb, NULL);
}
+
+ zdb_claim_removing(spa, &zcb);
+
if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
&zcb, NULL));
}
if (dump_opt['c'] > 1)
flags |= TRAVERSE_PREFETCH_DATA;
zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
/*
* If we've traversed the data blocks then we need to wait for those
* I/Os to complete. We leverage "The Godfather" zio to wait on
* all async I/Os to complete.
*/
if (dump_opt['c']) {
for (int i = 0; i < max_ncpus; i++) {
(void) zio_wait(spa->spa_async_zio_root[i]);
spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
ZIO_FLAG_GODFATHER);
}
}
if (zcb.zcb_haderrors) {
(void) printf("\nError counts:\n\n");
(void) printf("\t%5s %s\n", "errno", "count");
for (int e = 0; e < 256; e++) {
if (zcb.zcb_errors[e] != 0) {
(void) printf("\t%5d %llu\n",
e, (u_longlong_t)zcb.zcb_errors[e]);
}
}
}
/*
* Report any leaked segments.
*/
- zdb_leak_fini(spa);
+ leaks |= zdb_leak_fini(spa, &zcb);
tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
norm_space = metaslab_class_get_space(spa_normal_class(spa));
total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
- total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
+ total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
+ zcb.zcb_removing_size;
if (total_found == total_alloc) {
if (!dump_opt['L'])
(void) printf("\n\tNo leaks (block sum matches space"
" maps exactly)\n");
} else {
(void) printf("block traversal size %llu != alloc %llu "
"(%s %lld)\n",
(u_longlong_t)total_found,
(u_longlong_t)total_alloc,
(dump_opt['L']) ? "unreachable" : "leaked",
(longlong_t)(total_alloc - total_found));
leaks = B_TRUE;
}
if (tzb->zb_count == 0)
return (2);
(void) printf("\n");
(void) printf("\tbp count: %10llu\n",
(u_longlong_t)tzb->zb_count);
(void) printf("\tganged count: %10llu\n",
(longlong_t)tzb->zb_gangs);
(void) printf("\tbp logical: %10llu avg: %6llu\n",
(u_longlong_t)tzb->zb_lsize,
(u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
(void) printf("\tbp physical: %10llu avg:"
" %6llu compression: %6.2f\n",
(u_longlong_t)tzb->zb_psize,
(u_longlong_t)(tzb->zb_psize / tzb->zb_count),
(double)tzb->zb_lsize / tzb->zb_psize);
(void) printf("\tbp allocated: %10llu avg:"
" %6llu compression: %6.2f\n",
(u_longlong_t)tzb->zb_asize,
(u_longlong_t)(tzb->zb_asize / tzb->zb_count),
(double)tzb->zb_lsize / tzb->zb_asize);
(void) printf("\tbp deduped: %10llu ref>1:"
" %6llu deduplication: %6.2f\n",
(u_longlong_t)zcb.zcb_dedup_asize,
(u_longlong_t)zcb.zcb_dedup_blocks,
(double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
(void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
if (zcb.zcb_embedded_blocks[i] == 0)
continue;
(void) printf("\n");
(void) printf("\tadditional, non-pointer bps of type %u: "
"%10llu\n",
i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
if (dump_opt['b'] >= 3) {
(void) printf("\t number of (compressed) bytes: "
"number of bps\n");
dump_histogram(zcb.zcb_embedded_histogram[i],
sizeof (zcb.zcb_embedded_histogram[i]) /
sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
}
}
if (tzb->zb_ditto_samevdev != 0) {
(void) printf("\tDittoed blocks on same vdev: %llu\n",
(longlong_t)tzb->zb_ditto_samevdev);
}
+ for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ if (vim == NULL) {
+ continue;
+ }
+
+ char mem[32];
+ zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
+ mem, vdev_indirect_mapping_size(vim));
+
+ (void) printf("\tindirect vdev id %llu has %llu segments "
+ "(%s in memory)\n",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
+ }
+
if (dump_opt['b'] >= 2) {
int l, t, level;
(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
"\t avg\t comp\t%%Total\tType\n");
for (t = 0; t <= ZDB_OT_TOTAL; t++) {
char csize[32], lsize[32], psize[32], asize[32];
char avg[32], gang[32];
const char *typename;
/* make sure nicenum has enough space */
CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);
if (t < DMU_OT_NUMTYPES)
typename = dmu_ot[t].ot_name;
else
typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
(void) printf("%6s\t%5s\t%5s\t%5s"
"\t%5s\t%5s\t%6s\t%s\n",
"-",
"-",
"-",
"-",
"-",
"-",
"-",
typename);
continue;
}
for (l = ZB_TOTAL - 1; l >= -1; l--) {
level = (l == -1 ? ZB_TOTAL : l);
zb = &zcb.zcb_type[level][t];
if (zb->zb_asize == 0)
continue;
if (dump_opt['b'] < 3 && level != ZB_TOTAL)
continue;
if (level == 0 && zb->zb_asize ==
zcb.zcb_type[ZB_TOTAL][t].zb_asize)
continue;
zdb_nicenum(zb->zb_count, csize,
sizeof (csize));
zdb_nicenum(zb->zb_lsize, lsize,
sizeof (lsize));
zdb_nicenum(zb->zb_psize, psize,
sizeof (psize));
zdb_nicenum(zb->zb_asize, asize,
sizeof (asize));
zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
sizeof (avg));
zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
"\t%5.2f\t%6.2f\t",
csize, lsize, psize, asize, avg,
(double)zb->zb_lsize / zb->zb_psize,
100.0 * zb->zb_asize / tzb->zb_asize);
if (level == ZB_TOTAL)
(void) printf("%s\n", typename);
else
(void) printf(" L%d %s\n",
level, typename);
if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
(void) printf("\t number of ganged "
"blocks: %s\n", gang);
}
if (dump_opt['b'] >= 4) {
(void) printf("psize "
"(in 512-byte sectors): "
"number of blocks\n");
dump_histogram(zb->zb_psize_histogram,
PSIZE_HISTO_SIZE, 0);
}
}
}
}
(void) printf("\n");
if (leaks)
return (2);
if (zcb.zcb_haderrors)
return (3);
return (0);
}
typedef struct zdb_ddt_entry {
ddt_key_t zdde_key;
uint64_t zdde_ref_blocks;
uint64_t zdde_ref_lsize;
uint64_t zdde_ref_psize;
uint64_t zdde_ref_dsize;
avl_node_t zdde_node;
} zdb_ddt_entry_t;
/* ARGSUSED */
static int
zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
avl_tree_t *t = arg;
avl_index_t where;
zdb_ddt_entry_t *zdde, zdde_search;
if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
(void) printf("traversing objset %llu, %llu objects, "
"%lu blocks so far\n",
(u_longlong_t)zb->zb_objset,
(u_longlong_t)BP_GET_FILL(bp),
avl_numnodes(t));
}
if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
return (0);
ddt_key_fill(&zdde_search.zdde_key, bp);
zdde = avl_find(t, &zdde_search, &where);
if (zdde == NULL) {
zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
zdde->zdde_key = zdde_search.zdde_key;
avl_insert(t, zdde, where);
}
zdde->zdde_ref_blocks += 1;
zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
return (0);
}
static void
dump_simulated_ddt(spa_t *spa)
{
avl_tree_t t;
void *cookie = NULL;
zdb_ddt_entry_t *zdde;
ddt_histogram_t ddh_total;
ddt_stat_t dds_total;
bzero(&ddh_total, sizeof (ddh_total));
bzero(&dds_total, sizeof (dds_total));
avl_create(&t, ddt_entry_compare,
sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
zdb_ddt_add_cb, &t);
spa_config_exit(spa, SCL_CONFIG, FTAG);
while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
ddt_stat_t dds;
uint64_t refcnt = zdde->zdde_ref_blocks;
ASSERT(refcnt != 0);
dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
dds.dds_psize = zdde->zdde_ref_psize / refcnt;
dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
dds.dds_ref_blocks = zdde->zdde_ref_blocks;
dds.dds_ref_lsize = zdde->zdde_ref_lsize;
dds.dds_ref_psize = zdde->zdde_ref_psize;
dds.dds_ref_dsize = zdde->zdde_ref_dsize;
ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
&dds, 0);
umem_free(zdde, sizeof (*zdde));
}
avl_destroy(&t);
ddt_histogram_stat(&dds_total, &ddh_total);
(void) printf("Simulated DDT histogram:\n");
zpool_dump_ddt(&dds_total, &ddh_total);
dump_dedup_ratio(&dds_total);
}
+static int
+verify_device_removal_feature_counts(spa_t *spa)
+{
+ uint64_t dr_feature_refcount = 0;
+ uint64_t oc_feature_refcount = 0;
+ uint64_t indirect_vdev_count = 0;
+ uint64_t precise_vdev_count = 0;
+ uint64_t obsolete_counts_object_count = 0;
+ uint64_t obsolete_sm_count = 0;
+ uint64_t obsolete_counts_count = 0;
+ uint64_t scip_count = 0;
+ uint64_t obsolete_bpobj_count = 0;
+ int ret = 0;
+
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ if (scip->scip_next_mapping_object != 0) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ (void) printf("Condensing indirect vdev %llu: new mapping "
+ "object %llu, prev obsolete sm %llu\n",
+ (u_longlong_t)scip->scip_vdev,
+ (u_longlong_t)scip->scip_next_mapping_object,
+ (u_longlong_t)scip->scip_prev_obsolete_sm_object);
+ if (scip->scip_prev_obsolete_sm_object != 0) {
+ space_map_t *prev_obsolete_sm = NULL;
+ VERIFY0(space_map_open(&prev_obsolete_sm,
+ spa->spa_meta_objset,
+ scip->scip_prev_obsolete_sm_object,
+ 0, vd->vdev_asize, 0));
+ space_map_update(prev_obsolete_sm);
+ dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
+ (void) printf("\n");
+ space_map_close(prev_obsolete_sm);
+ }
+
+ scip_count += 2;
+ }
+
+ for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ if (vic->vic_mapping_object != 0) {
+ ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
+ vd->vdev_removing);
+ indirect_vdev_count++;
+
+ if (vd->vdev_indirect_mapping->vim_havecounts) {
+ obsolete_counts_count++;
+ }
+ }
+ if (vdev_obsolete_counts_are_precise(vd)) {
+ ASSERT(vic->vic_mapping_object != 0);
+ precise_vdev_count++;
+ }
+ if (vdev_obsolete_sm_object(vd) != 0) {
+ ASSERT(vic->vic_mapping_object != 0);
+ obsolete_sm_count++;
+ }
+ }
+
+ (void) feature_get_refcount(spa,
+ &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
+ &dr_feature_refcount);
+ (void) feature_get_refcount(spa,
+ &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
+ &oc_feature_refcount);
+
+ if (dr_feature_refcount != indirect_vdev_count) {
+ ret = 1;
+ (void) printf("Number of indirect vdevs (%llu) " \
+ "does not match feature count (%llu)\n",
+ (u_longlong_t)indirect_vdev_count,
+ (u_longlong_t)dr_feature_refcount);
+ } else {
+ (void) printf("Verified device_removal feature refcount " \
+ "of %llu is correct\n",
+ (u_longlong_t)dr_feature_refcount);
+ }
+
+ if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ) == 0) {
+ obsolete_bpobj_count++;
+ }
+
+
+ obsolete_counts_object_count = precise_vdev_count;
+ obsolete_counts_object_count += obsolete_sm_count;
+ obsolete_counts_object_count += obsolete_counts_count;
+ obsolete_counts_object_count += scip_count;
+ obsolete_counts_object_count += obsolete_bpobj_count;
+ obsolete_counts_object_count += remap_deadlist_count;
+
+ if (oc_feature_refcount != obsolete_counts_object_count) {
+ ret = 1;
+ (void) printf("Number of obsolete counts objects (%llu) " \
+ "does not match feature count (%llu)\n",
+ (u_longlong_t)obsolete_counts_object_count,
+ (u_longlong_t)oc_feature_refcount);
+ (void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
+ "ob:%llu rd:%llu\n",
+ (u_longlong_t)precise_vdev_count,
+ (u_longlong_t)obsolete_sm_count,
+ (u_longlong_t)obsolete_counts_count,
+ (u_longlong_t)scip_count,
+ (u_longlong_t)obsolete_bpobj_count,
+ (u_longlong_t)remap_deadlist_count);
+ } else {
+ (void) printf("Verified indirect_refcount feature refcount " \
+ "of %llu is correct\n",
+ (u_longlong_t)oc_feature_refcount);
+ }
+ return (ret);
+}
+
static void
dump_zpool(spa_t *spa)
{
dsl_pool_t *dp = spa_get_dsl(spa);
int rc = 0;
if (dump_opt['S']) {
dump_simulated_ddt(spa);
return;
}
if (!dump_opt['e'] && dump_opt['C'] > 1) {
(void) printf("\nCached configuration:\n");
dump_nvlist(spa->spa_config, 8);
}
if (dump_opt['C'])
dump_config(spa);
if (dump_opt['u'])
dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
if (dump_opt['D'])
dump_all_ddts(spa);
if (dump_opt['d'] > 2 || dump_opt['m'])
dump_metaslabs(spa);
if (dump_opt['M'])
dump_metaslab_groups(spa);
if (dump_opt['d'] || dump_opt['i']) {
dump_dir(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
+ dsl_pool_t *dp = spa->spa_dsl_pool;
dump_full_bpobj(&spa->spa_deferred_bpobj,
"Deferred frees", 0);
if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
- dump_full_bpobj(
- &spa->spa_dsl_pool->dp_free_bpobj,
+ dump_full_bpobj(&dp->dp_free_bpobj,
"Pool snapshot frees", 0);
}
+ if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_DEVICE_REMOVAL));
+ dump_full_bpobj(&dp->dp_obsolete_bpobj,
+ "Pool obsolete blocks", 0);
+ }
if (spa_feature_is_active(spa,
SPA_FEATURE_ASYNC_DESTROY)) {
dump_bptree(spa->spa_meta_objset,
- spa->spa_dsl_pool->dp_bptree_obj,
+ dp->dp_bptree_obj,
"Pool dataset frees");
}
dump_dtl(spa->spa_root_vdev, 0);
}
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
uint64_t refcount;
if (!(spa_feature_table[f].fi_flags &
ZFEATURE_FLAG_PER_DATASET)) {
ASSERT0(dataset_feature_count[f]);
continue;
}
(void) feature_get_refcount(spa,
&spa_feature_table[f], &refcount);
if (dataset_feature_count[f] != refcount) {
(void) printf("%s feature refcount mismatch: "
"%lld datasets != %lld refcount\n",
spa_feature_table[f].fi_uname,
(longlong_t)dataset_feature_count[f],
(longlong_t)refcount);
rc = 2;
} else {
(void) printf("Verified %s feature refcount "
"of %llu is correct\n",
spa_feature_table[f].fi_uname,
(longlong_t)refcount);
}
}
+
+ if (rc == 0) {
+ rc = verify_device_removal_feature_counts(spa);
+ }
}
if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
rc = dump_block_stats(spa);
if (rc == 0)
rc = verify_spacemap_refcounts(spa);
if (dump_opt['s'])
show_pool_stats(spa);
if (dump_opt['h'])
dump_history(spa);
if (rc != 0) {
dump_debug_buffer();
exit(rc);
}
}
#define ZDB_FLAG_CHECKSUM 0x0001
#define ZDB_FLAG_DECOMPRESS 0x0002
#define ZDB_FLAG_BSWAP 0x0004
#define ZDB_FLAG_GBH 0x0008
#define ZDB_FLAG_INDIRECT 0x0010
#define ZDB_FLAG_PHYS 0x0020
#define ZDB_FLAG_RAW 0x0040
#define ZDB_FLAG_PRINT_BLKPTR 0x0080
static int flagbits[256];
static void
zdb_print_blkptr(blkptr_t *bp, int flags)
{
char blkbuf[BP_SPRINTF_LEN];
if (flags & ZDB_FLAG_BSWAP)
byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("%s\n", blkbuf);
}
static void
zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
{
int i;
for (i = 0; i < nbps; i++)
zdb_print_blkptr(&bp[i], flags);
}
static void
zdb_dump_gbh(void *buf, int flags)
{
zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
}
static void
zdb_dump_block_raw(void *buf, uint64_t size, int flags)
{
if (flags & ZDB_FLAG_BSWAP)
byteswap_uint64_array(buf, size);
(void) write(1, buf, size);
}
static void
zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
{
uint64_t *d = (uint64_t *)buf;
unsigned nwords = size / sizeof (uint64_t);
int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
unsigned i, j;
const char *hdr;
char *c;
if (do_bswap)
hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
else
hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
(void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
for (i = 0; i < nwords; i += 2) {
(void) printf("%06llx: %016llx %016llx ",
(u_longlong_t)(i * sizeof (uint64_t)),
(u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
(u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
c = (char *)&d[i];
for (j = 0; j < 2 * sizeof (uint64_t); j++)
(void) printf("%c", isprint(c[j]) ? c[j] : '.');
(void) printf("\n");
}
}
/*
* There are two acceptable formats:
* leaf_name - For example: c1t0d0 or /tmp/ztest.0a
* child[.child]* - For example: 0.1.1
*
* The second form can be used to specify arbitrary vdevs anywhere
* in the heirarchy. For example, in a pool with a mirror of
* RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
*/
static vdev_t *
zdb_vdev_lookup(vdev_t *vdev, const char *path)
{
char *s, *p, *q;
unsigned i;
if (vdev == NULL)
return (NULL);
/* First, assume the x.x.x.x format */
i = strtoul(path, &s, 10);
if (s == path || (s && *s != '.' && *s != '\0'))
goto name;
if (i >= vdev->vdev_children)
return (NULL);
vdev = vdev->vdev_child[i];
if (*s == '\0')
return (vdev);
return (zdb_vdev_lookup(vdev, s+1));
name:
for (i = 0; i < vdev->vdev_children; i++) {
vdev_t *vc = vdev->vdev_child[i];
if (vc->vdev_path == NULL) {
vc = zdb_vdev_lookup(vc, path);
if (vc == NULL)
continue;
else
return (vc);
}
p = strrchr(vc->vdev_path, '/');
p = p ? p + 1 : vc->vdev_path;
q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
if (strcmp(vc->vdev_path, path) == 0)
return (vc);
if (strcmp(p, path) == 0)
return (vc);
if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
return (vc);
}
return (NULL);
}
/* ARGSUSED */
static int
random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
{
return (random_get_pseudo_bytes(buf, len));
}
/*
* Read a block from a pool and print it out. The syntax of the
* block descriptor is:
*
* pool:vdev_specifier:offset:size[:flags]
*
* pool - The name of the pool you wish to read from
* vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
* offset - offset, in hex, in bytes
* size - Amount of data to read, in hex, in bytes
* flags - A string of characters specifying options
* b: Decode a blkptr at given offset within block
* *c: Calculate and display checksums
* d: Decompress data before dumping
* e: Byteswap data before dumping
* g: Display data as a gang block header
* i: Display as an indirect block
* p: Do I/O to physical offset
* r: Dump raw data to stdout
*
* * = not yet implemented
*/
static void
zdb_read_block(char *thing, spa_t *spa)
{
blkptr_t blk, *bp = &blk;
dva_t *dva = bp->blk_dva;
int flags = 0;
uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio;
vdev_t *vd;
abd_t *pabd;
void *lbuf, *buf;
const char *s, *vdev;
char *p, *dup, *flagstr;
int i, error;
dup = strdup(thing);
s = strtok(dup, ":");
vdev = s ? s : "";
s = strtok(NULL, ":");
offset = strtoull(s ? s : "", NULL, 16);
s = strtok(NULL, ":");
size = strtoull(s ? s : "", NULL, 16);
s = strtok(NULL, ":");
if (s)
flagstr = strdup(s);
else
flagstr = strdup("");
s = NULL;
if (size == 0)
s = "size must not be zero";
if (!IS_P2ALIGNED(size, DEV_BSIZE))
s = "size must be a multiple of sector size";
if (!IS_P2ALIGNED(offset, DEV_BSIZE))
s = "offset must be a multiple of sector size";
if (s) {
(void) printf("Invalid block specifier: %s - %s\n", thing, s);
free(flagstr);
free(dup);
return;
}
for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
for (i = 0; flagstr[i]; i++) {
int bit = flagbits[(uchar_t)flagstr[i]];
if (bit == 0) {
(void) printf("***Invalid flag: %c\n",
flagstr[i]);
continue;
}
flags |= bit;
/* If it's not something with an argument, keep going */
if ((bit & (ZDB_FLAG_CHECKSUM |
ZDB_FLAG_PRINT_BLKPTR)) == 0)
continue;
p = &flagstr[i + 1];
if (bit == ZDB_FLAG_PRINT_BLKPTR)
blkptr_offset = strtoull(p, &p, 16);
if (*p != ':' && *p != '\0') {
(void) printf("***Invalid flag arg: '%s'\n", s);
free(flagstr);
free(dup);
return;
}
i += p - &flagstr[i + 1]; /* skip over the number */
}
}
free(flagstr);
vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
if (vd == NULL) {
(void) printf("***Invalid vdev: %s\n", vdev);
free(dup);
return;
} else {
if (vd->vdev_path)
(void) fprintf(stderr, "Found vdev: %s\n",
vd->vdev_path);
else
(void) fprintf(stderr, "Found vdev type: %s\n",
vd->vdev_ops->vdev_op_type);
}
psize = size;
lsize = size;
pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
BP_ZERO(bp);
DVA_SET_VDEV(&dva[0], vd->vdev_id);
DVA_SET_OFFSET(&dva[0], offset);
DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
BP_SET_LSIZE(bp, lsize);
BP_SET_PSIZE(bp, psize);
BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
BP_SET_TYPE(bp, DMU_OT_NONE);
BP_SET_LEVEL(bp, 0);
BP_SET_DEDUP(bp, 0);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
zio = zio_root(spa, NULL, NULL, 0);
if (vd == vd->vdev_top) {
/*
* Treat this as a normal block read.
*/
zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
} else {
/*
* Treat this as a vdev child I/O.
*/
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
- ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
+ NULL, NULL));
}
error = zio_wait(zio);
spa_config_exit(spa, SCL_STATE, FTAG);
if (error) {
(void) printf("Read of %s failed, error: %d\n", thing, error);
goto out;
}
if (flags & ZDB_FLAG_DECOMPRESS) {
/*
* We don't know how the data was compressed, so just try
* every decompress function at every inflated blocksize.
*/
enum zio_compress c;
void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
abd_copy_to_buf(pbuf2, pabd, psize);
VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
random_get_pseudo_bytes_cb, NULL));
VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
SPA_MAXBLOCKSIZE - psize));
for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
lsize -= SPA_MINBLOCKSIZE) {
for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
if (zio_decompress_data(c, pabd,
lbuf, psize, lsize) == 0 &&
zio_decompress_data_buf(c, pbuf2,
lbuf2, psize, lsize) == 0 &&
bcmp(lbuf, lbuf2, lsize) == 0)
break;
}
if (c != ZIO_COMPRESS_FUNCTIONS)
break;
lsize -= SPA_MINBLOCKSIZE;
}
umem_free(pbuf2, SPA_MAXBLOCKSIZE);
umem_free(lbuf2, SPA_MAXBLOCKSIZE);
if (lsize <= psize) {
(void) printf("Decompress of %s failed\n", thing);
goto out;
}
buf = lbuf;
size = lsize;
} else {
buf = abd_to_buf(pabd);
size = psize;
}
if (flags & ZDB_FLAG_PRINT_BLKPTR)
zdb_print_blkptr((blkptr_t *)(void *)
((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
else if (flags & ZDB_FLAG_RAW)
zdb_dump_block_raw(buf, size, flags);
else if (flags & ZDB_FLAG_INDIRECT)
zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
flags);
else if (flags & ZDB_FLAG_GBH)
zdb_dump_gbh(buf, flags);
else
zdb_dump_block(thing, buf, size, flags);
out:
abd_free(pabd);
umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup);
}
static void
zdb_embedded_block(char *thing)
{
blkptr_t bp;
unsigned long long *words = (void *)&bp;
char *buf;
int err;
bzero(&bp, sizeof (bp));
err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
"%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
words + 0, words + 1, words + 2, words + 3,
words + 4, words + 5, words + 6, words + 7,
words + 8, words + 9, words + 10, words + 11,
words + 12, words + 13, words + 14, words + 15);
if (err != 16) {
(void) printf("invalid input format\n");
exit(1);
}
ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
buf = malloc(SPA_MAXBLOCKSIZE);
if (buf == NULL) {
(void) fprintf(stderr, "%s: failed to allocate %llu bytes\n",
__func__, SPA_MAXBLOCKSIZE);
exit(1);
}
err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
if (err != 0) {
(void) printf("decode failed: %u\n", err);
free(buf);
exit(1);
}
zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
free(buf);
}
static boolean_t
pool_match(nvlist_t *cfg, char *tgt)
{
uint64_t v, guid = strtoull(tgt, NULL, 0);
char *s;
if (guid != 0) {
if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
return (v == guid);
} else {
if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
return (strcmp(s, tgt) == 0);
}
return (B_FALSE);
}
static char *
find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
{
nvlist_t *pools;
nvlist_t *match = NULL;
char *name = NULL;
char *sepp = NULL;
char sep = '\0';
int count = 0;
importargs_t args;
bzero(&args, sizeof (args));
args.paths = dirc;
args.path = dirv;
args.can_be_active = B_TRUE;
if ((sepp = strpbrk(*target, "/@")) != NULL) {
sep = *sepp;
*sepp = '\0';
}
pools = zpool_search_import(g_zfs, &args);
if (pools != NULL) {
nvpair_t *elem = NULL;
while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
verify(nvpair_value_nvlist(elem, configp) == 0);
if (pool_match(*configp, *target)) {
count++;
if (match != NULL) {
/* print previously found config */
if (name != NULL) {
(void) printf("%s\n", name);
dump_nvlist(match, 8);
name = NULL;
}
(void) printf("%s\n",
nvpair_name(elem));
dump_nvlist(*configp, 8);
} else {
match = *configp;
name = nvpair_name(elem);
}
}
}
}
if (count > 1)
(void) fatal("\tMatched %d pools - use pool GUID "
"instead of pool name or \n"
"\tpool name part of a dataset name to select pool", count);
if (sepp)
*sepp = sep;
/*
* If pool GUID was specified for pool id, replace it with pool name
*/
if (name && (strstr(*target, name) != *target)) {
int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);
*target = umem_alloc(sz, UMEM_NOFAIL);
(void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
}
*configp = name ? match : NULL;
return (name);
}
int
main(int argc, char **argv)
{
int c;
struct rlimit rl = { 1024, 1024 };
spa_t *spa = NULL;
objset_t *os = NULL;
int dump_all = 1;
int verbose = 0;
int error = 0;
char **searchdirs = NULL;
int nsearch = 0;
char *target;
nvlist_t *policy = NULL;
uint64_t max_txg = UINT64_MAX;
int flags = ZFS_IMPORT_MISSING_LOG;
int rewind = ZPOOL_NEVER_REWIND;
char *spa_config_path_env;
boolean_t target_is_spa = B_TRUE;
(void) setrlimit(RLIMIT_NOFILE, &rl);
(void) enable_extended_FILE_stdio(-1, -1);
dprintf_setup(&argc, argv);
/*
* If there is an environment variable SPA_CONFIG_PATH it overrides
* default spa_config_path setting. If -U flag is specified it will
* override this environment variable settings once again.
*/
spa_config_path_env = getenv("SPA_CONFIG_PATH");
if (spa_config_path_env != NULL)
spa_config_path = spa_config_path_env;
while ((c = getopt(argc, argv,
"AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
switch (c) {
case 'b':
case 'c':
case 'C':
case 'd':
case 'D':
case 'E':
case 'G':
case 'h':
case 'i':
case 'l':
case 'm':
case 'M':
case 'O':
case 'R':
case 's':
case 'S':
case 'u':
dump_opt[c]++;
dump_all = 0;
break;
case 'A':
case 'e':
case 'F':
case 'L':
case 'P':
case 'q':
case 'X':
dump_opt[c]++;
break;
/* NB: Sort single match options below. */
case 'I':
max_inflight = strtoull(optarg, NULL, 0);
if (max_inflight == 0) {
(void) fprintf(stderr, "maximum number "
"of inflight I/Os must be greater "
"than 0\n");
usage();
}
break;
case 'o':
error = set_global_var(optarg);
if (error != 0)
usage();
break;
case 'p':
if (searchdirs == NULL) {
searchdirs = umem_alloc(sizeof (char *),
UMEM_NOFAIL);
} else {
char **tmp = umem_alloc((nsearch + 1) *
sizeof (char *), UMEM_NOFAIL);
bcopy(searchdirs, tmp, nsearch *
sizeof (char *));
umem_free(searchdirs,
nsearch * sizeof (char *));
searchdirs = tmp;
}
searchdirs[nsearch++] = optarg;
break;
case 't':
max_txg = strtoull(optarg, NULL, 0);
if (max_txg < TXG_INITIAL) {
(void) fprintf(stderr, "incorrect txg "
"specified: %s\n", optarg);
usage();
}
break;
case 'U':
spa_config_path = optarg;
if (spa_config_path[0] != '/') {
(void) fprintf(stderr,
"cachefile must be an absolute path "
"(i.e. start with a slash)\n");
usage();
}
break;
case 'v':
verbose++;
break;
case 'V':
flags = ZFS_IMPORT_VERBATIM;
break;
case 'x':
vn_dumpdir = optarg;
break;
default:
usage();
break;
}
}
if (!dump_opt['e'] && searchdirs != NULL) {
(void) fprintf(stderr, "-p option requires use of -e\n");
usage();
}
/*
* ZDB does not typically re-read blocks; therefore limit the ARC
* to 256 MB, which can be used entirely for metadata.
*/
zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
/*
* "zdb -c" uses checksum-verifying scrub i/os which are async reads.
* "zdb -b" uses traversal prefetch which uses async reads.
* For good performance, let several of them be active at once.
*/
zfs_vdev_async_read_max_active = 10;
/*
* Disable reference tracking for better performance.
*/
reference_tracking_enable = B_FALSE;
kernel_init(FREAD);
g_zfs = libzfs_init();
if (g_zfs == NULL)
fatal("Fail to initialize zfs");
if (dump_all)
verbose = MAX(verbose, 1);
for (c = 0; c < 256; c++) {
if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
}
aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
zfs_recover = (dump_opt['A'] > 1);
argc -= optind;
argv += optind;
if (argc < 2 && dump_opt['R'])
usage();
if (dump_opt['E']) {
if (argc != 1)
usage();
zdb_embedded_block(argv[0]);
return (0);
}
if (argc < 1) {
if (!dump_opt['e'] && dump_opt['C']) {
dump_cachefile(spa_config_path);
return (0);
}
usage();
}
if (dump_opt['l'])
return (dump_label(argv[0]));
if (dump_opt['O']) {
if (argc != 2)
usage();
dump_opt['v'] = verbose + 3;
return (dump_path(argv[0], argv[1]));
}
if (dump_opt['X'] || dump_opt['F'])
rewind = ZPOOL_DO_REWIND |
(dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
fatal("internal error: %s", strerror(ENOMEM));
error = 0;
target = argv[0];
if (dump_opt['e']) {
nvlist_t *cfg = NULL;
char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
error = ENOENT;
if (name) {
if (dump_opt['C'] > 1) {
(void) printf("\nConfiguration for import:\n");
dump_nvlist(cfg, 8);
}
if (nvlist_add_nvlist(cfg,
ZPOOL_REWIND_POLICY, policy) != 0) {
fatal("can't open '%s': %s",
target, strerror(ENOMEM));
}
error = spa_import(name, cfg, NULL, flags);
}
}
if (strpbrk(target, "/@") != NULL) {
size_t targetlen;
target_is_spa = B_FALSE;
/*
* Remove any trailing slash. Later code would get confused
* by it, but we want to allow it so that "pool/" can
* indicate that we want to dump the topmost filesystem,
* rather than the whole pool.
*/
targetlen = strlen(target);
if (targetlen != 0 && target[targetlen - 1] == '/')
target[targetlen - 1] = '\0';
}
if (error == 0) {
if (target_is_spa || dump_opt['R']) {
error = spa_open_rewind(target, &spa, FTAG, policy,
NULL);
if (error) {
/*
* If we're missing the log device then
* try opening the pool after clearing the
* log state.
*/
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(target)) != NULL &&
spa->spa_log_state == SPA_LOG_MISSING) {
spa->spa_log_state = SPA_LOG_CLEAR;
error = 0;
}
mutex_exit(&spa_namespace_lock);
if (!error) {
error = spa_open_rewind(target, &spa,
FTAG, policy, NULL);
}
}
} else {
error = open_objset(target, DMU_OST_ANY, FTAG, &os);
}
}
nvlist_free(policy);
if (error)
fatal("can't open '%s': %s", target, strerror(error));
argv++;
argc--;
if (!dump_opt['R']) {
if (argc > 0) {
zopt_objects = argc;
zopt_object = calloc(zopt_objects, sizeof (uint64_t));
for (unsigned i = 0; i < zopt_objects; i++) {
errno = 0;
zopt_object[i] = strtoull(argv[i], NULL, 0);
if (zopt_object[i] == 0 && errno != 0)
fatal("bad number %s: %s",
argv[i], strerror(errno));
}
}
if (os != NULL) {
dump_dir(os);
} else if (zopt_objects > 0 && !dump_opt['m']) {
dump_dir(spa->spa_meta_objset);
} else {
dump_zpool(spa);
}
} else {
flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
flagbits['c'] = ZDB_FLAG_CHECKSUM;
flagbits['d'] = ZDB_FLAG_DECOMPRESS;
flagbits['e'] = ZDB_FLAG_BSWAP;
flagbits['g'] = ZDB_FLAG_GBH;
flagbits['i'] = ZDB_FLAG_INDIRECT;
flagbits['p'] = ZDB_FLAG_PHYS;
flagbits['r'] = ZDB_FLAG_RAW;
for (int i = 0; i < argc; i++)
zdb_read_block(argv[i], spa);
}
if (os != NULL)
close_objset(os, FTAG);
else
spa_close(spa, FTAG);
fuid_table_destroy();
dump_debug_buffer();
libzfs_fini(g_zfs);
kernel_fini();
return (0);
}
Index: stable/11/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
===================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c (revision 332525)
@@ -1,7434 +1,7457 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2012 Milan Jurik. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
* Copyright 2016 Nexenta Systems, Inc.
*/
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <getopt.h>
#include <libgen.h>
#include <libintl.h>
#include <libuutil.h>
#include <libnvpair.h>
#include <locale.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <fcntl.h>
#include <zone.h>
#include <grp.h>
#include <pwd.h>
#include <signal.h>
#include <sys/debug.h>
#include <sys/list.h>
#include <sys/mntent.h>
#include <sys/mnttab.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/fs/zfs.h>
#include <sys/types.h>
#include <time.h>
#include <err.h>
#include <jail.h>
#include <libzfs.h>
#include <libzfs_core.h>
#include <zfs_prop.h>
#include <zfs_deleg.h>
#include <libuutil.h>
#ifdef illumos
#include <aclutils.h>
#include <directory.h>
#include <idmap.h>
#endif
#include "zfs_iter.h"
#include "zfs_util.h"
#include "zfs_comutil.h"
libzfs_handle_t *g_zfs;
static FILE *mnttab_file;
static char history_str[HIS_MAX_RECORD_LEN];
static boolean_t log_history = B_TRUE;
static int zfs_do_clone(int argc, char **argv);
static int zfs_do_create(int argc, char **argv);
static int zfs_do_destroy(int argc, char **argv);
static int zfs_do_get(int argc, char **argv);
static int zfs_do_inherit(int argc, char **argv);
static int zfs_do_list(int argc, char **argv);
static int zfs_do_mount(int argc, char **argv);
static int zfs_do_rename(int argc, char **argv);
static int zfs_do_rollback(int argc, char **argv);
static int zfs_do_set(int argc, char **argv);
static int zfs_do_upgrade(int argc, char **argv);
static int zfs_do_snapshot(int argc, char **argv);
static int zfs_do_unmount(int argc, char **argv);
static int zfs_do_share(int argc, char **argv);
static int zfs_do_unshare(int argc, char **argv);
static int zfs_do_send(int argc, char **argv);
static int zfs_do_receive(int argc, char **argv);
static int zfs_do_promote(int argc, char **argv);
static int zfs_do_userspace(int argc, char **argv);
static int zfs_do_allow(int argc, char **argv);
static int zfs_do_unallow(int argc, char **argv);
static int zfs_do_hold(int argc, char **argv);
static int zfs_do_holds(int argc, char **argv);
static int zfs_do_release(int argc, char **argv);
static int zfs_do_diff(int argc, char **argv);
static int zfs_do_jail(int argc, char **argv);
static int zfs_do_unjail(int argc, char **argv);
static int zfs_do_bookmark(int argc, char **argv);
+static int zfs_do_remap(int argc, char **argv);
static int zfs_do_channel_program(int argc, char **argv);
/*
* Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
*/
#ifdef DEBUG
const char *
_umem_debug_init(void)
{
return ("default,verbose"); /* $UMEM_DEBUG setting */
}
const char *
_umem_logging_init(void)
{
return ("fail,contents"); /* $UMEM_LOGGING setting */
}
#endif
typedef enum {
HELP_CLONE,
HELP_CREATE,
HELP_DESTROY,
HELP_GET,
HELP_INHERIT,
HELP_UPGRADE,
HELP_JAIL,
HELP_UNJAIL,
HELP_LIST,
HELP_MOUNT,
HELP_PROMOTE,
HELP_RECEIVE,
HELP_RENAME,
HELP_ROLLBACK,
HELP_SEND,
HELP_SET,
HELP_SHARE,
HELP_SNAPSHOT,
HELP_UNMOUNT,
HELP_UNSHARE,
HELP_ALLOW,
HELP_UNALLOW,
HELP_USERSPACE,
HELP_GROUPSPACE,
HELP_HOLD,
HELP_HOLDS,
HELP_RELEASE,
HELP_DIFF,
+ HELP_REMAP,
HELP_BOOKMARK,
HELP_CHANNEL_PROGRAM,
} zfs_help_t;
typedef struct zfs_command {
const char *name;
int (*func)(int argc, char **argv);
zfs_help_t usage;
} zfs_command_t;
/*
* Master command table. Each ZFS command has a name, associated function, and
* usage message. The usage messages need to be internationalized, so we have
* to have a function to return the usage message based on a command index.
*
* These commands are organized according to how they are displayed in the usage
* message. An empty command (one with a NULL name) indicates an empty line in
* the generic usage message.
*/
static zfs_command_t command_table[] = {
{ "create", zfs_do_create, HELP_CREATE },
{ "destroy", zfs_do_destroy, HELP_DESTROY },
{ NULL },
{ "snapshot", zfs_do_snapshot, HELP_SNAPSHOT },
{ "rollback", zfs_do_rollback, HELP_ROLLBACK },
{ "clone", zfs_do_clone, HELP_CLONE },
{ "promote", zfs_do_promote, HELP_PROMOTE },
{ "rename", zfs_do_rename, HELP_RENAME },
{ "bookmark", zfs_do_bookmark, HELP_BOOKMARK },
{ "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM },
{ NULL },
{ "list", zfs_do_list, HELP_LIST },
{ NULL },
{ "set", zfs_do_set, HELP_SET },
{ "get", zfs_do_get, HELP_GET },
{ "inherit", zfs_do_inherit, HELP_INHERIT },
{ "upgrade", zfs_do_upgrade, HELP_UPGRADE },
{ "userspace", zfs_do_userspace, HELP_USERSPACE },
{ "groupspace", zfs_do_userspace, HELP_GROUPSPACE },
{ NULL },
{ "mount", zfs_do_mount, HELP_MOUNT },
{ "unmount", zfs_do_unmount, HELP_UNMOUNT },
{ "share", zfs_do_share, HELP_SHARE },
{ "unshare", zfs_do_unshare, HELP_UNSHARE },
{ NULL },
{ "send", zfs_do_send, HELP_SEND },
{ "receive", zfs_do_receive, HELP_RECEIVE },
{ NULL },
{ "allow", zfs_do_allow, HELP_ALLOW },
{ NULL },
{ "unallow", zfs_do_unallow, HELP_UNALLOW },
{ NULL },
{ "hold", zfs_do_hold, HELP_HOLD },
{ "holds", zfs_do_holds, HELP_HOLDS },
{ "release", zfs_do_release, HELP_RELEASE },
{ "diff", zfs_do_diff, HELP_DIFF },
{ NULL },
{ "jail", zfs_do_jail, HELP_JAIL },
{ "unjail", zfs_do_unjail, HELP_UNJAIL },
+ { "remap", zfs_do_remap, HELP_REMAP },
};
#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
zfs_command_t *current_command;
static const char *
get_usage(zfs_help_t idx)
{
switch (idx) {
case HELP_CLONE:
return (gettext("\tclone [-p] [-o property=value] ... "
"<snapshot> <filesystem|volume>\n"));
case HELP_CREATE:
return (gettext("\tcreate [-pu] [-o property=value] ... "
"<filesystem>\n"
"\tcreate [-ps] [-b blocksize] [-o property=value] ... "
"-V <size> <volume>\n"));
case HELP_DESTROY:
return (gettext("\tdestroy [-fnpRrv] <filesystem|volume>\n"
"\tdestroy [-dnpRrv] "
"<filesystem|volume>@<snap>[%<snap>][,...]\n"
"\tdestroy <filesystem|volume>#<bookmark>\n"));
case HELP_GET:
return (gettext("\tget [-rHp] [-d max] "
"[-o \"all\" | field[,...]]\n"
"\t [-t type[,...]] [-s source[,...]]\n"
"\t <\"all\" | property[,...]> "
"[filesystem|volume|snapshot|bookmark] ...\n"));
case HELP_INHERIT:
return (gettext("\tinherit [-rS] <property> "
"<filesystem|volume|snapshot> ...\n"));
case HELP_UPGRADE:
return (gettext("\tupgrade [-v]\n"
"\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
case HELP_JAIL:
return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
case HELP_UNJAIL:
return (gettext("\tunjail <jailid|jailname> <filesystem>\n"));
case HELP_LIST:
return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] "
"[-s property]...\n\t [-S property]... [-t type[,...]] "
"[filesystem|volume|snapshot] ...\n"));
case HELP_MOUNT:
return (gettext("\tmount\n"
"\tmount [-vO] [-o opts] <-a | filesystem>\n"));
case HELP_PROMOTE:
return (gettext("\tpromote <clone-filesystem>\n"));
case HELP_RECEIVE:
return (gettext("\treceive|recv [-vnsFu] <filesystem|volume|"
"snapshot>\n"
"\treceive|recv [-vnsFu] [-o origin=<snapshot>] [-d | -e] "
"<filesystem>\n"
"\treceive|recv -A <filesystem|volume>\n"));
case HELP_RENAME:
return (gettext("\trename [-f] <filesystem|volume|snapshot> "
"<filesystem|volume|snapshot>\n"
"\trename [-f] -p <filesystem|volume> <filesystem|volume>\n"
"\trename -r <snapshot> <snapshot>\n"
"\trename -u [-p] <filesystem> <filesystem>"));
case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND:
return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] "
"<snapshot>\n"
"\tsend [-Le] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"
"\tsend [-nvPe] -t <receive_resume_token>\n"));
case HELP_SET:
return (gettext("\tset <property=value> ... "
"<filesystem|volume|snapshot> ...\n"));
case HELP_SHARE:
return (gettext("\tshare <-a | filesystem>\n"));
case HELP_SNAPSHOT:
return (gettext("\tsnapshot|snap [-r] [-o property=value] ... "
"<filesystem|volume>@<snap> ...\n"));
case HELP_UNMOUNT:
return (gettext("\tunmount|umount [-f] "
"<-a | filesystem|mountpoint>\n"));
case HELP_UNSHARE:
return (gettext("\tunshare "
"<-a | filesystem|mountpoint>\n"));
case HELP_ALLOW:
return (gettext("\tallow <filesystem|volume>\n"
"\tallow [-ldug] "
"<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
"\t <filesystem|volume>\n"
"\tallow [-ld] -e <perm|@setname>[,...] "
"<filesystem|volume>\n"
"\tallow -c <perm|@setname>[,...] <filesystem|volume>\n"
"\tallow -s @setname <perm|@setname>[,...] "
"<filesystem|volume>\n"));
case HELP_UNALLOW:
return (gettext("\tunallow [-rldug] "
"<\"everyone\"|user|group>[,...]\n"
"\t [<perm|@setname>[,...]] <filesystem|volume>\n"
"\tunallow [-rld] -e [<perm|@setname>[,...]] "
"<filesystem|volume>\n"
"\tunallow [-r] -c [<perm|@setname>[,...]] "
"<filesystem|volume>\n"
"\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
"<filesystem|volume>\n"));
case HELP_USERSPACE:
return (gettext("\tuserspace [-Hinp] [-o field[,...]] "
"[-s field] ...\n"
"\t [-S field] ... [-t type[,...]] "
"<filesystem|snapshot>\n"));
case HELP_GROUPSPACE:
return (gettext("\tgroupspace [-Hinp] [-o field[,...]] "
"[-s field] ...\n"
"\t [-S field] ... [-t type[,...]] "
"<filesystem|snapshot>\n"));
case HELP_HOLD:
return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
case HELP_HOLDS:
return (gettext("\tholds [-Hp] [-r|-d depth] "
"<filesystem|volume|snapshot> ...\n"));
case HELP_RELEASE:
return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
case HELP_DIFF:
return (gettext("\tdiff [-FHt] <snapshot> "
"[snapshot|filesystem]\n"));
+ case HELP_REMAP:
+ return (gettext("\tremap <filesystem | volume>\n"));
case HELP_BOOKMARK:
return (gettext("\tbookmark <snapshot> <bookmark>\n"));
case HELP_CHANNEL_PROGRAM:
return (gettext("\tprogram [-n] [-t <instruction limit>] "
"[-m <memory limit (b)>] <pool> <program file> "
"[lua args...]\n"));
}
abort();
/* NOTREACHED */
}
void
nomem(void)
{
(void) fprintf(stderr, gettext("internal error: out of memory\n"));
exit(1);
}
/*
* Utility function to guarantee malloc() success.
*/
void *
safe_malloc(size_t size)
{
void *data;
if ((data = calloc(1, size)) == NULL)
nomem();
return (data);
}
void *
safe_realloc(void *data, size_t size)
{
void *newp;
if ((newp = realloc(data, size)) == NULL) {
free(data);
nomem();
}
return (newp);
}
static char *
safe_strdup(char *str)
{
char *dupstr = strdup(str);
if (dupstr == NULL)
nomem();
return (dupstr);
}
/*
* Callback routine that will print out information for each of
* the properties.
*/
static int
usage_prop_cb(int prop, void *cb)
{
FILE *fp = cb;
(void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop));
if (zfs_prop_readonly(prop))
(void) fprintf(fp, " NO ");
else
(void) fprintf(fp, "YES ");
if (zfs_prop_inheritable(prop))
(void) fprintf(fp, " YES ");
else
(void) fprintf(fp, " NO ");
if (zfs_prop_values(prop) == NULL)
(void) fprintf(fp, "-\n");
else
(void) fprintf(fp, "%s\n", zfs_prop_values(prop));
return (ZPROP_CONT);
}
/*
* Display usage message. If we're inside a command, display only the usage for
* that command. Otherwise, iterate over the entire command table and display
* a complete usage message.
*/
static void
usage(boolean_t requested)
{
int i;
boolean_t show_properties = B_FALSE;
FILE *fp = requested ? stdout : stderr;
if (current_command == NULL) {
(void) fprintf(fp, gettext("usage: zfs command args ...\n"));
(void) fprintf(fp,
gettext("where 'command' is one of the following:\n\n"));
for (i = 0; i < NCOMMAND; i++) {
if (command_table[i].name == NULL)
(void) fprintf(fp, "\n");
else
(void) fprintf(fp, "%s",
get_usage(command_table[i].usage));
}
(void) fprintf(fp, gettext("\nEach dataset is of the form: "
"pool/[dataset/]*dataset[@name]\n"));
} else {
(void) fprintf(fp, gettext("usage:\n"));
(void) fprintf(fp, "%s", get_usage(current_command->usage));
}
if (current_command != NULL &&
(strcmp(current_command->name, "set") == 0 ||
strcmp(current_command->name, "get") == 0 ||
strcmp(current_command->name, "inherit") == 0 ||
strcmp(current_command->name, "list") == 0))
show_properties = B_TRUE;
if (show_properties) {
(void) fprintf(fp,
gettext("\nThe following properties are supported:\n"));
(void) fprintf(fp, "\n\t%-14s %s %s %s\n\n",
"PROPERTY", "EDIT", "INHERIT", "VALUES");
/* Iterate over all properties */
(void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE,
ZFS_TYPE_DATASET);
(void) fprintf(fp, "\t%-15s ", "userused@...");
(void) fprintf(fp, " NO NO <size>\n");
(void) fprintf(fp, "\t%-15s ", "groupused@...");
(void) fprintf(fp, " NO NO <size>\n");
(void) fprintf(fp, "\t%-15s ", "userquota@...");
(void) fprintf(fp, "YES NO <size> | none\n");
(void) fprintf(fp, "\t%-15s ", "groupquota@...");
(void) fprintf(fp, "YES NO <size> | none\n");
(void) fprintf(fp, "\t%-15s ", "written@<snap>");
(void) fprintf(fp, " NO NO <size>\n");
(void) fprintf(fp, gettext("\nSizes are specified in bytes "
"with standard units such as K, M, G, etc.\n"));
(void) fprintf(fp, gettext("\nUser-defined properties can "
"be specified by using a name containing a colon (:).\n"));
(void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ "
"properties must be appended with\n"
"a user or group specifier of one of these forms:\n"
" POSIX name (eg: \"matt\")\n"
" POSIX id (eg: \"126829\")\n"
" SMB name@domain (eg: \"matt@sun\")\n"
" SMB SID (eg: \"S-1-234-567-89\")\n"));
} else {
(void) fprintf(fp,
gettext("\nFor the property list, run: %s\n"),
"zfs set|get");
(void) fprintf(fp,
gettext("\nFor the delegated permission list, run: %s\n"),
"zfs allow|unallow");
}
/*
* See comments at end of main().
*/
if (getenv("ZFS_ABORT") != NULL) {
(void) printf("dumping core by request\n");
abort();
}
exit(requested ? 0 : 2);
}
/*
* Take a property=value argument string and add it to the given nvlist.
* Modifies the argument inplace.
*/
static int
parseprop(nvlist_t *props, char *propname)
{
char *propval, *strval;
if ((propval = strchr(propname, '=')) == NULL) {
(void) fprintf(stderr, gettext("missing "
"'=' for property=value argument\n"));
return (-1);
}
*propval = '\0';
propval++;
if (nvlist_lookup_string(props, propname, &strval) == 0) {
(void) fprintf(stderr, gettext("property '%s' "
"specified multiple times\n"), propname);
return (-1);
}
if (nvlist_add_string(props, propname, propval) != 0)
nomem();
return (0);
}
static int
parse_depth(char *opt, int *flags)
{
char *tmp;
int depth;
depth = (int)strtol(opt, &tmp, 0);
if (*tmp) {
(void) fprintf(stderr,
gettext("%s is not an integer\n"), opt);
usage(B_FALSE);
}
if (depth < 0) {
(void) fprintf(stderr,
gettext("Depth can not be negative.\n"));
usage(B_FALSE);
}
*flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE);
return (depth);
}
#define PROGRESS_DELAY 2 /* seconds */
static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
static time_t pt_begin;
static char *pt_header = NULL;
static boolean_t pt_shown;
static void
start_progress_timer(void)
{
pt_begin = time(NULL) + PROGRESS_DELAY;
pt_shown = B_FALSE;
}
static void
set_progress_header(char *header)
{
assert(pt_header == NULL);
pt_header = safe_strdup(header);
if (pt_shown) {
(void) printf("%s: ", header);
(void) fflush(stdout);
}
}
static void
update_progress(char *update)
{
if (!pt_shown && time(NULL) > pt_begin) {
int len = strlen(update);
(void) printf("%s: %s%*.*s", pt_header, update, len, len,
pt_reverse);
(void) fflush(stdout);
pt_shown = B_TRUE;
} else if (pt_shown) {
int len = strlen(update);
(void) printf("%s%*.*s", update, len, len, pt_reverse);
(void) fflush(stdout);
}
}
static void
finish_progress(char *done)
{
if (pt_shown) {
(void) printf("%s\n", done);
(void) fflush(stdout);
}
free(pt_header);
pt_header = NULL;
}
/*
* Check if the dataset is mountable and should be automatically mounted.
*/
static boolean_t
should_auto_mount(zfs_handle_t *zhp)
{
if (!zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, zfs_get_type(zhp)))
return (B_FALSE);
return (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON);
}
/*
* zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
*
* Given an existing dataset, create a writable copy whose initial contents
* are the same as the source. The newly created dataset maintains a
* dependency on the original; the original cannot be destroyed so long as
* the clone exists.
*
* The '-p' flag creates all the non-existing ancestors of the target first.
*/
static int
zfs_do_clone(int argc, char **argv)
{
zfs_handle_t *zhp = NULL;
boolean_t parents = B_FALSE;
nvlist_t *props;
int ret = 0;
int c;
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
/* check options */
while ((c = getopt(argc, argv, "o:p")) != -1) {
switch (c) {
case 'o':
if (parseprop(props, optarg) != 0)
return (1);
break;
case 'p':
parents = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
goto usage;
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing source dataset "
"argument\n"));
goto usage;
}
if (argc < 2) {
(void) fprintf(stderr, gettext("missing target dataset "
"argument\n"));
goto usage;
}
if (argc > 2) {
(void) fprintf(stderr, gettext("too many arguments\n"));
goto usage;
}
/* open the source dataset */
if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
return (1);
if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_VOLUME)) {
/*
* Now create the ancestors of the target dataset. If the
* target already exists and '-p' option was used we should not
* complain.
*/
if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_VOLUME))
return (0);
if (zfs_create_ancestors(g_zfs, argv[1]) != 0)
return (1);
}
/* pass to libzfs */
ret = zfs_clone(zhp, argv[1], props);
/* create the mountpoint if necessary */
if (ret == 0) {
zfs_handle_t *clone;
clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET);
if (clone != NULL) {
/*
* If the user doesn't want the dataset
* automatically mounted, then skip the mount/share
* step.
*/
if (should_auto_mount(clone)) {
if ((ret = zfs_mount(clone, NULL, 0)) != 0) {
(void) fprintf(stderr, gettext("clone "
"successfully created, "
"but not mounted\n"));
} else if ((ret = zfs_share(clone)) != 0) {
(void) fprintf(stderr, gettext("clone "
"successfully created, "
"but not shared\n"));
}
}
zfs_close(clone);
}
}
zfs_close(zhp);
nvlist_free(props);
return (!!ret);
usage:
if (zhp)
zfs_close(zhp);
nvlist_free(props);
usage(B_FALSE);
return (-1);
}
/*
* zfs create [-pu] [-o prop=value] ... fs
* zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size
*
* Create a new dataset. This command can be used to create filesystems
* and volumes. Snapshot creation is handled by 'zfs snapshot'.
* For volumes, the user must specify a size to be used.
*
* The '-s' flag applies only to volumes, and indicates that we should not try
* to set the reservation for this volume. By default we set a reservation
* equal to the size for any volume. For pools with SPA_VERSION >=
* SPA_VERSION_REFRESERVATION, we set a refreservation instead.
*
* The '-p' flag creates all the non-existing ancestors of the target first.
*
* The '-u' flag prevents mounting of newly created file system.
*/
static int
zfs_do_create(int argc, char **argv)
{
zfs_type_t type = ZFS_TYPE_FILESYSTEM;
zfs_handle_t *zhp = NULL;
uint64_t volsize = 0;
int c;
boolean_t noreserve = B_FALSE;
boolean_t bflag = B_FALSE;
boolean_t parents = B_FALSE;
boolean_t nomount = B_FALSE;
int ret = 1;
nvlist_t *props;
uint64_t intval;
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
/* check options */
while ((c = getopt(argc, argv, ":V:b:so:pu")) != -1) {
switch (c) {
case 'V':
type = ZFS_TYPE_VOLUME;
if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
(void) fprintf(stderr, gettext("bad volume "
"size '%s': %s\n"), optarg,
libzfs_error_description(g_zfs));
goto error;
}
if (nvlist_add_uint64(props,
zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
nomem();
volsize = intval;
break;
case 'p':
parents = B_TRUE;
break;
case 'b':
bflag = B_TRUE;
if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
(void) fprintf(stderr, gettext("bad volume "
"block size '%s': %s\n"), optarg,
libzfs_error_description(g_zfs));
goto error;
}
if (nvlist_add_uint64(props,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
intval) != 0)
nomem();
break;
case 'o':
if (parseprop(props, optarg) != 0)
goto error;
break;
case 's':
noreserve = B_TRUE;
break;
case 'u':
nomount = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing size "
"argument\n"));
goto badusage;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
goto badusage;
}
}
if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) {
(void) fprintf(stderr, gettext("'-s' and '-b' can only be "
"used when creating a volume\n"));
goto badusage;
}
if (nomount && type != ZFS_TYPE_FILESYSTEM) {
(void) fprintf(stderr, gettext("'-u' can only be "
"used when creating a file system\n"));
goto badusage;
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc == 0) {
(void) fprintf(stderr, gettext("missing %s argument\n"),
zfs_type_to_name(type));
goto badusage;
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
goto badusage;
}
if (type == ZFS_TYPE_VOLUME && !noreserve) {
zpool_handle_t *zpool_handle;
nvlist_t *real_props = NULL;
uint64_t spa_version;
char *p;
zfs_prop_t resv_prop;
char *strval;
char msg[1024];
if ((p = strchr(argv[0], '/')) != NULL)
*p = '\0';
zpool_handle = zpool_open(g_zfs, argv[0]);
if (p != NULL)
*p = '/';
if (zpool_handle == NULL)
goto error;
spa_version = zpool_get_prop_int(zpool_handle,
ZPOOL_PROP_VERSION, NULL);
if (spa_version >= SPA_VERSION_REFRESERVATION)
resv_prop = ZFS_PROP_REFRESERVATION;
else
resv_prop = ZFS_PROP_RESERVATION;
(void) snprintf(msg, sizeof (msg),
gettext("cannot create '%s'"), argv[0]);
if (props && (real_props = zfs_valid_proplist(g_zfs, type,
props, 0, NULL, zpool_handle, msg)) == NULL) {
zpool_close(zpool_handle);
goto error;
}
zpool_close(zpool_handle);
volsize = zvol_volsize_to_reservation(volsize, real_props);
nvlist_free(real_props);
if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
&strval) != 0) {
if (nvlist_add_uint64(props,
zfs_prop_to_name(resv_prop), volsize) != 0) {
nvlist_free(props);
nomem();
}
}
}
if (parents && zfs_name_valid(argv[0], type)) {
/*
* Now create the ancestors of target dataset. If the target
* already exists and '-p' option was used we should not
* complain.
*/
if (zfs_dataset_exists(g_zfs, argv[0], type)) {
ret = 0;
goto error;
}
if (zfs_create_ancestors(g_zfs, argv[0]) != 0)
goto error;
}
/* pass to libzfs */
if (zfs_create(g_zfs, argv[0], type, props) != 0)
goto error;
if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
goto error;
ret = 0;
/*
* Mount and/or share the new filesystem as appropriate. We provide a
* verbose error message to let the user know that their filesystem was
* in fact created, even if we failed to mount or share it.
* If the user doesn't want the dataset automatically mounted,
* then skip the mount/share step altogether.
*/
if (!nomount && should_auto_mount(zhp)) {
if (zfs_mount(zhp, NULL, 0) != 0) {
(void) fprintf(stderr, gettext("filesystem "
"successfully created, but not mounted\n"));
ret = 1;
} else if (zfs_share(zhp) != 0) {
(void) fprintf(stderr, gettext("filesystem "
"successfully created, but not shared\n"));
ret = 1;
}
}
error:
if (zhp)
zfs_close(zhp);
nvlist_free(props);
return (ret);
badusage:
nvlist_free(props);
usage(B_FALSE);
return (2);
}
/*
* zfs destroy [-rRf] <fs, vol>
* zfs destroy [-rRd] <snap>
*
* -r Recursively destroy all children
* -R Recursively destroy all dependents, including clones
* -f Force unmounting of any dependents
* -d If we can't destroy now, mark for deferred destruction
*
* Destroys the given dataset. By default, it will unmount any filesystems,
* and refuse to destroy a dataset that has any dependents. A dependent can
* either be a child, or a clone of a child.
*/
typedef struct destroy_cbdata {
boolean_t cb_first;
boolean_t cb_force;
boolean_t cb_recurse;
boolean_t cb_error;
boolean_t cb_doclones;
zfs_handle_t *cb_target;
boolean_t cb_defer_destroy;
boolean_t cb_verbose;
boolean_t cb_parsable;
boolean_t cb_dryrun;
nvlist_t *cb_nvl;
nvlist_t *cb_batchedsnaps;
/* first snap in contiguous run */
char *cb_firstsnap;
/* previous snap in contiguous run */
char *cb_prevsnap;
int64_t cb_snapused;
char *cb_snapspec;
char *cb_bookmark;
} destroy_cbdata_t;
/*
* Check for any dependents based on the '-r' or '-R' flags.
*/
static int
destroy_check_dependent(zfs_handle_t *zhp, void *data)
{
destroy_cbdata_t *cbp = data;
const char *tname = zfs_get_name(cbp->cb_target);
const char *name = zfs_get_name(zhp);
if (strncmp(tname, name, strlen(tname)) == 0 &&
(name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) {
/*
* This is a direct descendant, not a clone somewhere else in
* the hierarchy.
*/
if (cbp->cb_recurse)
goto out;
if (cbp->cb_first) {
(void) fprintf(stderr, gettext("cannot destroy '%s': "
"%s has children\n"),
zfs_get_name(cbp->cb_target),
zfs_type_to_name(zfs_get_type(cbp->cb_target)));
(void) fprintf(stderr, gettext("use '-r' to destroy "
"the following datasets:\n"));
cbp->cb_first = B_FALSE;
cbp->cb_error = B_TRUE;
}
(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
} else {
/*
* This is a clone. We only want to report this if the '-r'
* wasn't specified, or the target is a snapshot.
*/
if (!cbp->cb_recurse &&
zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT)
goto out;
if (cbp->cb_first) {
(void) fprintf(stderr, gettext("cannot destroy '%s': "
"%s has dependent clones\n"),
zfs_get_name(cbp->cb_target),
zfs_type_to_name(zfs_get_type(cbp->cb_target)));
(void) fprintf(stderr, gettext("use '-R' to destroy "
"the following datasets:\n"));
cbp->cb_first = B_FALSE;
cbp->cb_error = B_TRUE;
cbp->cb_dryrun = B_TRUE;
}
(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
}
out:
zfs_close(zhp);
return (0);
}
static int
destroy_callback(zfs_handle_t *zhp, void *data)
{
destroy_cbdata_t *cb = data;
const char *name = zfs_get_name(zhp);
if (cb->cb_verbose) {
if (cb->cb_parsable) {
(void) printf("destroy\t%s\n", name);
} else if (cb->cb_dryrun) {
(void) printf(gettext("would destroy %s\n"),
name);
} else {
(void) printf(gettext("will destroy %s\n"),
name);
}
}
/*
* Ignore pools (which we've already flagged as an error before getting
* here).
*/
if (strchr(zfs_get_name(zhp), '/') == NULL &&
zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
zfs_close(zhp);
return (0);
}
if (cb->cb_dryrun) {
zfs_close(zhp);
return (0);
}
/*
* We batch up all contiguous snapshots (even of different
* filesystems) and destroy them with one ioctl. We can't
* simply do all snap deletions and then all fs deletions,
* because we must delete a clone before its origin.
*/
if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
fnvlist_add_boolean(cb->cb_batchedsnaps, name);
} else {
int error = zfs_destroy_snaps_nvl(g_zfs,
cb->cb_batchedsnaps, B_FALSE);
fnvlist_free(cb->cb_batchedsnaps);
cb->cb_batchedsnaps = fnvlist_alloc();
if (error != 0 ||
zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
zfs_destroy(zhp, cb->cb_defer_destroy) != 0) {
zfs_close(zhp);
return (-1);
}
}
zfs_close(zhp);
return (0);
}
static int
destroy_print_cb(zfs_handle_t *zhp, void *arg)
{
destroy_cbdata_t *cb = arg;
const char *name = zfs_get_name(zhp);
int err = 0;
if (nvlist_exists(cb->cb_nvl, name)) {
if (cb->cb_firstsnap == NULL)
cb->cb_firstsnap = strdup(name);
if (cb->cb_prevsnap != NULL)
free(cb->cb_prevsnap);
/* this snap continues the current range */
cb->cb_prevsnap = strdup(name);
if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL)
nomem();
if (cb->cb_verbose) {
if (cb->cb_parsable) {
(void) printf("destroy\t%s\n", name);
} else if (cb->cb_dryrun) {
(void) printf(gettext("would destroy %s\n"),
name);
} else {
(void) printf(gettext("will destroy %s\n"),
name);
}
}
} else if (cb->cb_firstsnap != NULL) {
/* end of this range */
uint64_t used = 0;
err = lzc_snaprange_space(cb->cb_firstsnap,
cb->cb_prevsnap, &used);
cb->cb_snapused += used;
free(cb->cb_firstsnap);
cb->cb_firstsnap = NULL;
free(cb->cb_prevsnap);
cb->cb_prevsnap = NULL;
}
zfs_close(zhp);
return (err);
}
static int
destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb)
{
int err = 0;
assert(cb->cb_firstsnap == NULL);
assert(cb->cb_prevsnap == NULL);
err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb);
if (cb->cb_firstsnap != NULL) {
uint64_t used = 0;
if (err == 0) {
err = lzc_snaprange_space(cb->cb_firstsnap,
cb->cb_prevsnap, &used);
}
cb->cb_snapused += used;
free(cb->cb_firstsnap);
cb->cb_firstsnap = NULL;
free(cb->cb_prevsnap);
cb->cb_prevsnap = NULL;
}
return (err);
}
static int
snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg)
{
destroy_cbdata_t *cb = arg;
int err = 0;
/* Check for clones. */
if (!cb->cb_doclones && !cb->cb_defer_destroy) {
cb->cb_target = zhp;
cb->cb_first = B_TRUE;
err = zfs_iter_dependents(zhp, B_TRUE,
destroy_check_dependent, cb);
}
if (err == 0) {
if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp)))
nomem();
}
zfs_close(zhp);
return (err);
}
static int
gather_snapshots(zfs_handle_t *zhp, void *arg)
{
destroy_cbdata_t *cb = arg;
int err = 0;
err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb);
if (err == ENOENT)
err = 0;
if (err != 0)
goto out;
if (cb->cb_verbose) {
err = destroy_print_snapshots(zhp, cb);
if (err != 0)
goto out;
}
if (cb->cb_recurse)
err = zfs_iter_filesystems(zhp, gather_snapshots, cb);
out:
zfs_close(zhp);
return (err);
}
static int
destroy_clones(destroy_cbdata_t *cb)
{
nvpair_t *pair;
for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL);
pair != NULL;
pair = nvlist_next_nvpair(cb->cb_nvl, pair)) {
zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair),
ZFS_TYPE_SNAPSHOT);
if (zhp != NULL) {
boolean_t defer = cb->cb_defer_destroy;
int err = 0;
/*
* We can't defer destroy non-snapshots, so set it to
* false while destroying the clones.
*/
cb->cb_defer_destroy = B_FALSE;
err = zfs_iter_dependents(zhp, B_FALSE,
destroy_callback, cb);
cb->cb_defer_destroy = defer;
zfs_close(zhp);
if (err != 0)
return (err);
}
}
return (0);
}
static int
zfs_do_destroy(int argc, char **argv)
{
destroy_cbdata_t cb = { 0 };
int rv = 0;
int err = 0;
int c;
zfs_handle_t *zhp = NULL;
char *at, *pound;
zfs_type_t type = ZFS_TYPE_DATASET;
/* check options */
while ((c = getopt(argc, argv, "vpndfrR")) != -1) {
switch (c) {
case 'v':
cb.cb_verbose = B_TRUE;
break;
case 'p':
cb.cb_verbose = B_TRUE;
cb.cb_parsable = B_TRUE;
break;
case 'n':
cb.cb_dryrun = B_TRUE;
break;
case 'd':
cb.cb_defer_destroy = B_TRUE;
type = ZFS_TYPE_SNAPSHOT;
break;
case 'f':
cb.cb_force = B_TRUE;
break;
case 'r':
cb.cb_recurse = B_TRUE;
break;
case 'R':
cb.cb_recurse = B_TRUE;
cb.cb_doclones = B_TRUE;
break;
case '?':
default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc == 0) {
(void) fprintf(stderr, gettext("missing dataset argument\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
at = strchr(argv[0], '@');
pound = strchr(argv[0], '#');
if (at != NULL) {
/* Build the list of snaps to destroy in cb_nvl. */
cb.cb_nvl = fnvlist_alloc();
*at = '\0';
zhp = zfs_open(g_zfs, argv[0],
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL)
return (1);
cb.cb_snapspec = at + 1;
if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
cb.cb_error) {
rv = 1;
goto out;
}
if (nvlist_empty(cb.cb_nvl)) {
(void) fprintf(stderr, gettext("could not find any "
"snapshots to destroy; check snapshot names.\n"));
rv = 1;
goto out;
}
if (cb.cb_verbose) {
char buf[16];
zfs_nicenum(cb.cb_snapused, buf, sizeof (buf));
if (cb.cb_parsable) {
(void) printf("reclaim\t%llu\n",
cb.cb_snapused);
} else if (cb.cb_dryrun) {
(void) printf(gettext("would reclaim %s\n"),
buf);
} else {
(void) printf(gettext("will reclaim %s\n"),
buf);
}
}
if (!cb.cb_dryrun) {
if (cb.cb_doclones) {
cb.cb_batchedsnaps = fnvlist_alloc();
err = destroy_clones(&cb);
if (err == 0) {
err = zfs_destroy_snaps_nvl(g_zfs,
cb.cb_batchedsnaps, B_FALSE);
}
if (err != 0) {
rv = 1;
goto out;
}
}
if (err == 0) {
err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
cb.cb_defer_destroy);
}
}
if (err != 0)
rv = 1;
} else if (pound != NULL) {
int err;
nvlist_t *nvl;
if (cb.cb_dryrun) {
(void) fprintf(stderr,
"dryrun is not supported with bookmark\n");
return (-1);
}
if (cb.cb_defer_destroy) {
(void) fprintf(stderr,
"defer destroy is not supported with bookmark\n");
return (-1);
}
if (cb.cb_recurse) {
(void) fprintf(stderr,
"recursive is not supported with bookmark\n");
return (-1);
}
if (!zfs_bookmark_exists(argv[0])) {
(void) fprintf(stderr, gettext("bookmark '%s' "
"does not exist.\n"), argv[0]);
return (1);
}
nvl = fnvlist_alloc();
fnvlist_add_boolean(nvl, argv[0]);
err = lzc_destroy_bookmarks(nvl, NULL);
if (err != 0) {
(void) zfs_standard_error(g_zfs, err,
"cannot destroy bookmark");
}
nvlist_free(cb.cb_nvl);
return (err);
} else {
/* Open the given dataset */
if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
return (1);
cb.cb_target = zhp;
/*
* Perform an explicit check for pools before going any further.
*/
if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
(void) fprintf(stderr, gettext("cannot destroy '%s': "
"operation does not apply to pools\n"),
zfs_get_name(zhp));
(void) fprintf(stderr, gettext("use 'zfs destroy -r "
"%s' to destroy all datasets in the pool\n"),
zfs_get_name(zhp));
(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
"to destroy the pool itself\n"), zfs_get_name(zhp));
rv = 1;
goto out;
}
/*
* Check for any dependents and/or clones.
*/
cb.cb_first = B_TRUE;
if (!cb.cb_doclones &&
zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
&cb) != 0) {
rv = 1;
goto out;
}
if (cb.cb_error) {
rv = 1;
goto out;
}
cb.cb_batchedsnaps = fnvlist_alloc();
if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
&cb) != 0) {
rv = 1;
goto out;
}
/*
* Do the real thing. The callback will close the
* handle regardless of whether it succeeds or not.
*/
err = destroy_callback(zhp, &cb);
zhp = NULL;
if (err == 0) {
err = zfs_destroy_snaps_nvl(g_zfs,
cb.cb_batchedsnaps, cb.cb_defer_destroy);
}
if (err != 0)
rv = 1;
}
out:
fnvlist_free(cb.cb_batchedsnaps);
fnvlist_free(cb.cb_nvl);
if (zhp != NULL)
zfs_close(zhp);
return (rv);
}
static boolean_t
is_recvd_column(zprop_get_cbdata_t *cbp)
{
int i;
zfs_get_column_t col;
for (i = 0; i < ZFS_GET_NCOLS &&
(col = cbp->cb_columns[i]) != GET_COL_NONE; i++)
if (col == GET_COL_RECVD)
return (B_TRUE);
return (B_FALSE);
}
/*
* zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...]
* < all | property[,property]... > < fs | snap | vol > ...
*
* -r recurse over any child datasets
* -H scripted mode. Headers are stripped, and fields are separated
* by tabs instead of spaces.
* -o Set of fields to display. One of "name,property,value,
* received,source". Default is "name,property,value,source".
* "all" is an alias for all five.
* -s Set of sources to allow. One of
* "local,default,inherited,received,temporary,none". Default is
* all six.
* -p Display values in parsable (literal) format.
*
* Prints properties for the given datasets. The user can control which
* columns to display as well as which property types to allow.
*/
/*
* Invoked to display the properties for a single dataset.
*/
static int
get_callback(zfs_handle_t *zhp, void *data)
{
char buf[ZFS_MAXPROPLEN];
char rbuf[ZFS_MAXPROPLEN];
zprop_source_t sourcetype;
char source[ZFS_MAX_DATASET_NAME_LEN];
zprop_get_cbdata_t *cbp = data;
nvlist_t *user_props = zfs_get_user_props(zhp);
zprop_list_t *pl = cbp->cb_proplist;
nvlist_t *propval;
char *strval;
char *sourceval;
boolean_t received = is_recvd_column(cbp);
for (; pl != NULL; pl = pl->pl_next) {
char *recvdval = NULL;
/*
* Skip the special fake placeholder. This will also skip over
* the name property when 'all' is specified.
*/
if (pl->pl_prop == ZFS_PROP_NAME &&
pl == cbp->cb_proplist)
continue;
if (pl->pl_prop != ZPROP_INVAL) {
if (zfs_prop_get(zhp, pl->pl_prop, buf,
sizeof (buf), &sourcetype, source,
sizeof (source),
cbp->cb_literal) != 0) {
if (pl->pl_all)
continue;
if (!zfs_prop_valid_for_type(pl->pl_prop,
ZFS_TYPE_DATASET)) {
(void) fprintf(stderr,
gettext("No such property '%s'\n"),
zfs_prop_to_name(pl->pl_prop));
continue;
}
sourcetype = ZPROP_SRC_NONE;
(void) strlcpy(buf, "-", sizeof (buf));
}
if (received && (zfs_prop_get_recvd(zhp,
zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf),
cbp->cb_literal) == 0))
recvdval = rbuf;
zprop_print_one_property(zfs_get_name(zhp), cbp,
zfs_prop_to_name(pl->pl_prop),
buf, sourcetype, source, recvdval);
} else if (zfs_prop_userquota(pl->pl_user_prop)) {
sourcetype = ZPROP_SRC_LOCAL;
if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
buf, sizeof (buf), cbp->cb_literal) != 0) {
sourcetype = ZPROP_SRC_NONE;
(void) strlcpy(buf, "-", sizeof (buf));
}
zprop_print_one_property(zfs_get_name(zhp), cbp,
pl->pl_user_prop, buf, sourcetype, source, NULL);
} else if (zfs_prop_written(pl->pl_user_prop)) {
sourcetype = ZPROP_SRC_LOCAL;
if (zfs_prop_get_written(zhp, pl->pl_user_prop,
buf, sizeof (buf), cbp->cb_literal) != 0) {
sourcetype = ZPROP_SRC_NONE;
(void) strlcpy(buf, "-", sizeof (buf));
}
zprop_print_one_property(zfs_get_name(zhp), cbp,
pl->pl_user_prop, buf, sourcetype, source, NULL);
} else {
if (nvlist_lookup_nvlist(user_props,
pl->pl_user_prop, &propval) != 0) {
if (pl->pl_all)
continue;
sourcetype = ZPROP_SRC_NONE;
strval = "-";
} else {
verify(nvlist_lookup_string(propval,
ZPROP_VALUE, &strval) == 0);
verify(nvlist_lookup_string(propval,
ZPROP_SOURCE, &sourceval) == 0);
if (strcmp(sourceval,
zfs_get_name(zhp)) == 0) {
sourcetype = ZPROP_SRC_LOCAL;
} else if (strcmp(sourceval,
ZPROP_SOURCE_VAL_RECVD) == 0) {
sourcetype = ZPROP_SRC_RECEIVED;
} else {
sourcetype = ZPROP_SRC_INHERITED;
(void) strlcpy(source,
sourceval, sizeof (source));
}
}
if (received && (zfs_prop_get_recvd(zhp,
pl->pl_user_prop, rbuf, sizeof (rbuf),
cbp->cb_literal) == 0))
recvdval = rbuf;
zprop_print_one_property(zfs_get_name(zhp), cbp,
pl->pl_user_prop, strval, sourcetype,
source, recvdval);
}
}
return (0);
}
static int
zfs_do_get(int argc, char **argv)
{
zprop_get_cbdata_t cb = { 0 };
int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK;
char *value, *fields;
int ret = 0;
int limit = 0;
zprop_list_t fake_name = { 0 };
/*
* Set up default columns and sources.
*/
cb.cb_sources = ZPROP_SRC_ALL;
cb.cb_columns[0] = GET_COL_NAME;
cb.cb_columns[1] = GET_COL_PROPERTY;
cb.cb_columns[2] = GET_COL_VALUE;
cb.cb_columns[3] = GET_COL_SOURCE;
cb.cb_type = ZFS_TYPE_DATASET;
/* check options */
while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) {
switch (c) {
case 'p':
cb.cb_literal = B_TRUE;
break;
case 'd':
limit = parse_depth(optarg, &flags);
break;
case 'r':
flags |= ZFS_ITER_RECURSE;
break;
case 'H':
cb.cb_scripted = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case 'o':
/*
* Process the set of columns to display. We zero out
* the structure to give us a blank slate.
*/
bzero(&cb.cb_columns, sizeof (cb.cb_columns));
i = 0;
while (*optarg != '\0') {
static char *col_subopts[] =
{ "name", "property", "value", "received",
"source", "all", NULL };
if (i == ZFS_GET_NCOLS) {
(void) fprintf(stderr, gettext("too "
"many fields given to -o "
"option\n"));
usage(B_FALSE);
}
switch (getsubopt(&optarg, col_subopts,
&value)) {
case 0:
cb.cb_columns[i++] = GET_COL_NAME;
break;
case 1:
cb.cb_columns[i++] = GET_COL_PROPERTY;
break;
case 2:
cb.cb_columns[i++] = GET_COL_VALUE;
break;
case 3:
cb.cb_columns[i++] = GET_COL_RECVD;
flags |= ZFS_ITER_RECVD_PROPS;
break;
case 4:
cb.cb_columns[i++] = GET_COL_SOURCE;
break;
case 5:
if (i > 0) {
(void) fprintf(stderr,
gettext("\"all\" conflicts "
"with specific fields "
"given to -o option\n"));
usage(B_FALSE);
}
cb.cb_columns[0] = GET_COL_NAME;
cb.cb_columns[1] = GET_COL_PROPERTY;
cb.cb_columns[2] = GET_COL_VALUE;
cb.cb_columns[3] = GET_COL_RECVD;
cb.cb_columns[4] = GET_COL_SOURCE;
flags |= ZFS_ITER_RECVD_PROPS;
i = ZFS_GET_NCOLS;
break;
default:
(void) fprintf(stderr,
gettext("invalid column name "
"'%s'\n"), suboptarg);
usage(B_FALSE);
}
}
break;
case 's':
cb.cb_sources = 0;
while (*optarg != '\0') {
static char *source_subopts[] = {
"local", "default", "inherited",
"received", "temporary", "none",
NULL };
switch (getsubopt(&optarg, source_subopts,
&value)) {
case 0:
cb.cb_sources |= ZPROP_SRC_LOCAL;
break;
case 1:
cb.cb_sources |= ZPROP_SRC_DEFAULT;
break;
case 2:
cb.cb_sources |= ZPROP_SRC_INHERITED;
break;
case 3:
cb.cb_sources |= ZPROP_SRC_RECEIVED;
break;
case 4:
cb.cb_sources |= ZPROP_SRC_TEMPORARY;
break;
case 5:
cb.cb_sources |= ZPROP_SRC_NONE;
break;
default:
(void) fprintf(stderr,
gettext("invalid source "
"'%s'\n"), suboptarg);
usage(B_FALSE);
}
}
break;
case 't':
types = 0;
flags &= ~ZFS_ITER_PROP_LISTSNAPS;
while (*optarg != '\0') {
static char *type_subopts[] = { "filesystem",
"volume", "snapshot", "bookmark",
"all", NULL };
switch (getsubopt(&optarg, type_subopts,
&value)) {
case 0:
types |= ZFS_TYPE_FILESYSTEM;
break;
case 1:
types |= ZFS_TYPE_VOLUME;
break;
case 2:
types |= ZFS_TYPE_SNAPSHOT;
break;
case 3:
types |= ZFS_TYPE_BOOKMARK;
break;
case 4:
types = ZFS_TYPE_DATASET |
ZFS_TYPE_BOOKMARK;
break;
default:
(void) fprintf(stderr,
gettext("invalid type '%s'\n"),
suboptarg);
usage(B_FALSE);
}
}
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (argc < 1) {
(void) fprintf(stderr, gettext("missing property "
"argument\n"));
usage(B_FALSE);
}
fields = argv[0];
if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
!= 0)
usage(B_FALSE);
argc--;
argv++;
/*
* As part of zfs_expand_proplist(), we keep track of the maximum column
* width for each property. For the 'NAME' (and 'SOURCE') columns, we
* need to know the maximum name length. However, the user likely did
* not specify 'name' as one of the properties to fetch, so we need to
* make sure we always include at least this property for
* print_get_headers() to work properly.
*/
if (cb.cb_proplist != NULL) {
fake_name.pl_prop = ZFS_PROP_NAME;
fake_name.pl_width = strlen(gettext("NAME"));
fake_name.pl_next = cb.cb_proplist;
cb.cb_proplist = &fake_name;
}
cb.cb_first = B_TRUE;
/* run for each object */
ret = zfs_for_each(argc, argv, flags, types, NULL,
&cb.cb_proplist, limit, get_callback, &cb);
if (cb.cb_proplist == &fake_name)
zprop_free_list(fake_name.pl_next);
else
zprop_free_list(cb.cb_proplist);
return (ret);
}
/*
* inherit [-rS] <property> <fs|vol> ...
*
* -r Recurse over all children
* -S Revert to received value, if any
*
* For each dataset specified on the command line, inherit the given property
* from its parent. Inheriting a property at the pool level will cause it to
* use the default value. The '-r' flag will recurse over all children, and is
* useful for setting a property on a hierarchy-wide basis, regardless of any
* local modifications for each dataset.
*/
typedef struct inherit_cbdata {
const char *cb_propname;
boolean_t cb_received;
} inherit_cbdata_t;
static int
inherit_recurse_cb(zfs_handle_t *zhp, void *data)
{
inherit_cbdata_t *cb = data;
zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);
/*
* If we're doing it recursively, then ignore properties that
* are not valid for this type of dataset.
*/
if (prop != ZPROP_INVAL &&
!zfs_prop_valid_for_type(prop, zfs_get_type(zhp)))
return (0);
return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
}
static int
inherit_cb(zfs_handle_t *zhp, void *data)
{
inherit_cbdata_t *cb = data;
return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
}
static int
zfs_do_inherit(int argc, char **argv)
{
int c;
zfs_prop_t prop;
inherit_cbdata_t cb = { 0 };
char *propname;
int ret = 0;
int flags = 0;
boolean_t received = B_FALSE;
/* check options */
while ((c = getopt(argc, argv, "rS")) != -1) {
switch (c) {
case 'r':
flags |= ZFS_ITER_RECURSE;
break;
case 'S':
received = B_TRUE;
break;
case '?':
default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing property argument\n"));
usage(B_FALSE);
}
if (argc < 2) {
(void) fprintf(stderr, gettext("missing dataset argument\n"));
usage(B_FALSE);
}
propname = argv[0];
argc--;
argv++;
if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
if (zfs_prop_readonly(prop)) {
(void) fprintf(stderr, gettext(
"%s property is read-only\n"),
propname);
return (1);
}
if (!zfs_prop_inheritable(prop) && !received) {
(void) fprintf(stderr, gettext("'%s' property cannot "
"be inherited\n"), propname);
if (prop == ZFS_PROP_QUOTA ||
prop == ZFS_PROP_RESERVATION ||
prop == ZFS_PROP_REFQUOTA ||
prop == ZFS_PROP_REFRESERVATION) {
(void) fprintf(stderr, gettext("use 'zfs set "
"%s=none' to clear\n"), propname);
(void) fprintf(stderr, gettext("use 'zfs "
"inherit -S %s' to revert to received "
"value\n"), propname);
}
return (1);
}
if (received && (prop == ZFS_PROP_VOLSIZE ||
prop == ZFS_PROP_VERSION)) {
(void) fprintf(stderr, gettext("'%s' property cannot "
"be reverted to a received value\n"), propname);
return (1);
}
} else if (!zfs_prop_user(propname)) {
(void) fprintf(stderr, gettext("invalid property '%s'\n"),
propname);
usage(B_FALSE);
}
cb.cb_propname = propname;
cb.cb_received = received;
if (flags & ZFS_ITER_RECURSE) {
ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
NULL, NULL, 0, inherit_recurse_cb, &cb);
} else {
ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
NULL, NULL, 0, inherit_cb, &cb);
}
return (ret);
}
typedef struct upgrade_cbdata {
uint64_t cb_numupgraded;
uint64_t cb_numsamegraded;
uint64_t cb_numfailed;
uint64_t cb_version;
boolean_t cb_newer;
boolean_t cb_foundone;
char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN];
} upgrade_cbdata_t;
static int
same_pool(zfs_handle_t *zhp, const char *name)
{
int len1 = strcspn(name, "/@");
const char *zhname = zfs_get_name(zhp);
int len2 = strcspn(zhname, "/@");
if (len1 != len2)
return (B_FALSE);
return (strncmp(name, zhname, len1) == 0);
}
static int
upgrade_list_callback(zfs_handle_t *zhp, void *data)
{
upgrade_cbdata_t *cb = data;
int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
/* list if it's old/new */
if ((!cb->cb_newer && version < ZPL_VERSION) ||
(cb->cb_newer && version > ZPL_VERSION)) {
char *str;
if (cb->cb_newer) {
str = gettext("The following filesystems are "
"formatted using a newer software version and\n"
"cannot be accessed on the current system.\n\n");
} else {
str = gettext("The following filesystems are "
"out of date, and can be upgraded. After being\n"
"upgraded, these filesystems (and any 'zfs send' "
"streams generated from\n"
"subsequent snapshots) will no longer be "
"accessible by older software versions.\n\n");
}
if (!cb->cb_foundone) {
(void) puts(str);
(void) printf(gettext("VER FILESYSTEM\n"));
(void) printf(gettext("--- ------------\n"));
cb->cb_foundone = B_TRUE;
}
(void) printf("%2u %s\n", version, zfs_get_name(zhp));
}
return (0);
}
static int
upgrade_set_callback(zfs_handle_t *zhp, void *data)
{
upgrade_cbdata_t *cb = data;
int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
int needed_spa_version;
int spa_version;
if (zfs_spa_version(zhp, &spa_version) < 0)
return (-1);
needed_spa_version = zfs_spa_version_map(cb->cb_version);
if (needed_spa_version < 0)
return (-1);
if (spa_version < needed_spa_version) {
/* can't upgrade */
(void) printf(gettext("%s: can not be "
"upgraded; the pool version needs to first "
"be upgraded\nto version %d\n\n"),
zfs_get_name(zhp), needed_spa_version);
cb->cb_numfailed++;
return (0);
}
/* upgrade */
if (version < cb->cb_version) {
char verstr[16];
(void) snprintf(verstr, sizeof (verstr),
"%llu", cb->cb_version);
if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) {
/*
* If they did "zfs upgrade -a", then we could
* be doing ioctls to different pools. We need
* to log this history once to each pool, and bypass
* the normal history logging that happens in main().
*/
(void) zpool_log_history(g_zfs, history_str);
log_history = B_FALSE;
}
if (zfs_prop_set(zhp, "version", verstr) == 0)
cb->cb_numupgraded++;
else
cb->cb_numfailed++;
(void) strcpy(cb->cb_lastfs, zfs_get_name(zhp));
} else if (version > cb->cb_version) {
/* can't downgrade */
(void) printf(gettext("%s: can not be downgraded; "
"it is already at version %u\n"),
zfs_get_name(zhp), version);
cb->cb_numfailed++;
} else {
cb->cb_numsamegraded++;
}
return (0);
}
/*
* zfs upgrade
* zfs upgrade -v
* zfs upgrade [-r] [-V <version>] <-a | filesystem>
*/
static int
zfs_do_upgrade(int argc, char **argv)
{
boolean_t all = B_FALSE;
boolean_t showversions = B_FALSE;
int ret = 0;
upgrade_cbdata_t cb = { 0 };
int c;
int flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
/* check options */
while ((c = getopt(argc, argv, "rvV:a")) != -1) {
switch (c) {
case 'r':
flags |= ZFS_ITER_RECURSE;
break;
case 'v':
showversions = B_TRUE;
break;
case 'V':
if (zfs_prop_string_to_index(ZFS_PROP_VERSION,
optarg, &cb.cb_version) != 0) {
(void) fprintf(stderr,
gettext("invalid version %s\n"), optarg);
usage(B_FALSE);
}
break;
case 'a':
all = B_TRUE;
break;
case '?':
default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version))
usage(B_FALSE);
if (showversions && (flags & ZFS_ITER_RECURSE || all ||
cb.cb_version || argc))
usage(B_FALSE);
if ((all || argc) && (showversions))
usage(B_FALSE);
if (all && argc)
usage(B_FALSE);
if (showversions) {
/* Show info on available versions. */
(void) printf(gettext("The following filesystem versions are "
"supported:\n\n"));
(void) printf(gettext("VER DESCRIPTION\n"));
(void) printf("--- -----------------------------------------"
"---------------\n");
(void) printf(gettext(" 1 Initial ZFS filesystem version\n"));
(void) printf(gettext(" 2 Enhanced directory entries\n"));
(void) printf(gettext(" 3 Case insensitive and filesystem "
"user identifier (FUID)\n"));
(void) printf(gettext(" 4 userquota, groupquota "
"properties\n"));
(void) printf(gettext(" 5 System attributes\n"));
(void) printf(gettext("\nFor more information on a particular "
"version, including supported releases,\n"));
(void) printf("see the ZFS Administration Guide.\n\n");
ret = 0;
} else if (argc || all) {
/* Upgrade filesystems */
if (cb.cb_version == 0)
cb.cb_version = ZPL_VERSION;
ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
NULL, NULL, 0, upgrade_set_callback, &cb);
(void) printf(gettext("%llu filesystems upgraded\n"),
cb.cb_numupgraded);
if (cb.cb_numsamegraded) {
(void) printf(gettext("%llu filesystems already at "
"this version\n"),
cb.cb_numsamegraded);
}
if (cb.cb_numfailed != 0)
ret = 1;
} else {
/* List old-version filesytems */
boolean_t found;
(void) printf(gettext("This system is currently running "
"ZFS filesystem version %llu.\n\n"), ZPL_VERSION);
flags |= ZFS_ITER_RECURSE;
ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
NULL, NULL, 0, upgrade_list_callback, &cb);
found = cb.cb_foundone;
cb.cb_foundone = B_FALSE;
cb.cb_newer = B_TRUE;
ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
NULL, NULL, 0, upgrade_list_callback, &cb);
if (!cb.cb_foundone && !found) {
(void) printf(gettext("All filesystems are "
"formatted with the current version.\n"));
}
}
return (ret);
}
/*
* zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
* [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
* zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
* [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
*
* -H Scripted mode; elide headers and separate columns by tabs.
* -i Translate SID to POSIX ID.
* -n Print numeric ID instead of user/group name.
* -o Control which fields to display.
* -p Use exact (parsable) numeric output.
* -s Specify sort columns, descending order.
* -S Specify sort columns, ascending order.
* -t Control which object types to display.
*
* Displays space consumed by, and quotas on, each user in the specified
* filesystem or snapshot.
*/
/* us_field_types, us_field_hdr and us_field_names should be kept in sync */
enum us_field_types {
USFIELD_TYPE,
USFIELD_NAME,
USFIELD_USED,
USFIELD_QUOTA
};
static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" };
static char *us_field_names[] = { "type", "name", "used", "quota" };
#define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *))
#define USTYPE_PSX_GRP (1 << 0)
#define USTYPE_PSX_USR (1 << 1)
#define USTYPE_SMB_GRP (1 << 2)
#define USTYPE_SMB_USR (1 << 3)
#define USTYPE_ALL \
(USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR)
static int us_type_bits[] = {
USTYPE_PSX_GRP,
USTYPE_PSX_USR,
USTYPE_SMB_GRP,
USTYPE_SMB_USR,
USTYPE_ALL
};
static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup",
"smbuser", "all" };
typedef struct us_node {
nvlist_t *usn_nvl;
uu_avl_node_t usn_avlnode;
uu_list_node_t usn_listnode;
} us_node_t;
typedef struct us_cbdata {
nvlist_t **cb_nvlp;
uu_avl_pool_t *cb_avl_pool;
uu_avl_t *cb_avl;
boolean_t cb_numname;
boolean_t cb_nicenum;
boolean_t cb_sid2posix;
zfs_userquota_prop_t cb_prop;
zfs_sort_column_t *cb_sortcol;
size_t cb_width[USFIELD_LAST];
} us_cbdata_t;
static boolean_t us_populated = B_FALSE;
typedef struct {
zfs_sort_column_t *si_sortcol;
boolean_t si_numname;
} us_sort_info_t;
static int
us_field_index(char *field)
{
int i;
for (i = 0; i < USFIELD_LAST; i++) {
if (strcmp(field, us_field_names[i]) == 0)
return (i);
}
return (-1);
}
static int
us_compare(const void *larg, const void *rarg, void *unused)
{
const us_node_t *l = larg;
const us_node_t *r = rarg;
us_sort_info_t *si = (us_sort_info_t *)unused;
zfs_sort_column_t *sortcol = si->si_sortcol;
boolean_t numname = si->si_numname;
nvlist_t *lnvl = l->usn_nvl;
nvlist_t *rnvl = r->usn_nvl;
int rc = 0;
boolean_t lvb, rvb;
for (; sortcol != NULL; sortcol = sortcol->sc_next) {
char *lvstr = "";
char *rvstr = "";
uint32_t lv32 = 0;
uint32_t rv32 = 0;
uint64_t lv64 = 0;
uint64_t rv64 = 0;
zfs_prop_t prop = sortcol->sc_prop;
const char *propname = NULL;
boolean_t reverse = sortcol->sc_reverse;
switch (prop) {
case ZFS_PROP_TYPE:
propname = "type";
(void) nvlist_lookup_uint32(lnvl, propname, &lv32);
(void) nvlist_lookup_uint32(rnvl, propname, &rv32);
if (rv32 != lv32)
rc = (rv32 < lv32) ? 1 : -1;
break;
case ZFS_PROP_NAME:
propname = "name";
if (numname) {
(void) nvlist_lookup_uint64(lnvl, propname,
&lv64);
(void) nvlist_lookup_uint64(rnvl, propname,
&rv64);
if (rv64 != lv64)
rc = (rv64 < lv64) ? 1 : -1;
} else {
(void) nvlist_lookup_string(lnvl, propname,
&lvstr);
(void) nvlist_lookup_string(rnvl, propname,
&rvstr);
rc = strcmp(lvstr, rvstr);
}
break;
case ZFS_PROP_USED:
case ZFS_PROP_QUOTA:
if (!us_populated)
break;
if (prop == ZFS_PROP_USED)
propname = "used";
else
propname = "quota";
(void) nvlist_lookup_uint64(lnvl, propname, &lv64);
(void) nvlist_lookup_uint64(rnvl, propname, &rv64);
if (rv64 != lv64)
rc = (rv64 < lv64) ? 1 : -1;
break;
default:
break;
}
if (rc != 0) {
if (rc < 0)
return (reverse ? 1 : -1);
else
return (reverse ? -1 : 1);
}
}
/*
* If entries still seem to be the same, check if they are of the same
* type (smbentity is added only if we are doing SID to POSIX ID
* translation where we can have duplicate type/name combinations).
*/
if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 &&
nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 &&
lvb != rvb)
return (lvb < rvb ? -1 : 1);
return (0);
}
static inline const char *
us_type2str(unsigned field_type)
{
switch (field_type) {
case USTYPE_PSX_USR:
return ("POSIX User");
case USTYPE_PSX_GRP:
return ("POSIX Group");
case USTYPE_SMB_USR:
return ("SMB User");
case USTYPE_SMB_GRP:
return ("SMB Group");
default:
return ("Undefined");
}
}
static int
userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
{
us_cbdata_t *cb = (us_cbdata_t *)arg;
zfs_userquota_prop_t prop = cb->cb_prop;
char *name = NULL;
char *propname;
char sizebuf[32];
us_node_t *node;
uu_avl_pool_t *avl_pool = cb->cb_avl_pool;
uu_avl_t *avl = cb->cb_avl;
uu_avl_index_t idx;
nvlist_t *props;
us_node_t *n;
zfs_sort_column_t *sortcol = cb->cb_sortcol;
unsigned type = 0;
const char *typestr;
size_t namelen;
size_t typelen;
size_t sizelen;
int typeidx, nameidx, sizeidx;
us_sort_info_t sortinfo = { sortcol, cb->cb_numname };
boolean_t smbentity = B_FALSE;
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
node = safe_malloc(sizeof (us_node_t));
uu_avl_node_init(node, &node->usn_avlnode, avl_pool);
node->usn_nvl = props;
if (domain != NULL && domain[0] != '\0') {
/* SMB */
char sid[MAXNAMELEN + 32];
uid_t id;
#ifdef illumos
int err;
int flag = IDMAP_REQ_FLG_USE_CACHE;
#endif
smbentity = B_TRUE;
(void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);
if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
type = USTYPE_SMB_GRP;
#ifdef illumos
err = sid_to_id(sid, B_FALSE, &id);
#endif
} else {
type = USTYPE_SMB_USR;
#ifdef illumos
err = sid_to_id(sid, B_TRUE, &id);
#endif
}
#ifdef illumos
if (err == 0) {
rid = id;
if (!cb->cb_sid2posix) {
if (type == USTYPE_SMB_USR) {
(void) idmap_getwinnamebyuid(rid, flag,
&name, NULL);
} else {
(void) idmap_getwinnamebygid(rid, flag,
&name, NULL);
}
if (name == NULL)
name = sid;
}
}
#endif
}
if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') {
/* POSIX or -i */
if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
type = USTYPE_PSX_GRP;
if (!cb->cb_numname) {
struct group *g;
if ((g = getgrgid(rid)) != NULL)
name = g->gr_name;
}
} else {
type = USTYPE_PSX_USR;
if (!cb->cb_numname) {
struct passwd *p;
if ((p = getpwuid(rid)) != NULL)
name = p->pw_name;
}
}
}
/*
* Make sure that the type/name combination is unique when doing
* SID to POSIX ID translation (hence changing the type from SMB to
* POSIX).
*/
if (cb->cb_sid2posix &&
nvlist_add_boolean_value(props, "smbentity", smbentity) != 0)
nomem();
/* Calculate/update width of TYPE field */
typestr = us_type2str(type);
typelen = strlen(gettext(typestr));
typeidx = us_field_index("type");
if (typelen > cb->cb_width[typeidx])
cb->cb_width[typeidx] = typelen;
if (nvlist_add_uint32(props, "type", type) != 0)
nomem();
/* Calculate/update width of NAME field */
if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) {
if (nvlist_add_uint64(props, "name", rid) != 0)
nomem();
namelen = snprintf(NULL, 0, "%u", rid);
} else {
if (nvlist_add_string(props, "name", name) != 0)
nomem();
namelen = strlen(name);
}
nameidx = us_field_index("name");
if (namelen > cb->cb_width[nameidx])
cb->cb_width[nameidx] = namelen;
/*
* Check if this type/name combination is in the list and update it;
* otherwise add new node to the list.
*/
if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) {
uu_avl_insert(avl, node, idx);
} else {
nvlist_free(props);
free(node);
node = n;
props = node->usn_nvl;
}
/* Calculate/update width of USED/QUOTA fields */
if (cb->cb_nicenum)
zfs_nicenum(space, sizebuf, sizeof (sizebuf));
else
(void) snprintf(sizebuf, sizeof (sizebuf), "%llu", space);
sizelen = strlen(sizebuf);
if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) {
propname = "used";
if (!nvlist_exists(props, "quota"))
(void) nvlist_add_uint64(props, "quota", 0);
} else {
propname = "quota";
if (!nvlist_exists(props, "used"))
(void) nvlist_add_uint64(props, "used", 0);
}
sizeidx = us_field_index(propname);
if (sizelen > cb->cb_width[sizeidx])
cb->cb_width[sizeidx] = sizelen;
if (nvlist_add_uint64(props, propname, space) != 0)
nomem();
return (0);
}
static void
print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types,
size_t *width, us_node_t *node)
{
nvlist_t *nvl = node->usn_nvl;
char valstr[MAXNAMELEN];
boolean_t first = B_TRUE;
int cfield = 0;
int field;
uint32_t ustype;
/* Check type */
(void) nvlist_lookup_uint32(nvl, "type", &ustype);
if (!(ustype & types))
return;
while ((field = fields[cfield]) != USFIELD_LAST) {
nvpair_t *nvp = NULL;
data_type_t type;
uint32_t val32;
uint64_t val64;
char *strval = NULL;
while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
if (strcmp(nvpair_name(nvp),
us_field_names[field]) == 0)
break;
}
type = nvpair_type(nvp);
switch (type) {
case DATA_TYPE_UINT32:
(void) nvpair_value_uint32(nvp, &val32);
break;
case DATA_TYPE_UINT64:
(void) nvpair_value_uint64(nvp, &val64);
break;
case DATA_TYPE_STRING:
(void) nvpair_value_string(nvp, &strval);
break;
default:
(void) fprintf(stderr, "invalid data type\n");
}
switch (field) {
case USFIELD_TYPE:
strval = (char *)us_type2str(val32);
break;
case USFIELD_NAME:
if (type == DATA_TYPE_UINT64) {
(void) sprintf(valstr, "%llu", val64);
strval = valstr;
}
break;
case USFIELD_USED:
case USFIELD_QUOTA:
if (type == DATA_TYPE_UINT64) {
if (parsable) {
(void) sprintf(valstr, "%llu", val64);
} else {
zfs_nicenum(val64, valstr,
sizeof (valstr));
}
if (field == USFIELD_QUOTA &&
strcmp(valstr, "0") == 0)
strval = "none";
else
strval = valstr;
}
break;
}
if (!first) {
if (scripted)
(void) printf("\t");
else
(void) printf(" ");
}
if (scripted)
(void) printf("%s", strval);
else if (field == USFIELD_TYPE || field == USFIELD_NAME)
(void) printf("%-*s", width[field], strval);
else
(void) printf("%*s", width[field], strval);
first = B_FALSE;
cfield++;
}
(void) printf("\n");
}
static void
print_us(boolean_t scripted, boolean_t parsable, int *fields, int types,
size_t *width, boolean_t rmnode, uu_avl_t *avl)
{
us_node_t *node;
const char *col;
int cfield = 0;
int field;
if (!scripted) {
boolean_t first = B_TRUE;
while ((field = fields[cfield]) != USFIELD_LAST) {
col = gettext(us_field_hdr[field]);
if (field == USFIELD_TYPE || field == USFIELD_NAME) {
(void) printf(first ? "%-*s" : " %-*s",
width[field], col);
} else {
(void) printf(first ? "%*s" : " %*s",
width[field], col);
}
first = B_FALSE;
cfield++;
}
(void) printf("\n");
}
for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) {
print_us_node(scripted, parsable, fields, types, width, node);
if (rmnode)
nvlist_free(node->usn_nvl);
}
}
static int
zfs_do_userspace(int argc, char **argv)
{
zfs_handle_t *zhp;
zfs_userquota_prop_t p;
uu_avl_pool_t *avl_pool;
uu_avl_t *avl_tree;
uu_avl_walk_t *walk;
char *delim;
char deffields[] = "type,name,used,quota";
char *ofield = NULL;
char *tfield = NULL;
int cfield = 0;
int fields[256];
int i;
boolean_t scripted = B_FALSE;
boolean_t prtnum = B_FALSE;
boolean_t parsable = B_FALSE;
boolean_t sid2posix = B_FALSE;
int ret = 0;
int c;
zfs_sort_column_t *sortcol = NULL;
int types = USTYPE_PSX_USR | USTYPE_SMB_USR;
us_cbdata_t cb;
us_node_t *node;
us_node_t *rmnode;
uu_list_pool_t *listpool;
uu_list_t *list;
uu_avl_index_t idx = 0;
uu_list_index_t idx2 = 0;
if (argc < 2)
usage(B_FALSE);
if (strcmp(argv[0], "groupspace") == 0)
/* Toggle default group types */
types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;
while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
switch (c) {
case 'n':
prtnum = B_TRUE;
break;
case 'H':
scripted = B_TRUE;
break;
case 'p':
parsable = B_TRUE;
break;
case 'o':
ofield = optarg;
break;
case 's':
case 'S':
if (zfs_add_sort_column(&sortcol, optarg,
c == 's' ? B_FALSE : B_TRUE) != 0) {
(void) fprintf(stderr,
gettext("invalid field '%s'\n"), optarg);
usage(B_FALSE);
}
break;
case 't':
tfield = optarg;
break;
case 'i':
sid2posix = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (argc < 1) {
(void) fprintf(stderr, gettext("missing dataset name\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
/* Use default output fields if not specified using -o */
if (ofield == NULL)
ofield = deffields;
do {
if ((delim = strchr(ofield, ',')) != NULL)
*delim = '\0';
if ((fields[cfield++] = us_field_index(ofield)) == -1) {
(void) fprintf(stderr, gettext("invalid type '%s' "
"for -o option\n"), ofield);
return (-1);
}
if (delim != NULL)
ofield = delim + 1;
} while (delim != NULL);
fields[cfield] = USFIELD_LAST;
/* Override output types (-t option) */
if (tfield != NULL) {
types = 0;
do {
boolean_t found = B_FALSE;
if ((delim = strchr(tfield, ',')) != NULL)
*delim = '\0';
for (i = 0; i < sizeof (us_type_bits) / sizeof (int);
i++) {
if (strcmp(tfield, us_type_names[i]) == 0) {
found = B_TRUE;
types |= us_type_bits[i];
break;
}
}
if (!found) {
(void) fprintf(stderr, gettext("invalid type "
"'%s' for -t option\n"), tfield);
return (-1);
}
if (delim != NULL)
tfield = delim + 1;
} while (delim != NULL);
}
if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
return (1);
if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL)
nomem();
if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
nomem();
/* Always add default sorting columns */
(void) zfs_add_sort_column(&sortcol, "type", B_FALSE);
(void) zfs_add_sort_column(&sortcol, "name", B_FALSE);
cb.cb_sortcol = sortcol;
cb.cb_numname = prtnum;
cb.cb_nicenum = !parsable;
cb.cb_avl_pool = avl_pool;
cb.cb_avl = avl_tree;
cb.cb_sid2posix = sid2posix;
for (i = 0; i < USFIELD_LAST; i++)
cb.cb_width[i] = strlen(gettext(us_field_hdr[i]));
for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
if (((p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA) &&
!(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) ||
((p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) &&
!(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))))
continue;
cb.cb_prop = p;
if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0)
return (ret);
}
/* Sort the list */
if ((node = uu_avl_first(avl_tree)) == NULL)
return (0);
us_populated = B_TRUE;
listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT);
list = uu_list_create(listpool, NULL, UU_DEFAULT);
uu_list_node_init(node, &node->usn_listnode, listpool);
while (node != NULL) {
rmnode = node;
node = uu_avl_next(avl_tree, node);
uu_avl_remove(avl_tree, rmnode);
if (uu_list_find(list, rmnode, NULL, &idx2) == NULL)
uu_list_insert(list, rmnode, idx2);
}
for (node = uu_list_first(list); node != NULL;
node = uu_list_next(list, node)) {
us_sort_info_t sortinfo = { sortcol, cb.cb_numname };
if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL)
uu_avl_insert(avl_tree, node, idx);
}
uu_list_destroy(list);
uu_list_pool_destroy(listpool);
/* Print and free node nvlist memory */
print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE,
cb.cb_avl);
zfs_free_sort_columns(sortcol);
/* Clean up the AVL tree */
if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
nomem();
while ((node = uu_avl_walk_next(walk)) != NULL) {
uu_avl_remove(cb.cb_avl, node);
free(node);
}
uu_avl_walk_end(walk);
uu_avl_destroy(avl_tree);
uu_avl_pool_destroy(avl_pool);
return (ret);
}
/*
* list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] ...
* [-t type[,...]] [filesystem|volume|snapshot] ...
*
* -H Scripted mode; elide headers and separate columns by tabs.
* -p Display values in parsable (literal) format.
* -r Recurse over all children.
* -d Limit recursion by depth.
* -o Control which fields to display.
* -s Specify sort columns, descending order.
* -S Specify sort columns, ascending order.
* -t Control which object types to display.
*
* When given no arguments, list all filesystems in the system.
* Otherwise, list the specified datasets, optionally recursing down them if
* '-r' is specified.
*/
typedef struct list_cbdata {
boolean_t cb_first;
boolean_t cb_literal;
boolean_t cb_scripted;
zprop_list_t *cb_proplist;
} list_cbdata_t;
/*
* Given a list of columns to display, output appropriate headers for each one.
*/
static void
print_header(list_cbdata_t *cb)
{
zprop_list_t *pl = cb->cb_proplist;
char headerbuf[ZFS_MAXPROPLEN];
const char *header;
int i;
boolean_t first = B_TRUE;
boolean_t right_justify;
for (; pl != NULL; pl = pl->pl_next) {
if (!first) {
(void) printf(" ");
} else {
first = B_FALSE;
}
right_justify = B_FALSE;
if (pl->pl_prop != ZPROP_INVAL) {
header = zfs_prop_column_name(pl->pl_prop);
right_justify = zfs_prop_align_right(pl->pl_prop);
} else {
for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
headerbuf[i] = toupper(pl->pl_user_prop[i]);
headerbuf[i] = '\0';
header = headerbuf;
}
if (pl->pl_next == NULL && !right_justify)
(void) printf("%s", header);
else if (right_justify)
(void) printf("%*s", pl->pl_width, header);
else
(void) printf("%-*s", pl->pl_width, header);
}
(void) printf("\n");
}
/*
* Given a dataset and a list of fields, print out all the properties according
* to the described layout.
*/
static void
print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
{
zprop_list_t *pl = cb->cb_proplist;
boolean_t first = B_TRUE;
char property[ZFS_MAXPROPLEN];
nvlist_t *userprops = zfs_get_user_props(zhp);
nvlist_t *propval;
char *propstr;
boolean_t right_justify;
for (; pl != NULL; pl = pl->pl_next) {
if (!first) {
if (cb->cb_scripted)
(void) printf("\t");
else
(void) printf(" ");
} else {
first = B_FALSE;
}
if (pl->pl_prop == ZFS_PROP_NAME) {
(void) strlcpy(property, zfs_get_name(zhp),
sizeof (property));
propstr = property;
right_justify = zfs_prop_align_right(pl->pl_prop);
} else if (pl->pl_prop != ZPROP_INVAL) {
if (zfs_prop_get(zhp, pl->pl_prop, property,
sizeof (property), NULL, NULL, 0,
cb->cb_literal) != 0)
propstr = "-";
else
propstr = property;
right_justify = zfs_prop_align_right(pl->pl_prop);
} else if (zfs_prop_userquota(pl->pl_user_prop)) {
if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
property, sizeof (property), cb->cb_literal) != 0)
propstr = "-";
else
propstr = property;
right_justify = B_TRUE;
} else if (zfs_prop_written(pl->pl_user_prop)) {
if (zfs_prop_get_written(zhp, pl->pl_user_prop,
property, sizeof (property), cb->cb_literal) != 0)
propstr = "-";
else
propstr = property;
right_justify = B_TRUE;
} else {
if (nvlist_lookup_nvlist(userprops,
pl->pl_user_prop, &propval) != 0)
propstr = "-";
else
verify(nvlist_lookup_string(propval,
ZPROP_VALUE, &propstr) == 0);
right_justify = B_FALSE;
}
/*
* If this is being called in scripted mode, or if this is the
* last column and it is left-justified, don't include a width
* format specifier.
*/
if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
(void) printf("%s", propstr);
else if (right_justify)
(void) printf("%*s", pl->pl_width, propstr);
else
(void) printf("%-*s", pl->pl_width, propstr);
}
(void) printf("\n");
}
/*
* Generic callback function to list a dataset or snapshot.
*/
static int
list_callback(zfs_handle_t *zhp, void *data)
{
list_cbdata_t *cbp = data;
if (cbp->cb_first) {
if (!cbp->cb_scripted)
print_header(cbp);
cbp->cb_first = B_FALSE;
}
print_dataset(zhp, cbp);
return (0);
}
static int
zfs_do_list(int argc, char **argv)
{
int c;
static char default_fields[] =
"name,used,available,referenced,mountpoint";
int types = ZFS_TYPE_DATASET;
boolean_t types_specified = B_FALSE;
char *fields = NULL;
list_cbdata_t cb = { 0 };
char *value;
int limit = 0;
int ret = 0;
zfs_sort_column_t *sortcol = NULL;
int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
/* check options */
while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) {
switch (c) {
case 'o':
fields = optarg;
break;
case 'p':
cb.cb_literal = B_TRUE;
flags |= ZFS_ITER_LITERAL_PROPS;
break;
case 'd':
limit = parse_depth(optarg, &flags);
break;
case 'r':
flags |= ZFS_ITER_RECURSE;
break;
case 'H':
cb.cb_scripted = B_TRUE;
break;
case 's':
if (zfs_add_sort_column(&sortcol, optarg,
B_FALSE) != 0) {
(void) fprintf(stderr,
gettext("invalid property '%s'\n"), optarg);
usage(B_FALSE);
}
break;
case 'S':
if (zfs_add_sort_column(&sortcol, optarg,
B_TRUE) != 0) {
(void) fprintf(stderr,
gettext("invalid property '%s'\n"), optarg);
usage(B_FALSE);
}
break;
case 't':
types = 0;
types_specified = B_TRUE;
flags &= ~ZFS_ITER_PROP_LISTSNAPS;
while (*optarg != '\0') {
static char *type_subopts[] = { "filesystem",
"volume", "snapshot", "snap", "bookmark",
"all", NULL };
switch (getsubopt(&optarg, type_subopts,
&value)) {
case 0:
types |= ZFS_TYPE_FILESYSTEM;
break;
case 1:
types |= ZFS_TYPE_VOLUME;
break;
case 2:
case 3:
types |= ZFS_TYPE_SNAPSHOT;
break;
case 4:
types |= ZFS_TYPE_BOOKMARK;
break;
case 5:
types = ZFS_TYPE_DATASET |
ZFS_TYPE_BOOKMARK;
break;
default:
(void) fprintf(stderr,
gettext("invalid type '%s'\n"),
suboptarg);
usage(B_FALSE);
}
}
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (fields == NULL)
fields = default_fields;
/*
* If we are only going to list snapshot names and sort by name,
* then we can use faster version.
*/
if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol))
flags |= ZFS_ITER_SIMPLE;
/*
* If "-o space" and no types were specified, don't display snapshots.
*/
if (strcmp(fields, "space") == 0 && types_specified == B_FALSE)
types &= ~ZFS_TYPE_SNAPSHOT;
/*
* If the user specifies '-o all', the zprop_get_list() doesn't
* normally include the name of the dataset. For 'zfs list', we always
* want this property to be first.
*/
if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
!= 0)
usage(B_FALSE);
cb.cb_first = B_TRUE;
ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
limit, list_callback, &cb);
zprop_free_list(cb.cb_proplist);
zfs_free_sort_columns(sortcol);
if (ret == 0 && cb.cb_first && !cb.cb_scripted)
(void) printf(gettext("no datasets available\n"));
return (ret);
}
/*
* zfs rename [-f] <fs | snap | vol> <fs | snap | vol>
* zfs rename [-f] -p <fs | vol> <fs | vol>
* zfs rename -r <snap> <snap>
* zfs rename -u [-p] <fs> <fs>
*
* Renames the given dataset to another of the same type.
*
* The '-p' flag creates all the non-existing ancestors of the target first.
*/
/* ARGSUSED */
static int
zfs_do_rename(int argc, char **argv)
{
zfs_handle_t *zhp;
renameflags_t flags = { 0 };
int c;
int ret = 0;
int types;
boolean_t parents = B_FALSE;
char *snapshot = NULL;
/* check options */
while ((c = getopt(argc, argv, "fpru")) != -1) {
switch (c) {
case 'p':
parents = B_TRUE;
break;
case 'r':
flags.recurse = B_TRUE;
break;
case 'u':
flags.nounmount = B_TRUE;
break;
case 'f':
flags.forceunmount = B_TRUE;
break;
case '?':
default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing source dataset "
"argument\n"));
usage(B_FALSE);
}
if (argc < 2) {
(void) fprintf(stderr, gettext("missing target dataset "
"argument\n"));
usage(B_FALSE);
}
if (argc > 2) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
if (flags.recurse && parents) {
(void) fprintf(stderr, gettext("-p and -r options are mutually "
"exclusive\n"));
usage(B_FALSE);
}
if (flags.recurse && strchr(argv[0], '@') == 0) {
(void) fprintf(stderr, gettext("source dataset for recursive "
"rename must be a snapshot\n"));
usage(B_FALSE);
}
if (flags.nounmount && parents) {
(void) fprintf(stderr, gettext("-u and -p options are mutually "
"exclusive\n"));
usage(B_FALSE);
}
if (flags.nounmount)
types = ZFS_TYPE_FILESYSTEM;
else if (parents)
types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
else
types = ZFS_TYPE_DATASET;
if (flags.recurse) {
/*
* When we do recursive rename we are fine when the given
* snapshot for the given dataset doesn't exist - it can
* still exists below.
*/
snapshot = strchr(argv[0], '@');
assert(snapshot != NULL);
*snapshot = '\0';
snapshot++;
}
if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
return (1);
/* If we were asked and the name looks good, try to create ancestors. */
if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) &&
zfs_create_ancestors(g_zfs, argv[1]) != 0) {
zfs_close(zhp);
return (1);
}
ret = (zfs_rename(zhp, snapshot, argv[1], flags) != 0);
zfs_close(zhp);
return (ret);
}
/*
* zfs promote <fs>
*
* Promotes the given clone fs to be the parent
*/
/* ARGSUSED */
static int
zfs_do_promote(int argc, char **argv)
{
zfs_handle_t *zhp;
int ret = 0;
/* check options */
if (argc > 1 && argv[1][0] == '-') {
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
argv[1][1]);
usage(B_FALSE);
}
/* check number of arguments */
if (argc < 2) {
(void) fprintf(stderr, gettext("missing clone filesystem"
" argument\n"));
usage(B_FALSE);
}
if (argc > 2) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL)
return (1);
ret = (zfs_promote(zhp) != 0);
zfs_close(zhp);
return (ret);
}
/*
* zfs rollback [-rRf] <snapshot>
*
* -r Delete any intervening snapshots before doing rollback
* -R Delete any snapshots and their clones
* -f ignored for backwards compatability
*
* Given a filesystem, rollback to a specific snapshot, discarding any changes
* since then and making it the active dataset. If more recent snapshots exist,
* the command will complain unless the '-r' flag is given.
*/
typedef struct rollback_cbdata {
uint64_t cb_create;
boolean_t cb_first;
int cb_doclones;
char *cb_target;
int cb_error;
boolean_t cb_recurse;
} rollback_cbdata_t;
static int
rollback_check_dependent(zfs_handle_t *zhp, void *data)
{
rollback_cbdata_t *cbp = data;
if (cbp->cb_first && cbp->cb_recurse) {
(void) fprintf(stderr, gettext("cannot rollback to "
"'%s': clones of previous snapshots exist\n"),
cbp->cb_target);
(void) fprintf(stderr, gettext("use '-R' to "
"force deletion of the following clones and "
"dependents:\n"));
cbp->cb_first = 0;
cbp->cb_error = 1;
}
(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
zfs_close(zhp);
return (0);
}
/*
* Report any snapshots more recent than the one specified. Used when '-r' is
* not specified. We reuse this same callback for the snapshot dependents - if
* 'cb_dependent' is set, then this is a dependent and we should report it
* without checking the transaction group.
*/
static int
rollback_check(zfs_handle_t *zhp, void *data)
{
rollback_cbdata_t *cbp = data;
if (cbp->cb_doclones) {
zfs_close(zhp);
return (0);
}
if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
if (cbp->cb_first && !cbp->cb_recurse) {
(void) fprintf(stderr, gettext("cannot "
"rollback to '%s': more recent snapshots "
"or bookmarks exist\n"),
cbp->cb_target);
(void) fprintf(stderr, gettext("use '-r' to "
"force deletion of the following "
"snapshots and bookmarks:\n"));
cbp->cb_first = 0;
cbp->cb_error = 1;
}
if (cbp->cb_recurse) {
if (zfs_iter_dependents(zhp, B_TRUE,
rollback_check_dependent, cbp) != 0) {
zfs_close(zhp);
return (-1);
}
} else {
(void) fprintf(stderr, "%s\n",
zfs_get_name(zhp));
}
}
zfs_close(zhp);
return (0);
}
static int
zfs_do_rollback(int argc, char **argv)
{
int ret = 0;
int c;
boolean_t force = B_FALSE;
rollback_cbdata_t cb = { 0 };
zfs_handle_t *zhp, *snap;
char parentname[ZFS_MAX_DATASET_NAME_LEN];
char *delim;
/* check options */
while ((c = getopt(argc, argv, "rRf")) != -1) {
switch (c) {
case 'r':
cb.cb_recurse = 1;
break;
case 'R':
cb.cb_recurse = 1;
cb.cb_doclones = 1;
break;
case 'f':
force = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing dataset argument\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
/* open the snapshot */
if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
return (1);
/* open the parent dataset */
(void) strlcpy(parentname, argv[0], sizeof (parentname));
verify((delim = strrchr(parentname, '@')) != NULL);
*delim = '\0';
if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) {
zfs_close(snap);
return (1);
}
/*
* Check for more recent snapshots and/or clones based on the presence
* of '-r' and '-R'.
*/
cb.cb_target = argv[0];
cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
cb.cb_first = B_TRUE;
cb.cb_error = 0;
if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb)) != 0)
goto out;
if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0)
goto out;
if ((ret = cb.cb_error) != 0)
goto out;
/*
* Rollback parent to the given snapshot.
*/
ret = zfs_rollback(zhp, snap, force);
out:
zfs_close(snap);
zfs_close(zhp);
if (ret == 0)
return (0);
else
return (1);
}
/*
* zfs set property=value ... { fs | snap | vol } ...
*
* Sets the given properties for all datasets specified on the command line.
*/
static int
set_callback(zfs_handle_t *zhp, void *data)
{
nvlist_t *props = data;
if (zfs_prop_set_list(zhp, props) != 0) {
switch (libzfs_errno(g_zfs)) {
case EZFS_MOUNTFAILED:
(void) fprintf(stderr, gettext("property may be set "
"but unable to remount filesystem\n"));
break;
case EZFS_SHARENFSFAILED:
(void) fprintf(stderr, gettext("property may be set "
"but unable to reshare filesystem\n"));
break;
}
return (1);
}
return (0);
}
static int
zfs_do_set(int argc, char **argv)
{
nvlist_t *props = NULL;
int ds_start = -1; /* argv idx of first dataset arg */
int ret = 0;
/* check for options */
if (argc > 1 && argv[1][0] == '-') {
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
argv[1][1]);
usage(B_FALSE);
}
/* check number of arguments */
if (argc < 2) {
(void) fprintf(stderr, gettext("missing arguments\n"));
usage(B_FALSE);
}
if (argc < 3) {
if (strchr(argv[1], '=') == NULL) {
(void) fprintf(stderr, gettext("missing property=value "
"argument(s)\n"));
} else {
(void) fprintf(stderr, gettext("missing dataset "
"name(s)\n"));
}
usage(B_FALSE);
}
/* validate argument order: prop=val args followed by dataset args */
for (int i = 1; i < argc; i++) {
if (strchr(argv[i], '=') != NULL) {
if (ds_start > 0) {
/* out-of-order prop=val argument */
(void) fprintf(stderr, gettext("invalid "
"argument order\n"), i);
usage(B_FALSE);
}
} else if (ds_start < 0) {
ds_start = i;
}
}
if (ds_start < 0) {
(void) fprintf(stderr, gettext("missing dataset name(s)\n"));
usage(B_FALSE);
}
/* Populate a list of property settings */
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
for (int i = 1; i < ds_start; i++) {
if ((ret = parseprop(props, argv[i])) != 0)
goto error;
}
ret = zfs_for_each(argc - ds_start, argv + ds_start, 0,
ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props);
error:
nvlist_free(props);
return (ret);
}
typedef struct snap_cbdata {
nvlist_t *sd_nvl;
boolean_t sd_recursive;
const char *sd_snapname;
} snap_cbdata_t;
static int
zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
{
snap_cbdata_t *sd = arg;
char *name;
int rv = 0;
int error;
if (sd->sd_recursive &&
zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
zfs_close(zhp);
return (0);
}
error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname);
if (error == -1)
nomem();
fnvlist_add_boolean(sd->sd_nvl, name);
free(name);
if (sd->sd_recursive)
rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
zfs_close(zhp);
return (rv);
}
/*
* zfs snapshot [-r] [-o prop=value] ... <fs@snap>
*
* Creates a snapshot with the given name. While functionally equivalent to
* 'zfs create', it is a separate command to differentiate intent.
*/
static int
zfs_do_snapshot(int argc, char **argv)
{
int ret = 0;
int c;
nvlist_t *props;
snap_cbdata_t sd = { 0 };
boolean_t multiple_snaps = B_FALSE;
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0)
nomem();
/* check options */
while ((c = getopt(argc, argv, "ro:")) != -1) {
switch (c) {
case 'o':
if (parseprop(props, optarg) != 0)
return (1);
break;
case 'r':
sd.sd_recursive = B_TRUE;
multiple_snaps = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
goto usage;
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing snapshot argument\n"));
goto usage;
}
if (argc > 1)
multiple_snaps = B_TRUE;
for (; argc > 0; argc--, argv++) {
char *atp;
zfs_handle_t *zhp;
atp = strchr(argv[0], '@');
if (atp == NULL)
goto usage;
*atp = '\0';
sd.sd_snapname = atp + 1;
zhp = zfs_open(g_zfs, argv[0],
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL)
goto usage;
if (zfs_snapshot_cb(zhp, &sd) != 0)
goto usage;
}
ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props);
nvlist_free(sd.sd_nvl);
nvlist_free(props);
if (ret != 0 && multiple_snaps)
(void) fprintf(stderr, gettext("no snapshots were created\n"));
return (ret != 0);
usage:
nvlist_free(sd.sd_nvl);
nvlist_free(props);
usage(B_FALSE);
return (-1);
}
/*
* Send a backup stream to stdout.
*/
static int
zfs_do_send(int argc, char **argv)
{
char *fromname = NULL;
char *toname = NULL;
char *resume_token = NULL;
char *cp;
zfs_handle_t *zhp;
sendflags_t flags = { 0 };
int c, err;
nvlist_t *dbgnv = NULL;
boolean_t extraverbose = B_FALSE;
struct option long_options[] = {
{"replicate", no_argument, NULL, 'R'},
{"props", no_argument, NULL, 'p'},
{"parsable", no_argument, NULL, 'P'},
{"dedup", no_argument, NULL, 'D'},
{"verbose", no_argument, NULL, 'v'},
{"dryrun", no_argument, NULL, 'n'},
{"large-block", no_argument, NULL, 'L'},
{"embed", no_argument, NULL, 'e'},
{"resume", required_argument, NULL, 't'},
{"compressed", no_argument, NULL, 'c'},
{0, 0, 0, 0}
};
/* check options */
while ((c = getopt_long(argc, argv, ":i:I:RbDpvnPLet:c", long_options,
NULL)) != -1) {
switch (c) {
case 'i':
if (fromname)
usage(B_FALSE);
fromname = optarg;
break;
case 'I':
if (fromname)
usage(B_FALSE);
fromname = optarg;
flags.doall = B_TRUE;
break;
case 'R':
flags.replicate = B_TRUE;
break;
case 'p':
flags.props = B_TRUE;
break;
case 'P':
flags.parsable = B_TRUE;
flags.verbose = B_TRUE;
break;
case 'v':
if (flags.verbose)
extraverbose = B_TRUE;
flags.verbose = B_TRUE;
flags.progress = B_TRUE;
break;
case 'D':
flags.dedup = B_TRUE;
break;
case 'n':
flags.dryrun = B_TRUE;
break;
case 'L':
flags.largeblock = B_TRUE;
break;
case 'e':
flags.embed_data = B_TRUE;
break;
case 't':
resume_token = optarg;
break;
case 'c':
flags.compress = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
/*FALLTHROUGH*/
default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (resume_token != NULL) {
if (fromname != NULL || flags.replicate || flags.props ||
flags.dedup) {
(void) fprintf(stderr,
gettext("invalid flags combined with -t\n"));
usage(B_FALSE);
}
if (argc != 0) {
(void) fprintf(stderr, gettext("no additional "
"arguments are permitted with -t\n"));
usage(B_FALSE);
}
} else {
if (argc < 1) {
(void) fprintf(stderr,
gettext("missing snapshot argument\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
}
if (!flags.dryrun && isatty(STDOUT_FILENO)) {
(void) fprintf(stderr,
gettext("Error: Stream can not be written to a terminal.\n"
"You must redirect standard output.\n"));
return (1);
}
if (resume_token != NULL) {
return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
resume_token));
}
/*
* Special case sending a filesystem, or from a bookmark.
*/
if (strchr(argv[0], '@') == NULL ||
(fromname && strchr(fromname, '#') != NULL)) {
char frombuf[ZFS_MAX_DATASET_NAME_LEN];
enum lzc_send_flags lzc_flags = 0;
if (flags.replicate || flags.doall || flags.props ||
flags.dedup || flags.dryrun || flags.verbose ||
flags.progress) {
(void) fprintf(stderr,
gettext("Error: "
"Unsupported flag with filesystem or bookmark.\n"));
return (1);
}
zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
if (zhp == NULL)
return (1);
if (flags.largeblock)
lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (flags.embed_data)
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
if (flags.compress)
lzc_flags |= LZC_SEND_FLAG_COMPRESS;
if (fromname != NULL &&
(fromname[0] == '#' || fromname[0] == '@')) {
/*
* Incremental source name begins with # or @.
* Default to same fs as target.
*/
(void) strncpy(frombuf, argv[0], sizeof (frombuf));
cp = strchr(frombuf, '@');
if (cp != NULL)
*cp = '\0';
(void) strlcat(frombuf, fromname, sizeof (frombuf));
fromname = frombuf;
}
err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags);
zfs_close(zhp);
return (err != 0);
}
cp = strchr(argv[0], '@');
*cp = '\0';
toname = cp + 1;
zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL)
return (1);
/*
* If they specified the full path to the snapshot, chop off
* everything except the short name of the snapshot, but special
* case if they specify the origin.
*/
if (fromname && (cp = strchr(fromname, '@')) != NULL) {
char origin[ZFS_MAX_DATASET_NAME_LEN];
zprop_source_t src;
(void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
origin, sizeof (origin), &src, NULL, 0, B_FALSE);
if (strcmp(origin, fromname) == 0) {
fromname = NULL;
flags.fromorigin = B_TRUE;
} else {
*cp = '\0';
if (cp != fromname && strcmp(argv[0], fromname)) {
(void) fprintf(stderr,
gettext("incremental source must be "
"in same filesystem\n"));
usage(B_FALSE);
}
fromname = cp + 1;
if (strchr(fromname, '@') || strchr(fromname, '/')) {
(void) fprintf(stderr,
gettext("invalid incremental source\n"));
usage(B_FALSE);
}
}
}
if (flags.replicate && fromname == NULL)
flags.doall = B_TRUE;
err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0,
extraverbose ? &dbgnv : NULL);
if (extraverbose && dbgnv != NULL) {
/*
* dump_nvlist prints to stdout, but that's been
* redirected to a file. Make it print to stderr
* instead.
*/
(void) dup2(STDERR_FILENO, STDOUT_FILENO);
dump_nvlist(dbgnv, 0);
nvlist_free(dbgnv);
}
zfs_close(zhp);
return (err != 0);
}
/*
* Restore a backup stream from stdin.
*/
static int
zfs_do_receive(int argc, char **argv)
{
int c, err = 0;
recvflags_t flags = { 0 };
boolean_t abort_resumable = B_FALSE;
nvlist_t *props;
nvpair_t *nvp = NULL;
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
/* check options */
while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) {
switch (c) {
case 'o':
if (parseprop(props, optarg) != 0)
return (1);
break;
case 'd':
flags.isprefix = B_TRUE;
break;
case 'e':
flags.isprefix = B_TRUE;
flags.istail = B_TRUE;
break;
case 'n':
flags.dryrun = B_TRUE;
break;
case 'u':
flags.nomount = B_TRUE;
break;
case 'v':
flags.verbose = B_TRUE;
break;
case 's':
flags.resumable = B_TRUE;
break;
case 'F':
flags.force = B_TRUE;
break;
case 'A':
abort_resumable = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing snapshot argument\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
while ((nvp = nvlist_next_nvpair(props, nvp))) {
if (strcmp(nvpair_name(nvp), "origin") != 0) {
(void) fprintf(stderr, gettext("invalid option"));
usage(B_FALSE);
}
}
if (abort_resumable) {
if (flags.isprefix || flags.istail || flags.dryrun ||
flags.resumable || flags.nomount) {
(void) fprintf(stderr, gettext("invalid option"));
usage(B_FALSE);
}
char namebuf[ZFS_MAX_DATASET_NAME_LEN];
(void) snprintf(namebuf, sizeof (namebuf),
"%s/%%recv", argv[0]);
if (zfs_dataset_exists(g_zfs, namebuf,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) {
zfs_handle_t *zhp = zfs_open(g_zfs,
namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL)
return (1);
err = zfs_destroy(zhp, B_FALSE);
} else {
zfs_handle_t *zhp = zfs_open(g_zfs,
argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL)
usage(B_FALSE);
if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) ||
zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
(void) fprintf(stderr,
gettext("'%s' does not have any "
"resumable receive state to abort\n"),
argv[0]);
return (1);
}
err = zfs_destroy(zhp, B_FALSE);
}
return (err != 0);
}
if (isatty(STDIN_FILENO)) {
(void) fprintf(stderr,
gettext("Error: Backup stream can not be read "
"from a terminal.\n"
"You must redirect standard input.\n"));
return (1);
}
err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
return (err != 0);
}
/*
* allow/unallow stuff
*/
/* copied from zfs/sys/dsl_deleg.h */
#define ZFS_DELEG_PERM_CREATE "create"
#define ZFS_DELEG_PERM_DESTROY "destroy"
#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
#define ZFS_DELEG_PERM_ROLLBACK "rollback"
#define ZFS_DELEG_PERM_CLONE "clone"
#define ZFS_DELEG_PERM_PROMOTE "promote"
#define ZFS_DELEG_PERM_RENAME "rename"
#define ZFS_DELEG_PERM_MOUNT "mount"
#define ZFS_DELEG_PERM_SHARE "share"
#define ZFS_DELEG_PERM_SEND "send"
#define ZFS_DELEG_PERM_RECEIVE "receive"
#define ZFS_DELEG_PERM_ALLOW "allow"
#define ZFS_DELEG_PERM_USERPROP "userprop"
#define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */
#define ZFS_DELEG_PERM_USERQUOTA "userquota"
#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
#define ZFS_DELEG_PERM_USERUSED "userused"
#define ZFS_DELEG_PERM_GROUPUSED "groupused"
#define ZFS_DELEG_PERM_HOLD "hold"
#define ZFS_DELEG_PERM_RELEASE "release"
#define ZFS_DELEG_PERM_DIFF "diff"
#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
+#define ZFS_DELEG_PERM_REMAP "remap"
#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE
static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
{ ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
{ ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
{ ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
{ ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
{ ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
{ ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
{ ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
{ ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
{ ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
{ ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
{ ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
{ ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
{ ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
{ ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
{ ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
{ ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK },
+ { ZFS_DELEG_PERM_REMAP, ZFS_DELEG_NOTE_REMAP },
{ ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
{ ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
{ ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
{ ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
{ ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
{ NULL, ZFS_DELEG_NOTE_NONE }
};
/* permission structure */
typedef struct deleg_perm {
zfs_deleg_who_type_t dp_who_type;
const char *dp_name;
boolean_t dp_local;
boolean_t dp_descend;
} deleg_perm_t;
/* */
typedef struct deleg_perm_node {
deleg_perm_t dpn_perm;
uu_avl_node_t dpn_avl_node;
} deleg_perm_node_t;
typedef struct fs_perm fs_perm_t;
/* permissions set */
typedef struct who_perm {
zfs_deleg_who_type_t who_type;
const char *who_name; /* id */
char who_ug_name[256]; /* user/group name */
fs_perm_t *who_fsperm; /* uplink */
uu_avl_t *who_deleg_perm_avl; /* permissions */
} who_perm_t;
/* */
typedef struct who_perm_node {
who_perm_t who_perm;
uu_avl_node_t who_avl_node;
} who_perm_node_t;
typedef struct fs_perm_set fs_perm_set_t;
/* fs permissions */
struct fs_perm {
const char *fsp_name;
uu_avl_t *fsp_sc_avl; /* sets,create */
uu_avl_t *fsp_uge_avl; /* user,group,everyone */
fs_perm_set_t *fsp_set; /* uplink */
};
/* */
typedef struct fs_perm_node {
fs_perm_t fspn_fsperm;
uu_avl_t *fspn_avl;
uu_list_node_t fspn_list_node;
} fs_perm_node_t;
/* top level structure */
struct fs_perm_set {
uu_list_pool_t *fsps_list_pool;
uu_list_t *fsps_list; /* list of fs_perms */
uu_avl_pool_t *fsps_named_set_avl_pool;
uu_avl_pool_t *fsps_who_perm_avl_pool;
uu_avl_pool_t *fsps_deleg_perm_avl_pool;
};
static inline const char *
deleg_perm_type(zfs_deleg_note_t note)
{
/* subcommands */
switch (note) {
/* SUBCOMMANDS */
/* OTHER */
case ZFS_DELEG_NOTE_GROUPQUOTA:
case ZFS_DELEG_NOTE_GROUPUSED:
case ZFS_DELEG_NOTE_USERPROP:
case ZFS_DELEG_NOTE_USERQUOTA:
case ZFS_DELEG_NOTE_USERUSED:
/* other */
return (gettext("other"));
default:
return (gettext("subcommand"));
}
}
static int
who_type2weight(zfs_deleg_who_type_t who_type)
{
int res;
switch (who_type) {
case ZFS_DELEG_NAMED_SET_SETS:
case ZFS_DELEG_NAMED_SET:
res = 0;
break;
case ZFS_DELEG_CREATE_SETS:
case ZFS_DELEG_CREATE:
res = 1;
break;
case ZFS_DELEG_USER_SETS:
case ZFS_DELEG_USER:
res = 2;
break;
case ZFS_DELEG_GROUP_SETS:
case ZFS_DELEG_GROUP:
res = 3;
break;
case ZFS_DELEG_EVERYONE_SETS:
case ZFS_DELEG_EVERYONE:
res = 4;
break;
default:
res = -1;
}
return (res);
}
/* ARGSUSED */
static int
who_perm_compare(const void *larg, const void *rarg, void *unused)
{
const who_perm_node_t *l = larg;
const who_perm_node_t *r = rarg;
zfs_deleg_who_type_t ltype = l->who_perm.who_type;
zfs_deleg_who_type_t rtype = r->who_perm.who_type;
int lweight = who_type2weight(ltype);
int rweight = who_type2weight(rtype);
int res = lweight - rweight;
if (res == 0)
res = strncmp(l->who_perm.who_name, r->who_perm.who_name,
ZFS_MAX_DELEG_NAME-1);
if (res == 0)
return (0);
if (res > 0)
return (1);
else
return (-1);
}
/* ARGSUSED */
static int
deleg_perm_compare(const void *larg, const void *rarg, void *unused)
{
const deleg_perm_node_t *l = larg;
const deleg_perm_node_t *r = rarg;
int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name,
ZFS_MAX_DELEG_NAME-1);
if (res == 0)
return (0);
if (res > 0)
return (1);
else
return (-1);
}
static inline void
fs_perm_set_init(fs_perm_set_t *fspset)
{
bzero(fspset, sizeof (fs_perm_set_t));
if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
NULL, UU_DEFAULT)) == NULL)
nomem();
if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
UU_DEFAULT)) == NULL)
nomem();
if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
"named_set_avl_pool", sizeof (who_perm_node_t), offsetof(
who_perm_node_t, who_avl_node), who_perm_compare,
UU_DEFAULT)) == NULL)
nomem();
if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
"who_perm_avl_pool", sizeof (who_perm_node_t), offsetof(
who_perm_node_t, who_avl_node), who_perm_compare,
UU_DEFAULT)) == NULL)
nomem();
if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
"deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof(
deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT))
== NULL)
nomem();
}
static inline void fs_perm_fini(fs_perm_t *);
static inline void who_perm_fini(who_perm_t *);
static inline void
fs_perm_set_fini(fs_perm_set_t *fspset)
{
fs_perm_node_t *node = uu_list_first(fspset->fsps_list);
while (node != NULL) {
fs_perm_node_t *next_node =
uu_list_next(fspset->fsps_list, node);
fs_perm_t *fsperm = &node->fspn_fsperm;
fs_perm_fini(fsperm);
uu_list_remove(fspset->fsps_list, node);
free(node);
node = next_node;
}
uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
}
static inline void
deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
const char *name)
{
deleg_perm->dp_who_type = type;
deleg_perm->dp_name = name;
}
static inline void
who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
zfs_deleg_who_type_t type, const char *name)
{
uu_avl_pool_t *pool;
pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;
bzero(who_perm, sizeof (who_perm_t));
if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL,
UU_DEFAULT)) == NULL)
nomem();
who_perm->who_type = type;
who_perm->who_name = name;
who_perm->who_fsperm = fsperm;
}
static inline void
who_perm_fini(who_perm_t *who_perm)
{
deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl);
while (node != NULL) {
deleg_perm_node_t *next_node =
uu_avl_next(who_perm->who_deleg_perm_avl, node);
uu_avl_remove(who_perm->who_deleg_perm_avl, node);
free(node);
node = next_node;
}
uu_avl_destroy(who_perm->who_deleg_perm_avl);
}
static inline void
fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
{
uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool;
uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool;
bzero(fsperm, sizeof (fs_perm_t));
if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT))
== NULL)
nomem();
if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT))
== NULL)
nomem();
fsperm->fsp_set = fspset;
fsperm->fsp_name = fsname;
}
static inline void
fs_perm_fini(fs_perm_t *fsperm)
{
who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl);
while (node != NULL) {
who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl,
node);
who_perm_t *who_perm = &node->who_perm;
who_perm_fini(who_perm);
uu_avl_remove(fsperm->fsp_sc_avl, node);
free(node);
node = next_node;
}
node = uu_avl_first(fsperm->fsp_uge_avl);
while (node != NULL) {
who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl,
node);
who_perm_t *who_perm = &node->who_perm;
who_perm_fini(who_perm);
uu_avl_remove(fsperm->fsp_uge_avl, node);
free(node);
node = next_node;
}
uu_avl_destroy(fsperm->fsp_sc_avl);
uu_avl_destroy(fsperm->fsp_uge_avl);
}
static void
set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
zfs_deleg_who_type_t who_type, const char *name, char locality)
{
uu_avl_index_t idx = 0;
deleg_perm_node_t *found_node = NULL;
deleg_perm_t *deleg_perm = &node->dpn_perm;
deleg_perm_init(deleg_perm, who_type, name);
if ((found_node = uu_avl_find(avl, node, NULL, &idx))
== NULL)
uu_avl_insert(avl, node, idx);
else {
node = found_node;
deleg_perm = &node->dpn_perm;
}
switch (locality) {
case ZFS_DELEG_LOCAL:
deleg_perm->dp_local = B_TRUE;
break;
case ZFS_DELEG_DESCENDENT:
deleg_perm->dp_descend = B_TRUE;
break;
case ZFS_DELEG_NA:
break;
default:
assert(B_FALSE); /* invalid locality */
}
}
static inline int
parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
{
nvpair_t *nvp = NULL;
fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set;
uu_avl_t *avl = who_perm->who_deleg_perm_avl;
zfs_deleg_who_type_t who_type = who_perm->who_type;
while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
const char *name = nvpair_name(nvp);
data_type_t type = nvpair_type(nvp);
uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool;
deleg_perm_node_t *node =
safe_malloc(sizeof (deleg_perm_node_t));
assert(type == DATA_TYPE_BOOLEAN);
uu_avl_node_init(node, &node->dpn_avl_node, avl_pool);
set_deleg_perm_node(avl, node, who_type, name, locality);
}
return (0);
}
static inline int
parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
{
nvpair_t *nvp = NULL;
fs_perm_set_t *fspset = fsperm->fsp_set;
while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
nvlist_t *nvl2 = NULL;
const char *name = nvpair_name(nvp);
uu_avl_t *avl = NULL;
uu_avl_pool_t *avl_pool = NULL;
zfs_deleg_who_type_t perm_type = name[0];
char perm_locality = name[1];
const char *perm_name = name + 3;
boolean_t is_set = B_TRUE;
who_perm_t *who_perm = NULL;
assert('$' == name[2]);
if (nvpair_value_nvlist(nvp, &nvl2) != 0)
return (-1);
switch (perm_type) {
case ZFS_DELEG_CREATE:
case ZFS_DELEG_CREATE_SETS:
case ZFS_DELEG_NAMED_SET:
case ZFS_DELEG_NAMED_SET_SETS:
avl_pool = fspset->fsps_named_set_avl_pool;
avl = fsperm->fsp_sc_avl;
break;
case ZFS_DELEG_USER:
case ZFS_DELEG_USER_SETS:
case ZFS_DELEG_GROUP:
case ZFS_DELEG_GROUP_SETS:
case ZFS_DELEG_EVERYONE:
case ZFS_DELEG_EVERYONE_SETS:
avl_pool = fspset->fsps_who_perm_avl_pool;
avl = fsperm->fsp_uge_avl;
break;
default:
assert(!"unhandled zfs_deleg_who_type_t");
}
if (is_set) {
who_perm_node_t *found_node = NULL;
who_perm_node_t *node = safe_malloc(
sizeof (who_perm_node_t));
who_perm = &node->who_perm;
uu_avl_index_t idx = 0;
uu_avl_node_init(node, &node->who_avl_node, avl_pool);
who_perm_init(who_perm, fsperm, perm_type, perm_name);
if ((found_node = uu_avl_find(avl, node, NULL, &idx))
== NULL) {
if (avl == fsperm->fsp_uge_avl) {
uid_t rid = 0;
struct passwd *p = NULL;
struct group *g = NULL;
const char *nice_name = NULL;
switch (perm_type) {
case ZFS_DELEG_USER_SETS:
case ZFS_DELEG_USER:
rid = atoi(perm_name);
p = getpwuid(rid);
if (p)
nice_name = p->pw_name;
break;
case ZFS_DELEG_GROUP_SETS:
case ZFS_DELEG_GROUP:
rid = atoi(perm_name);
g = getgrgid(rid);
if (g)
nice_name = g->gr_name;
break;
default:
break;
}
if (nice_name != NULL)
(void) strlcpy(
node->who_perm.who_ug_name,
nice_name, 256);
}
uu_avl_insert(avl, node, idx);
} else {
node = found_node;
who_perm = &node->who_perm;
}
}
(void) parse_who_perm(who_perm, nvl2, perm_locality);
}
return (0);
}
static inline int
parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
{
nvpair_t *nvp = NULL;
uu_avl_index_t idx = 0;
while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
nvlist_t *nvl2 = NULL;
const char *fsname = nvpair_name(nvp);
data_type_t type = nvpair_type(nvp);
fs_perm_t *fsperm = NULL;
fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t));
if (node == NULL)
nomem();
fsperm = &node->fspn_fsperm;
assert(DATA_TYPE_NVLIST == type);
uu_list_node_init(node, &node->fspn_list_node,
fspset->fsps_list_pool);
idx = uu_list_numnodes(fspset->fsps_list);
fs_perm_init(fsperm, fspset, fsname);
if (nvpair_value_nvlist(nvp, &nvl2) != 0)
return (-1);
(void) parse_fs_perm(fsperm, nvl2);
uu_list_insert(fspset->fsps_list, node, idx);
}
return (0);
}
static inline const char *
deleg_perm_comment(zfs_deleg_note_t note)
{
const char *str = "";
/* subcommands */
switch (note) {
/* SUBCOMMANDS */
case ZFS_DELEG_NOTE_ALLOW:
str = gettext("Must also have the permission that is being"
"\n\t\t\t\tallowed");
break;
case ZFS_DELEG_NOTE_CLONE:
str = gettext("Must also have the 'create' ability and 'mount'"
"\n\t\t\t\tability in the origin file system");
break;
case ZFS_DELEG_NOTE_CREATE:
str = gettext("Must also have the 'mount' ability");
break;
case ZFS_DELEG_NOTE_DESTROY:
str = gettext("Must also have the 'mount' ability");
break;
case ZFS_DELEG_NOTE_DIFF:
str = gettext("Allows lookup of paths within a dataset;"
"\n\t\t\t\tgiven an object number. Ordinary users need this"
"\n\t\t\t\tin order to use zfs diff");
break;
case ZFS_DELEG_NOTE_HOLD:
str = gettext("Allows adding a user hold to a snapshot");
break;
case ZFS_DELEG_NOTE_MOUNT:
str = gettext("Allows mount/umount of ZFS datasets");
break;
case ZFS_DELEG_NOTE_PROMOTE:
str = gettext("Must also have the 'mount'\n\t\t\t\tand"
" 'promote' ability in the origin file system");
break;
case ZFS_DELEG_NOTE_RECEIVE:
str = gettext("Must also have the 'mount' and 'create'"
" ability");
break;
case ZFS_DELEG_NOTE_RELEASE:
str = gettext("Allows releasing a user hold which\n\t\t\t\t"
"might destroy the snapshot");
break;
case ZFS_DELEG_NOTE_RENAME:
str = gettext("Must also have the 'mount' and 'create'"
"\n\t\t\t\tability in the new parent");
break;
case ZFS_DELEG_NOTE_ROLLBACK:
str = gettext("");
break;
case ZFS_DELEG_NOTE_SEND:
str = gettext("");
break;
case ZFS_DELEG_NOTE_SHARE:
str = gettext("Allows sharing file systems over NFS or SMB"
"\n\t\t\t\tprotocols");
break;
case ZFS_DELEG_NOTE_SNAPSHOT:
str = gettext("");
break;
/*
* case ZFS_DELEG_NOTE_VSCAN:
* str = gettext("");
* break;
*/
/* OTHER */
case ZFS_DELEG_NOTE_GROUPQUOTA:
str = gettext("Allows accessing any groupquota@... property");
break;
case ZFS_DELEG_NOTE_GROUPUSED:
str = gettext("Allows reading any groupused@... property");
break;
case ZFS_DELEG_NOTE_USERPROP:
str = gettext("Allows changing any user property");
break;
case ZFS_DELEG_NOTE_USERQUOTA:
str = gettext("Allows accessing any userquota@... property");
break;
case ZFS_DELEG_NOTE_USERUSED:
str = gettext("Allows reading any userused@... property");
break;
/* other */
default:
str = "";
}
return (str);
}
struct allow_opts {
boolean_t local;
boolean_t descend;
boolean_t user;
boolean_t group;
boolean_t everyone;
boolean_t create;
boolean_t set;
boolean_t recursive; /* unallow only */
boolean_t prt_usage;
boolean_t prt_perms;
char *who;
char *perms;
const char *dataset;
};
static inline int
prop_cmp(const void *a, const void *b)
{
const char *str1 = *(const char **)a;
const char *str2 = *(const char **)b;
return (strcmp(str1, str2));
}
static void
allow_usage(boolean_t un, boolean_t requested, const char *msg)
{
const char *opt_desc[] = {
"-h", gettext("show this help message and exit"),
"-l", gettext("set permission locally"),
"-d", gettext("set permission for descents"),
"-u", gettext("set permission for user"),
"-g", gettext("set permission for group"),
"-e", gettext("set permission for everyone"),
"-c", gettext("set create time permission"),
"-s", gettext("define permission set"),
/* unallow only */
"-r", gettext("remove permissions recursively"),
};
size_t unallow_size = sizeof (opt_desc) / sizeof (char *);
size_t allow_size = unallow_size - 2;
const char *props[ZFS_NUM_PROPS];
int i;
size_t count = 0;
FILE *fp = requested ? stdout : stderr;
zprop_desc_t *pdtbl = zfs_prop_get_table();
const char *fmt = gettext("%-16s %-14s\t%s\n");
(void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW :
HELP_ALLOW));
(void) fprintf(fp, gettext("Options:\n"));
for (i = 0; i < (un ? unallow_size : allow_size); i++) {
const char *opt = opt_desc[i++];
const char *optdsc = opt_desc[i];
(void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc);
}
(void) fprintf(fp, gettext("\nThe following permissions are "
"supported:\n\n"));
(void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"),
gettext("NOTES"));
for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) {
const char *perm_name = zfs_deleg_perm_tbl[i].z_perm;
zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note;
const char *perm_type = deleg_perm_type(perm_note);
const char *perm_comment = deleg_perm_comment(perm_note);
(void) fprintf(fp, fmt, perm_name, perm_type, perm_comment);
}
for (i = 0; i < ZFS_NUM_PROPS; i++) {
zprop_desc_t *pd = &pdtbl[i];
if (pd->pd_visible != B_TRUE)
continue;
if (pd->pd_attr == PROP_READONLY)
continue;
props[count++] = pd->pd_name;
}
props[count] = NULL;
qsort(props, count, sizeof (char *), prop_cmp);
for (i = 0; i < count; i++)
(void) fprintf(fp, fmt, props[i], gettext("property"), "");
if (msg != NULL)
(void) fprintf(fp, gettext("\nzfs: error: %s"), msg);
exit(requested ? 0 : 2);
}
static inline const char *
munge_args(int argc, char **argv, boolean_t un, size_t expected_argc,
char **permsp)
{
if (un && argc == expected_argc - 1)
*permsp = NULL;
else if (argc == expected_argc)
*permsp = argv[argc - 2];
else
allow_usage(un, B_FALSE,
gettext("wrong number of parameters\n"));
return (argv[argc - 1]);
}
static void
parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts)
{
int uge_sum = opts->user + opts->group + opts->everyone;
int csuge_sum = opts->create + opts->set + uge_sum;
int ldcsuge_sum = csuge_sum + opts->local + opts->descend;
int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum;
if (uge_sum > 1)
allow_usage(un, B_FALSE,
gettext("-u, -g, and -e are mutually exclusive\n"));
if (opts->prt_usage) {
if (argc == 0 && all_sum == 0)
allow_usage(un, B_TRUE, NULL);
else
usage(B_FALSE);
}
if (opts->set) {
if (csuge_sum > 1)
allow_usage(un, B_FALSE,
gettext("invalid options combined with -s\n"));
opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
if (argv[0][0] != '@')
allow_usage(un, B_FALSE,
gettext("invalid set name: missing '@' prefix\n"));
opts->who = argv[0];
} else if (opts->create) {
if (ldcsuge_sum > 1)
allow_usage(un, B_FALSE,
gettext("invalid options combined with -c\n"));
opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
} else if (opts->everyone) {
if (csuge_sum > 1)
allow_usage(un, B_FALSE,
gettext("invalid options combined with -e\n"));
opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
} else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone")
== 0) {
opts->everyone = B_TRUE;
argc--;
argv++;
opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
} else if (argc == 1 && !un) {
opts->prt_perms = B_TRUE;
opts->dataset = argv[argc-1];
} else {
opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
opts->who = argv[0];
}
if (!opts->local && !opts->descend) {
opts->local = B_TRUE;
opts->descend = B_TRUE;
}
}
static void
store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend,
const char *who, char *perms, nvlist_t *top_nvl)
{
int i;
char ld[2] = { '\0', '\0' };
char who_buf[MAXNAMELEN + 32];
char base_type = '\0';
char set_type = '\0';
nvlist_t *base_nvl = NULL;
nvlist_t *set_nvl = NULL;
nvlist_t *nvl;
if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0)
nomem();
if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0)
nomem();
switch (type) {
case ZFS_DELEG_NAMED_SET_SETS:
case ZFS_DELEG_NAMED_SET:
set_type = ZFS_DELEG_NAMED_SET_SETS;
base_type = ZFS_DELEG_NAMED_SET;
ld[0] = ZFS_DELEG_NA;
break;
case ZFS_DELEG_CREATE_SETS:
case ZFS_DELEG_CREATE:
set_type = ZFS_DELEG_CREATE_SETS;
base_type = ZFS_DELEG_CREATE;
ld[0] = ZFS_DELEG_NA;
break;
case ZFS_DELEG_USER_SETS:
case ZFS_DELEG_USER:
set_type = ZFS_DELEG_USER_SETS;
base_type = ZFS_DELEG_USER;
if (local)
ld[0] = ZFS_DELEG_LOCAL;
if (descend)
ld[1] = ZFS_DELEG_DESCENDENT;
break;
case ZFS_DELEG_GROUP_SETS:
case ZFS_DELEG_GROUP:
set_type = ZFS_DELEG_GROUP_SETS;
base_type = ZFS_DELEG_GROUP;
if (local)
ld[0] = ZFS_DELEG_LOCAL;
if (descend)
ld[1] = ZFS_DELEG_DESCENDENT;
break;
case ZFS_DELEG_EVERYONE_SETS:
case ZFS_DELEG_EVERYONE:
set_type = ZFS_DELEG_EVERYONE_SETS;
base_type = ZFS_DELEG_EVERYONE;
if (local)
ld[0] = ZFS_DELEG_LOCAL;
if (descend)
ld[1] = ZFS_DELEG_DESCENDENT;
break;
default:
assert(set_type != '\0' && base_type != '\0');
}
if (perms != NULL) {
char *curr = perms;
char *end = curr + strlen(perms);
while (curr < end) {
char *delim = strchr(curr, ',');
if (delim == NULL)
delim = end;
else
*delim = '\0';
if (curr[0] == '@')
nvl = set_nvl;
else
nvl = base_nvl;
(void) nvlist_add_boolean(nvl, curr);
if (delim != end)
*delim = ',';
curr = delim + 1;
}
for (i = 0; i < 2; i++) {
char locality = ld[i];
if (locality == 0)
continue;
if (!nvlist_empty(base_nvl)) {
if (who != NULL)
(void) snprintf(who_buf,
sizeof (who_buf), "%c%c$%s",
base_type, locality, who);
else
(void) snprintf(who_buf,
sizeof (who_buf), "%c%c$",
base_type, locality);
(void) nvlist_add_nvlist(top_nvl, who_buf,
base_nvl);
}
if (!nvlist_empty(set_nvl)) {
if (who != NULL)
(void) snprintf(who_buf,
sizeof (who_buf), "%c%c$%s",
set_type, locality, who);
else
(void) snprintf(who_buf,
sizeof (who_buf), "%c%c$",
set_type, locality);
(void) nvlist_add_nvlist(top_nvl, who_buf,
set_nvl);
}
}
} else {
for (i = 0; i < 2; i++) {
char locality = ld[i];
if (locality == 0)
continue;
if (who != NULL)
(void) snprintf(who_buf, sizeof (who_buf),
"%c%c$%s", base_type, locality, who);
else
(void) snprintf(who_buf, sizeof (who_buf),
"%c%c$", base_type, locality);
(void) nvlist_add_boolean(top_nvl, who_buf);
if (who != NULL)
(void) snprintf(who_buf, sizeof (who_buf),
"%c%c$%s", set_type, locality, who);
else
(void) snprintf(who_buf, sizeof (who_buf),
"%c%c$", set_type, locality);
(void) nvlist_add_boolean(top_nvl, who_buf);
}
}
}
static int
construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
{
if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0)
nomem();
if (opts->set) {
store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local,
opts->descend, opts->who, opts->perms, *nvlp);
} else if (opts->create) {
store_allow_perm(ZFS_DELEG_CREATE, opts->local,
opts->descend, NULL, opts->perms, *nvlp);
} else if (opts->everyone) {
store_allow_perm(ZFS_DELEG_EVERYONE, opts->local,
opts->descend, NULL, opts->perms, *nvlp);
} else {
char *curr = opts->who;
char *end = curr + strlen(curr);
while (curr < end) {
const char *who;
zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN;
char *endch;
char *delim = strchr(curr, ',');
char errbuf[256];
char id[64];
struct passwd *p = NULL;
struct group *g = NULL;
uid_t rid;
if (delim == NULL)
delim = end;
else
*delim = '\0';
rid = (uid_t)strtol(curr, &endch, 0);
if (opts->user) {
who_type = ZFS_DELEG_USER;
if (*endch != '\0')
p = getpwnam(curr);
else
p = getpwuid(rid);
if (p != NULL)
rid = p->pw_uid;
else {
(void) snprintf(errbuf, 256, gettext(
"invalid user %s"), curr);
allow_usage(un, B_TRUE, errbuf);
}
} else if (opts->group) {
who_type = ZFS_DELEG_GROUP;
if (*endch != '\0')
g = getgrnam(curr);
else
g = getgrgid(rid);
if (g != NULL)
rid = g->gr_gid;
else {
(void) snprintf(errbuf, 256, gettext(
"invalid group %s"), curr);
allow_usage(un, B_TRUE, errbuf);
}
} else {
if (*endch != '\0') {
p = getpwnam(curr);
} else {
p = getpwuid(rid);
}
if (p == NULL) {
if (*endch != '\0') {
g = getgrnam(curr);
} else {
g = getgrgid(rid);
}
}
if (p != NULL) {
who_type = ZFS_DELEG_USER;
rid = p->pw_uid;
} else if (g != NULL) {
who_type = ZFS_DELEG_GROUP;
rid = g->gr_gid;
} else {
(void) snprintf(errbuf, 256, gettext(
"invalid user/group %s"), curr);
allow_usage(un, B_TRUE, errbuf);
}
}
(void) sprintf(id, "%u", rid);
who = id;
store_allow_perm(who_type, opts->local,
opts->descend, who, opts->perms, *nvlp);
curr = delim + 1;
}
}
return (0);
}
static void
print_set_creat_perms(uu_avl_t *who_avl)
{
const char *sc_title[] = {
gettext("Permission sets:\n"),
gettext("Create time permissions:\n"),
NULL
};
const char **title_ptr = sc_title;
who_perm_node_t *who_node = NULL;
int prev_weight = -1;
for (who_node = uu_avl_first(who_avl); who_node != NULL;
who_node = uu_avl_next(who_avl, who_node)) {
uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
const char *who_name = who_node->who_perm.who_name;
int weight = who_type2weight(who_type);
boolean_t first = B_TRUE;
deleg_perm_node_t *deleg_node;
if (prev_weight != weight) {
(void) printf(*title_ptr++);
prev_weight = weight;
}
if (who_name == NULL || strnlen(who_name, 1) == 0)
(void) printf("\t");
else
(void) printf("\t%s ", who_name);
for (deleg_node = uu_avl_first(avl); deleg_node != NULL;
deleg_node = uu_avl_next(avl, deleg_node)) {
if (first) {
(void) printf("%s",
deleg_node->dpn_perm.dp_name);
first = B_FALSE;
} else
(void) printf(",%s",
deleg_node->dpn_perm.dp_name);
}
(void) printf("\n");
}
}
static void
print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend,
const char *title)
{
who_perm_node_t *who_node = NULL;
boolean_t prt_title = B_TRUE;
uu_avl_walk_t *walk;
if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL)
nomem();
while ((who_node = uu_avl_walk_next(walk)) != NULL) {
const char *who_name = who_node->who_perm.who_name;
const char *nice_who_name = who_node->who_perm.who_ug_name;
uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
char delim = ' ';
deleg_perm_node_t *deleg_node;
boolean_t prt_who = B_TRUE;
for (deleg_node = uu_avl_first(avl);
deleg_node != NULL;
deleg_node = uu_avl_next(avl, deleg_node)) {
if (local != deleg_node->dpn_perm.dp_local ||
descend != deleg_node->dpn_perm.dp_descend)
continue;
if (prt_who) {
const char *who = NULL;
if (prt_title) {
prt_title = B_FALSE;
(void) printf(title);
}
switch (who_type) {
case ZFS_DELEG_USER_SETS:
case ZFS_DELEG_USER:
who = gettext("user");
if (nice_who_name)
who_name = nice_who_name;
break;
case ZFS_DELEG_GROUP_SETS:
case ZFS_DELEG_GROUP:
who = gettext("group");
if (nice_who_name)
who_name = nice_who_name;
break;
case ZFS_DELEG_EVERYONE_SETS:
case ZFS_DELEG_EVERYONE:
who = gettext("everyone");
who_name = NULL;
break;
default:
assert(who != NULL);
}
prt_who = B_FALSE;
if (who_name == NULL)
(void) printf("\t%s", who);
else
(void) printf("\t%s %s", who, who_name);
}
(void) printf("%c%s", delim,
deleg_node->dpn_perm.dp_name);
delim = ',';
}
if (!prt_who)
(void) printf("\n");
}
uu_avl_walk_end(walk);
}
static void
print_fs_perms(fs_perm_set_t *fspset)
{
fs_perm_node_t *node = NULL;
char buf[MAXNAMELEN + 32];
const char *dsname = buf;
for (node = uu_list_first(fspset->fsps_list); node != NULL;
node = uu_list_next(fspset->fsps_list, node)) {
uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl;
uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl;
int left = 0;
(void) snprintf(buf, sizeof (buf),
gettext("---- Permissions on %s "),
node->fspn_fsperm.fsp_name);
(void) printf(dsname);
left = 70 - strlen(buf);
while (left-- > 0)
(void) printf("-");
(void) printf("\n");
print_set_creat_perms(sc_avl);
print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE,
gettext("Local permissions:\n"));
print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE,
gettext("Descendent permissions:\n"));
print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE,
gettext("Local+Descendent permissions:\n"));
}
}
static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL };
struct deleg_perms {
boolean_t un;
nvlist_t *nvl;
};
static int
set_deleg_perms(zfs_handle_t *zhp, void *data)
{
struct deleg_perms *perms = (struct deleg_perms *)data;
zfs_type_t zfs_type = zfs_get_type(zhp);
if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME)
return (0);
return (zfs_set_fsacl(zhp, perms->un, perms->nvl));
}
static int
zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un)
{
zfs_handle_t *zhp;
nvlist_t *perm_nvl = NULL;
nvlist_t *update_perm_nvl = NULL;
int error = 1;
int c;
struct allow_opts opts = { 0 };
const char *optstr = un ? "ldugecsrh" : "ldugecsh";
/* check opts */
while ((c = getopt(argc, argv, optstr)) != -1) {
switch (c) {
case 'l':
opts.local = B_TRUE;
break;
case 'd':
opts.descend = B_TRUE;
break;
case 'u':
opts.user = B_TRUE;
break;
case 'g':
opts.group = B_TRUE;
break;
case 'e':
opts.everyone = B_TRUE;
break;
case 's':
opts.set = B_TRUE;
break;
case 'c':
opts.create = B_TRUE;
break;
case 'r':
opts.recursive = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case 'h':
opts.prt_usage = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check arguments */
parse_allow_args(argc, argv, un, &opts);
/* try to open the dataset */
if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_VOLUME)) == NULL) {
(void) fprintf(stderr, "Failed to open dataset: %s\n",
opts.dataset);
return (-1);
}
if (zfs_get_fsacl(zhp, &perm_nvl) != 0)
goto cleanup2;
fs_perm_set_init(&fs_perm_set);
if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) {
(void) fprintf(stderr, "Failed to parse fsacl permissions\n");
goto cleanup1;
}
if (opts.prt_perms)
print_fs_perms(&fs_perm_set);
else {
(void) construct_fsacl_list(un, &opts, &update_perm_nvl);
if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0)
goto cleanup0;
if (un && opts.recursive) {
struct deleg_perms data = { un, update_perm_nvl };
if (zfs_iter_filesystems(zhp, set_deleg_perms,
&data) != 0)
goto cleanup0;
}
}
error = 0;
cleanup0:
nvlist_free(perm_nvl);
nvlist_free(update_perm_nvl);
cleanup1:
fs_perm_set_fini(&fs_perm_set);
cleanup2:
zfs_close(zhp);
return (error);
}
static int
zfs_do_allow(int argc, char **argv)
{
return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
}
static int
zfs_do_unallow(int argc, char **argv)
{
return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE));
}
static int
zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
{
int errors = 0;
int i;
const char *tag;
boolean_t recursive = B_FALSE;
const char *opts = holding ? "rt" : "r";
int c;
/* check options */
while ((c = getopt(argc, argv, opts)) != -1) {
switch (c) {
case 'r':
recursive = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 2)
usage(B_FALSE);
tag = argv[0];
--argc;
++argv;
if (holding && tag[0] == '.') {
/* tags starting with '.' are reserved for libzfs */
(void) fprintf(stderr, gettext("tag may not start with '.'\n"));
usage(B_FALSE);
}
for (i = 0; i < argc; ++i) {
zfs_handle_t *zhp;
char parent[ZFS_MAX_DATASET_NAME_LEN];
const char *delim;
char *path = argv[i];
delim = strchr(path, '@');
if (delim == NULL) {
(void) fprintf(stderr,
gettext("'%s' is not a snapshot\n"), path);
++errors;
continue;
}
(void) strncpy(parent, path, delim - path);
parent[delim - path] = '\0';
zhp = zfs_open(g_zfs, parent,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL) {
++errors;
continue;
}
if (holding) {
if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0)
++errors;
} else {
if (zfs_release(zhp, delim+1, tag, recursive) != 0)
++errors;
}
zfs_close(zhp);
}
return (errors != 0);
}
/*
* zfs hold [-r] [-t] <tag> <snap> ...
*
* -r Recursively hold
*
* Apply a user-hold with the given tag to the list of snapshots.
*/
static int
zfs_do_hold(int argc, char **argv)
{
return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
}
/*
* zfs release [-r] <tag> <snap> ...
*
* -r Recursively release
*
* Release a user-hold with the given tag from the list of snapshots.
*/
static int
zfs_do_release(int argc, char **argv)
{
return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
}
typedef struct holds_cbdata {
boolean_t cb_recursive;
const char *cb_snapname;
nvlist_t **cb_nvlp;
size_t cb_max_namelen;
size_t cb_max_taglen;
} holds_cbdata_t;
#define STRFTIME_FMT_STR "%a %b %e %k:%M %Y"
#define DATETIME_BUF_LEN (32)
/*
*
*/
static void
print_holds(boolean_t scripted, boolean_t literal, size_t nwidth,
size_t tagwidth, nvlist_t *nvl)
{
int i;
nvpair_t *nvp = NULL;
char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" };
const char *col;
if (!scripted) {
for (i = 0; i < 3; i++) {
col = gettext(hdr_cols[i]);
if (i < 2)
(void) printf("%-*s ", i ? tagwidth : nwidth,
col);
else
(void) printf("%s\n", col);
}
}
while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
char *zname = nvpair_name(nvp);
nvlist_t *nvl2;
nvpair_t *nvp2 = NULL;
(void) nvpair_value_nvlist(nvp, &nvl2);
while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) {
char tsbuf[DATETIME_BUF_LEN];
char *tagname = nvpair_name(nvp2);
uint64_t val = 0;
time_t time;
struct tm t;
(void) nvpair_value_uint64(nvp2, &val);
if (literal)
snprintf(tsbuf, DATETIME_BUF_LEN, "%llu", val);
else {
time = (time_t)val;
(void) localtime_r(&time, &t);
(void) strftime(tsbuf, DATETIME_BUF_LEN,
gettext(STRFTIME_FMT_STR), &t);
}
if (scripted) {
(void) printf("%s\t%s\t%s\n", zname,
tagname, tsbuf);
} else {
(void) printf("%-*s %-*s %s\n", nwidth,
zname, tagwidth, tagname, tsbuf);
}
}
}
}
/*
* Generic callback function to list a dataset or snapshot.
*/
static int
holds_callback(zfs_handle_t *zhp, void *data)
{
holds_cbdata_t *cbp = data;
nvlist_t *top_nvl = *cbp->cb_nvlp;
nvlist_t *nvl = NULL;
nvpair_t *nvp = NULL;
const char *zname = zfs_get_name(zhp);
size_t znamelen = strlen(zname);
if (cbp->cb_recursive && cbp->cb_snapname != NULL) {
const char *snapname;
char *delim = strchr(zname, '@');
if (delim == NULL)
return (0);
snapname = delim + 1;
if (strcmp(cbp->cb_snapname, snapname))
return (0);
}
if (zfs_get_holds(zhp, &nvl) != 0)
return (-1);
if (znamelen > cbp->cb_max_namelen)
cbp->cb_max_namelen = znamelen;
while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
const char *tag = nvpair_name(nvp);
size_t taglen = strlen(tag);
if (taglen > cbp->cb_max_taglen)
cbp->cb_max_taglen = taglen;
}
return (nvlist_add_nvlist(top_nvl, zname, nvl));
}
/*
* zfs holds [-Hp] [-r | -d max] <dataset|snap> ...
*
* -H Suppress header output
* -p Output literal values
* -r Recursively search for holds
* -d max Limit depth of recursive search
*/
static int
zfs_do_holds(int argc, char **argv)
{
int errors = 0;
int c;
int i;
boolean_t scripted = B_FALSE;
boolean_t literal = B_FALSE;
boolean_t recursive = B_FALSE;
const char *opts = "d:rHp";
nvlist_t *nvl;
int types = ZFS_TYPE_SNAPSHOT;
holds_cbdata_t cb = { 0 };
int limit = 0;
int ret = 0;
int flags = 0;
/* check options */
while ((c = getopt(argc, argv, opts)) != -1) {
switch (c) {
case 'd':
limit = parse_depth(optarg, &flags);
recursive = B_TRUE;
break;
case 'r':
recursive = B_TRUE;
break;
case 'H':
scripted = B_TRUE;
break;
case 'p':
literal = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
if (recursive) {
types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
flags |= ZFS_ITER_RECURSE;
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 1)
usage(B_FALSE);
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
nomem();
for (i = 0; i < argc; ++i) {
char *snapshot = argv[i];
const char *delim;
const char *snapname = NULL;
delim = strchr(snapshot, '@');
if (delim != NULL) {
snapname = delim + 1;
if (recursive)
snapshot[delim - snapshot] = '\0';
}
cb.cb_recursive = recursive;
cb.cb_snapname = snapname;
cb.cb_nvlp = &nvl;
/*
* 1. collect holds data, set format options
*/
ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit,
holds_callback, &cb);
if (ret != 0)
++errors;
}
/*
* 2. print holds data
*/
print_holds(scripted, literal, cb.cb_max_namelen, cb.cb_max_taglen,
nvl);
if (nvlist_empty(nvl))
(void) printf(gettext("no datasets available\n"));
nvlist_free(nvl);
return (0 != errors);
}
#define CHECK_SPINNER 30
#define SPINNER_TIME 3 /* seconds */
#define MOUNT_TIME 5 /* seconds */
static int
get_one_dataset(zfs_handle_t *zhp, void *data)
{
static char *spin[] = { "-", "\\", "|", "/" };
static int spinval = 0;
static int spincheck = 0;
static time_t last_spin_time = (time_t)0;
get_all_cb_t *cbp = data;
zfs_type_t type = zfs_get_type(zhp);
if (cbp->cb_verbose) {
if (--spincheck < 0) {
time_t now = time(NULL);
if (last_spin_time + SPINNER_TIME < now) {
update_progress(spin[spinval++ % 4]);
last_spin_time = now;
}
spincheck = CHECK_SPINNER;
}
}
/*
* Interate over any nested datasets.
*/
if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
zfs_close(zhp);
return (1);
}
/*
* Skip any datasets whose type does not match.
*/
if ((type & ZFS_TYPE_FILESYSTEM) == 0) {
zfs_close(zhp);
return (0);
}
libzfs_add_handle(cbp, zhp);
assert(cbp->cb_used <= cbp->cb_alloc);
return (0);
}
static void
get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
{
get_all_cb_t cb = { 0 };
cb.cb_verbose = verbose;
cb.cb_getone = get_one_dataset;
if (verbose)
set_progress_header(gettext("Reading ZFS config"));
(void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
*dslist = cb.cb_handles;
*count = cb.cb_used;
if (verbose)
finish_progress(gettext("done."));
}
/*
* Generic callback for sharing or mounting filesystems. Because the code is so
* similar, we have a common function with an extra parameter to determine which
* mode we are using.
*/
#define OP_SHARE 0x1
#define OP_MOUNT 0x2
/*
* Share or mount a dataset.
*/
static int
share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
boolean_t explicit, const char *options)
{
char mountpoint[ZFS_MAXPROPLEN];
char shareopts[ZFS_MAXPROPLEN];
char smbshareopts[ZFS_MAXPROPLEN];
const char *cmdname = op == OP_SHARE ? "share" : "mount";
struct mnttab mnt;
uint64_t zoned, canmount;
boolean_t shared_nfs, shared_smb;
assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);
/*
* Check to make sure we can mount/share this dataset. If we
* are in the global zone and the filesystem is exported to a
* local zone, or if we are in a local zone and the
* filesystem is not exported, then it is an error.
*/
zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
if (zoned && getzoneid() == GLOBAL_ZONEID) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot %s '%s': "
"dataset is exported to a local zone\n"), cmdname,
zfs_get_name(zhp));
return (1);
} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot %s '%s': "
"permission denied\n"), cmdname,
zfs_get_name(zhp));
return (1);
}
/*
* Ignore any filesystems which don't apply to us. This
* includes those with a legacy mountpoint, or those with
* legacy share options.
*/
verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
strcmp(smbshareopts, "off") == 0) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot share '%s': "
"legacy share\n"), zfs_get_name(zhp));
(void) fprintf(stderr, gettext("to "
"share this filesystem set "
"sharenfs property on\n"));
return (1);
}
/*
* We cannot share or mount legacy filesystems. If the
* shareopts is non-legacy but the mountpoint is legacy, we
* treat it as a legacy share.
*/
if (strcmp(mountpoint, "legacy") == 0) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot %s '%s': "
"legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
(void) fprintf(stderr, gettext("use %s(8) to "
"%s this filesystem\n"), cmdname, cmdname);
return (1);
}
if (strcmp(mountpoint, "none") == 0) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot %s '%s': no "
"mountpoint set\n"), cmdname, zfs_get_name(zhp));
return (1);
}
/*
* canmount explicit outcome
* on no pass through
* on yes pass through
* off no return 0
* off yes display error, return 1
* noauto no return 0
* noauto yes pass through
*/
canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
if (canmount == ZFS_CANMOUNT_OFF) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot %s '%s': "
"'canmount' property is set to 'off'\n"), cmdname,
zfs_get_name(zhp));
return (1);
} else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
return (0);
}
/*
* If this filesystem is inconsistent and has a receive resume
* token, we can not mount it.
*/
if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot %s '%s': "
"Contains partially-completed state from "
"\"zfs receive -r\", which can be resumed with "
"\"zfs send -t\"\n"),
cmdname, zfs_get_name(zhp));
return (1);
}
/*
* At this point, we have verified that the mountpoint and/or
* shareopts are appropriate for auto management. If the
* filesystem is already mounted or shared, return (failing
* for explicit requests); otherwise mount or share the
* filesystem.
*/
switch (op) {
case OP_SHARE:
shared_nfs = zfs_is_shared_nfs(zhp, NULL);
shared_smb = zfs_is_shared_smb(zhp, NULL);
if ((shared_nfs && shared_smb) ||
(shared_nfs && strcmp(shareopts, "on") == 0 &&
strcmp(smbshareopts, "off") == 0) ||
(shared_smb && strcmp(smbshareopts, "on") == 0 &&
strcmp(shareopts, "off") == 0)) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot share "
"'%s': filesystem already shared\n"),
zfs_get_name(zhp));
return (1);
}
if (!zfs_is_mounted(zhp, NULL) &&
zfs_mount(zhp, NULL, 0) != 0)
return (1);
if (protocol == NULL) {
if (zfs_shareall(zhp) != 0)
return (1);
} else if (strcmp(protocol, "nfs") == 0) {
if (zfs_share_nfs(zhp))
return (1);
} else if (strcmp(protocol, "smb") == 0) {
if (zfs_share_smb(zhp))
return (1);
} else {
(void) fprintf(stderr, gettext("cannot share "
"'%s': invalid share type '%s' "
"specified\n"),
zfs_get_name(zhp), protocol);
return (1);
}
break;
case OP_MOUNT:
if (options == NULL)
mnt.mnt_mntopts = "";
else
mnt.mnt_mntopts = (char *)options;
if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
zfs_is_mounted(zhp, NULL)) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot mount "
"'%s': filesystem already mounted\n"),
zfs_get_name(zhp));
return (1);
}
if (zfs_mount(zhp, options, flags) != 0)
return (1);
break;
}
return (0);
}
/*
* Reports progress in the form "(current/total)". Not thread-safe.
*/
static void
report_mount_progress(int current, int total)
{
static time_t last_progress_time = 0;
time_t now = time(NULL);
char info[32];
/* report 1..n instead of 0..n-1 */
++current;
/* display header if we're here for the first time */
if (current == 1) {
set_progress_header(gettext("Mounting ZFS filesystems"));
} else if (current != total && last_progress_time + MOUNT_TIME >= now) {
/* too soon to report again */
return;
}
last_progress_time = now;
(void) sprintf(info, "(%d/%d)", current, total);
if (current == total)
finish_progress(info);
else
update_progress(info);
}
static void
append_options(char *mntopts, char *newopts)
{
int len = strlen(mntopts);
/* original length plus new string to append plus 1 for the comma */
if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) {
(void) fprintf(stderr, gettext("the opts argument for "
"'%c' option is too long (more than %d chars)\n"),
"-o", MNT_LINE_MAX);
usage(B_FALSE);
}
if (*mntopts)
mntopts[len++] = ',';
(void) strcpy(&mntopts[len], newopts);
}
static int
share_mount(int op, int argc, char **argv)
{
int do_all = 0;
boolean_t verbose = B_FALSE;
int c, ret = 0;
char *options = NULL;
int flags = 0;
/* check options */
while ((c = getopt(argc, argv, op == OP_MOUNT ? ":avo:O" : "a"))
!= -1) {
switch (c) {
case 'a':
do_all = 1;
break;
case 'v':
verbose = B_TRUE;
break;
case 'o':
if (*optarg == '\0') {
(void) fprintf(stderr, gettext("empty mount "
"options (-o) specified\n"));
usage(B_FALSE);
}
if (options == NULL)
options = safe_malloc(MNT_LINE_MAX + 1);
/* option validation is done later */
append_options(options, optarg);
break;
case 'O':
warnx("no overlay mounts support on FreeBSD, ignoring");
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (do_all) {
zfs_handle_t **dslist = NULL;
size_t i, count = 0;
char *protocol = NULL;
if (op == OP_SHARE && argc > 0) {
if (strcmp(argv[0], "nfs") != 0 &&
strcmp(argv[0], "smb") != 0) {
(void) fprintf(stderr, gettext("share type "
"must be 'nfs' or 'smb'\n"));
usage(B_FALSE);
}
protocol = argv[0];
argc--;
argv++;
}
if (argc != 0) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
start_progress_timer();
get_all_datasets(&dslist, &count, verbose);
if (count == 0)
return (0);
qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
for (i = 0; i < count; i++) {
if (verbose)
report_mount_progress(i, count);
if (share_mount_one(dslist[i], op, flags, protocol,
B_FALSE, options) != 0)
ret = 1;
zfs_close(dslist[i]);
}
free(dslist);
} else if (argc == 0) {
struct mnttab entry;
if ((op == OP_SHARE) || (options != NULL)) {
(void) fprintf(stderr, gettext("missing filesystem "
"argument (specify -a for all)\n"));
usage(B_FALSE);
}
/*
* When mount is given no arguments, go through /etc/mnttab and
* display any active ZFS mounts. We hide any snapshots, since
* they are controlled automatically.
*/
rewind(mnttab_file);
while (getmntent(mnttab_file, &entry) == 0) {
if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 ||
strchr(entry.mnt_special, '@') != NULL)
continue;
(void) printf("%-30s %s\n", entry.mnt_special,
entry.mnt_mountp);
}
} else {
zfs_handle_t *zhp;
if (argc > 1) {
(void) fprintf(stderr,
gettext("too many arguments\n"));
usage(B_FALSE);
}
if ((zhp = zfs_open(g_zfs, argv[0],
ZFS_TYPE_FILESYSTEM)) == NULL) {
ret = 1;
} else {
ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
options);
zfs_close(zhp);
}
}
return (ret);
}
/*
* zfs mount -a [nfs]
* zfs mount filesystem
*
* Mount all filesystems, or mount the given filesystem.
*/
static int
zfs_do_mount(int argc, char **argv)
{
return (share_mount(OP_MOUNT, argc, argv));
}
/*
* zfs share -a [nfs | smb]
* zfs share filesystem
*
* Share all filesystems, or share the given filesystem.
*/
static int
zfs_do_share(int argc, char **argv)
{
return (share_mount(OP_SHARE, argc, argv));
}
typedef struct unshare_unmount_node {
zfs_handle_t *un_zhp;
char *un_mountp;
uu_avl_node_t un_avlnode;
} unshare_unmount_node_t;
/* ARGSUSED */
static int
unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
{
const unshare_unmount_node_t *l = larg;
const unshare_unmount_node_t *r = rarg;
return (strcmp(l->un_mountp, r->un_mountp));
}
/*
* Convenience routine used by zfs_do_umount() and manual_unmount(). Given an
* absolute path, find the entry /etc/mnttab, verify that its a ZFS filesystem,
* and unmount it appropriately.
*/
static int
unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
{
zfs_handle_t *zhp;
int ret = 0;
struct stat64 statbuf;
struct extmnttab entry;
const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
ino_t path_inode;
/*
* Search for the path in /etc/mnttab. Rather than looking for the
* specific path, which can be fooled by non-standard paths (i.e. ".."
* or "//"), we stat() the path and search for the corresponding
* (major,minor) device pair.
*/
if (stat64(path, &statbuf) != 0) {
(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
cmdname, path, strerror(errno));
return (1);
}
path_inode = statbuf.st_ino;
/*
* Search for the given (major,minor) pair in the mount table.
*/
#ifdef illumos
rewind(mnttab_file);
while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) {
if (entry.mnt_major == major(statbuf.st_dev) &&
entry.mnt_minor == minor(statbuf.st_dev))
break;
}
#else
{
struct statfs sfs;
if (statfs(path, &sfs) != 0) {
(void) fprintf(stderr, "%s: %s\n", path,
strerror(errno));
ret = -1;
}
statfs2mnttab(&sfs, &entry);
}
#endif
if (ret != 0) {
if (op == OP_SHARE) {
(void) fprintf(stderr, gettext("cannot %s '%s': not "
"currently mounted\n"), cmdname, path);
return (1);
}
(void) fprintf(stderr, gettext("warning: %s not in mnttab\n"),
path);
if ((ret = umount2(path, flags)) != 0)
(void) fprintf(stderr, gettext("%s: %s\n"), path,
strerror(errno));
return (ret != 0);
}
if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
(void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
"filesystem\n"), cmdname, path);
return (1);
}
if ((zhp = zfs_open(g_zfs, entry.mnt_special,
ZFS_TYPE_FILESYSTEM)) == NULL)
return (1);
ret = 1;
if (stat64(entry.mnt_mountp, &statbuf) != 0) {
(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
cmdname, path, strerror(errno));
goto out;
} else if (statbuf.st_ino != path_inode) {
(void) fprintf(stderr, gettext("cannot "
"%s '%s': not a mountpoint\n"), cmdname, path);
goto out;
}
if (op == OP_SHARE) {
char nfs_mnt_prop[ZFS_MAXPROPLEN];
char smbshare_prop[ZFS_MAXPROPLEN];
verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop,
sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0);
verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop,
sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0);
if (strcmp(nfs_mnt_prop, "off") == 0 &&
strcmp(smbshare_prop, "off") == 0) {
(void) fprintf(stderr, gettext("cannot unshare "
"'%s': legacy share\n"), path);
#ifdef illumos
(void) fprintf(stderr, gettext("use "
"unshare(1M) to unshare this filesystem\n"));
#endif
} else if (!zfs_is_shared(zhp)) {
(void) fprintf(stderr, gettext("cannot unshare '%s': "
"not currently shared\n"), path);
} else {
ret = zfs_unshareall_bypath(zhp, path);
}
} else {
char mtpt_prop[ZFS_MAXPROPLEN];
verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop,
sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0);
if (is_manual) {
ret = zfs_unmount(zhp, NULL, flags);
} else if (strcmp(mtpt_prop, "legacy") == 0) {
(void) fprintf(stderr, gettext("cannot unmount "
"'%s': legacy mountpoint\n"),
zfs_get_name(zhp));
(void) fprintf(stderr, gettext("use umount(8) "
"to unmount this filesystem\n"));
} else {
ret = zfs_unmountall(zhp, flags);
}
}
out:
zfs_close(zhp);
return (ret != 0);
}
/*
* Generic callback for unsharing or unmounting a filesystem.
*/
static int
unshare_unmount(int op, int argc, char **argv)
{
int do_all = 0;
int flags = 0;
int ret = 0;
int c;
zfs_handle_t *zhp;
char nfs_mnt_prop[ZFS_MAXPROPLEN];
char sharesmb[ZFS_MAXPROPLEN];
/* check options */
while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) {
switch (c) {
case 'a':
do_all = 1;
break;
case 'f':
flags = MS_FORCE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (do_all) {
/*
* We could make use of zfs_for_each() to walk all datasets in
* the system, but this would be very inefficient, especially
* since we would have to linearly search /etc/mnttab for each
* one. Instead, do one pass through /etc/mnttab looking for
* zfs entries and call zfs_unmount() for each one.
*
* Things get a little tricky if the administrator has created
* mountpoints beneath other ZFS filesystems. In this case, we
* have to unmount the deepest filesystems first. To accomplish
* this, we place all the mountpoints in an AVL tree sorted by
* the special type (dataset name), and walk the result in
* reverse to make sure to get any snapshots first.
*/
struct mnttab entry;
uu_avl_pool_t *pool;
uu_avl_t *tree = NULL;
unshare_unmount_node_t *node;
uu_avl_index_t idx;
uu_avl_walk_t *walk;
if (argc != 0) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
if (((pool = uu_avl_pool_create("unmount_pool",
sizeof (unshare_unmount_node_t),
offsetof(unshare_unmount_node_t, un_avlnode),
unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
nomem();
rewind(mnttab_file);
while (getmntent(mnttab_file, &entry) == 0) {
/* ignore non-ZFS entries */
if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
continue;
/* ignore snapshots */
if (strchr(entry.mnt_special, '@') != NULL)
continue;
if ((zhp = zfs_open(g_zfs, entry.mnt_special,
ZFS_TYPE_FILESYSTEM)) == NULL) {
ret = 1;
continue;
}
/*
* Ignore datasets that are excluded/restricted by
* parent pool name.
*/
if (zpool_skip_pool(zfs_get_pool_name(zhp))) {
zfs_close(zhp);
continue;
}
switch (op) {
case OP_SHARE:
verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
nfs_mnt_prop,
sizeof (nfs_mnt_prop),
NULL, NULL, 0, B_FALSE) == 0);
if (strcmp(nfs_mnt_prop, "off") != 0)
break;
verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
nfs_mnt_prop,
sizeof (nfs_mnt_prop),
NULL, NULL, 0, B_FALSE) == 0);
if (strcmp(nfs_mnt_prop, "off") == 0)
continue;
break;
case OP_MOUNT:
/* Ignore legacy mounts */
verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
nfs_mnt_prop,
sizeof (nfs_mnt_prop),
NULL, NULL, 0, B_FALSE) == 0);
if (strcmp(nfs_mnt_prop, "legacy") == 0)
continue;
/* Ignore canmount=noauto mounts */
if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
ZFS_CANMOUNT_NOAUTO)
continue;
default:
break;
}
node = safe_malloc(sizeof (unshare_unmount_node_t));
node->un_zhp = zhp;
node->un_mountp = safe_strdup(entry.mnt_mountp);
uu_avl_node_init(node, &node->un_avlnode, pool);
if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
uu_avl_insert(tree, node, idx);
} else {
zfs_close(node->un_zhp);
free(node->un_mountp);
free(node);
}
}
/*
* Walk the AVL tree in reverse, unmounting each filesystem and
* removing it from the AVL tree in the process.
*/
if ((walk = uu_avl_walk_start(tree,
UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
nomem();
while ((node = uu_avl_walk_next(walk)) != NULL) {
uu_avl_remove(tree, node);
switch (op) {
case OP_SHARE:
if (zfs_unshareall_bypath(node->un_zhp,
node->un_mountp) != 0)
ret = 1;
break;
case OP_MOUNT:
if (zfs_unmount(node->un_zhp,
node->un_mountp, flags) != 0)
ret = 1;
break;
}
zfs_close(node->un_zhp);
free(node->un_mountp);
free(node);
}
uu_avl_walk_end(walk);
uu_avl_destroy(tree);
uu_avl_pool_destroy(pool);
} else {
if (argc != 1) {
if (argc == 0)
(void) fprintf(stderr,
gettext("missing filesystem argument\n"));
else
(void) fprintf(stderr,
gettext("too many arguments\n"));
usage(B_FALSE);
}
/*
* We have an argument, but it may be a full path or a ZFS
* filesystem. Pass full paths off to unmount_path() (shared by
* manual_unmount), otherwise open the filesystem and pass to
* zfs_unmount().
*/
if (argv[0][0] == '/')
return (unshare_unmount_path(op, argv[0],
flags, B_FALSE));
if ((zhp = zfs_open(g_zfs, argv[0],
ZFS_TYPE_FILESYSTEM)) == NULL)
return (1);
verify(zfs_prop_get(zhp, op == OP_SHARE ?
ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
NULL, 0, B_FALSE) == 0);
switch (op) {
case OP_SHARE:
verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
nfs_mnt_prop,
sizeof (nfs_mnt_prop),
NULL, NULL, 0, B_FALSE) == 0);
verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
sharesmb, sizeof (sharesmb), NULL, NULL,
0, B_FALSE) == 0);
if (strcmp(nfs_mnt_prop, "off") == 0 &&
strcmp(sharesmb, "off") == 0) {
(void) fprintf(stderr, gettext("cannot "
"unshare '%s': legacy share\n"),
zfs_get_name(zhp));
#ifdef illumos
(void) fprintf(stderr, gettext("use "
"unshare(1M) to unshare this "
"filesystem\n"));
#endif
ret = 1;
} else if (!zfs_is_shared(zhp)) {
(void) fprintf(stderr, gettext("cannot "
"unshare '%s': not currently "
"shared\n"), zfs_get_name(zhp));
ret = 1;
} else if (zfs_unshareall(zhp) != 0) {
ret = 1;
}
break;
case OP_MOUNT:
if (strcmp(nfs_mnt_prop, "legacy") == 0) {
(void) fprintf(stderr, gettext("cannot "
"unmount '%s': legacy "
"mountpoint\n"), zfs_get_name(zhp));
(void) fprintf(stderr, gettext("use "
"umount(8) to unmount this "
"filesystem\n"));
ret = 1;
} else if (!zfs_is_mounted(zhp, NULL)) {
(void) fprintf(stderr, gettext("cannot "
"unmount '%s': not currently "
"mounted\n"),
zfs_get_name(zhp));
ret = 1;
} else if (zfs_unmountall(zhp, flags) != 0) {
ret = 1;
}
break;
}
zfs_close(zhp);
}
return (ret);
}
/*
* zfs unmount -a
* zfs unmount filesystem
*
* Unmount all filesystems, or a specific ZFS filesystem.
*/
static int
zfs_do_unmount(int argc, char **argv)
{
return (unshare_unmount(OP_MOUNT, argc, argv));
}
/*
* zfs unshare -a
* zfs unshare filesystem
*
* Unshare all filesystems, or a specific ZFS filesystem.
*/
static int
zfs_do_unshare(int argc, char **argv)
{
return (unshare_unmount(OP_SHARE, argc, argv));
}
/*
* Attach/detach the given dataset to/from the given jail
*/
/* ARGSUSED */
static int
do_jail(int argc, char **argv, int attach)
{
zfs_handle_t *zhp;
int jailid, ret;
/* check number of arguments */
if (argc < 3) {
(void) fprintf(stderr, gettext("missing argument(s)\n"));
usage(B_FALSE);
}
if (argc > 3) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
jailid = jail_getid(argv[1]);
if (jailid < 0) {
(void) fprintf(stderr, gettext("invalid jail id or name\n"));
usage(B_FALSE);
}
zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM);
if (zhp == NULL)
return (1);
ret = (zfs_jail(zhp, jailid, attach) != 0);
zfs_close(zhp);
return (ret);
}
/*
* zfs jail jailid filesystem
*
* Attach the given dataset to the given jail
*/
/* ARGSUSED */
static int
zfs_do_jail(int argc, char **argv)
{
return (do_jail(argc, argv, 1));
}
/*
* zfs unjail jailid filesystem
*
* Detach the given dataset from the given jail
*/
/* ARGSUSED */
static int
zfs_do_unjail(int argc, char **argv)
{
return (do_jail(argc, argv, 0));
}
/*
* Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is
* 'legacy'. Otherwise, complain that use should be using 'zfs mount'.
*/
static int
manual_mount(int argc, char **argv)
{
zfs_handle_t *zhp;
char mountpoint[ZFS_MAXPROPLEN];
char mntopts[MNT_LINE_MAX] = { '\0' };
int ret = 0;
int c;
int flags = 0;
char *dataset, *path;
/* check options */
while ((c = getopt(argc, argv, ":mo:O")) != -1) {
switch (c) {
case 'o':
(void) strlcpy(mntopts, optarg, sizeof (mntopts));
break;
case 'O':
flags |= MS_OVERLAY;
break;
case 'm':
flags |= MS_NOMNTTAB;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
(void) fprintf(stderr, gettext("usage: mount [-o opts] "
"<path>\n"));
return (2);
}
}
argc -= optind;
argv += optind;
/* check that we only have two arguments */
if (argc != 2) {
if (argc == 0)
(void) fprintf(stderr, gettext("missing dataset "
"argument\n"));
else if (argc == 1)
(void) fprintf(stderr,
gettext("missing mountpoint argument\n"));
else
(void) fprintf(stderr, gettext("too many arguments\n"));
(void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
return (2);
}
dataset = argv[0];
path = argv[1];
/* try to open the dataset */
if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL)
return (1);
(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
sizeof (mountpoint), NULL, NULL, 0, B_FALSE);
/* check for legacy mountpoint and complain appropriately */
ret = 0;
if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) {
if (zmount(dataset, path, flags, MNTTYPE_ZFS,
NULL, 0, mntopts, sizeof (mntopts)) != 0) {
(void) fprintf(stderr, gettext("mount failed: %s\n"),
strerror(errno));
ret = 1;
}
} else {
(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
"mounted using 'mount -t zfs'\n"), dataset);
(void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
"instead.\n"), path);
(void) fprintf(stderr, gettext("If you must use 'mount -t zfs' "
"or /etc/fstab, use 'zfs set mountpoint=legacy'.\n"));
(void) fprintf(stderr, gettext("See zfs(8) for more "
"information.\n"));
ret = 1;
}
return (ret);
}
/*
* Called when invoked as /etc/fs/zfs/umount. Unlike a manual mount, we allow
* unmounts of non-legacy filesystems, as this is the dominant administrative
* interface.
*/
static int
manual_unmount(int argc, char **argv)
{
int flags = 0;
int c;
/* check options */
while ((c = getopt(argc, argv, "f")) != -1) {
switch (c) {
case 'f':
flags = MS_FORCE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
(void) fprintf(stderr, gettext("usage: unmount [-f] "
"<path>\n"));
return (2);
}
}
argc -= optind;
argv += optind;
/* check arguments */
if (argc != 1) {
if (argc == 0)
(void) fprintf(stderr, gettext("missing path "
"argument\n"));
else
(void) fprintf(stderr, gettext("too many arguments\n"));
(void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n"));
return (2);
}
return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE));
}
static int
find_command_idx(char *command, int *idx)
{
int i;
for (i = 0; i < NCOMMAND; i++) {
if (command_table[i].name == NULL)
continue;
if (strcmp(command, command_table[i].name) == 0) {
*idx = i;
return (0);
}
}
return (1);
}
static int
zfs_do_diff(int argc, char **argv)
{
zfs_handle_t *zhp;
int flags = 0;
char *tosnap = NULL;
char *fromsnap = NULL;
char *atp, *copy;
int err = 0;
int c;
while ((c = getopt(argc, argv, "FHt")) != -1) {
switch (c) {
case 'F':
flags |= ZFS_DIFF_CLASSIFY;
break;
case 'H':
flags |= ZFS_DIFF_PARSEABLE;
break;
case 't':
flags |= ZFS_DIFF_TIMESTAMP;
break;
default:
(void) fprintf(stderr,
gettext("invalid option '%c'\n"), optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (argc < 1) {
(void) fprintf(stderr,
- gettext("must provide at least one snapshot name\n"));
+ gettext("must provide at least one snapshot name\n"));
usage(B_FALSE);
}
if (argc > 2) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
fromsnap = argv[0];
tosnap = (argc == 2) ? argv[1] : NULL;
copy = NULL;
if (*fromsnap != '@')
copy = strdup(fromsnap);
else if (tosnap)
copy = strdup(tosnap);
if (copy == NULL)
usage(B_FALSE);
if ((atp = strchr(copy, '@')) != NULL)
*atp = '\0';
if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL)
return (1);
free(copy);
/*
* Ignore SIGPIPE so that the library can give us
* information on any failure
*/
(void) sigignore(SIGPIPE);
err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);
zfs_close(zhp);
return (err != 0);
+}
+
+static int
+zfs_do_remap(int argc, char **argv)
+{
+ const char *fsname;
+ int err = 0;
+ if (argc != 2) {
+ (void) fprintf(stderr, gettext("wrong number of arguments\n"));
+ usage(B_FALSE);
+ }
+
+ fsname = argv[1];
+ err = zfs_remap_indirects(g_zfs, fsname);
+
+ return (err);
}
/*
* zfs bookmark <fs@snap> <fs#bmark>
*
* Creates a bookmark with the given name from the given snapshot.
*/
static int
zfs_do_bookmark(int argc, char **argv)
{
char snapname[ZFS_MAX_DATASET_NAME_LEN];
zfs_handle_t *zhp;
nvlist_t *nvl;
int ret = 0;
int c;
/* check options */
while ((c = getopt(argc, argv, "")) != -1) {
switch (c) {
case '?':
(void) fprintf(stderr,
gettext("invalid option '%c'\n"), optopt);
goto usage;
}
}
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing snapshot argument\n"));
goto usage;
}
if (argc < 2) {
(void) fprintf(stderr, gettext("missing bookmark argument\n"));
goto usage;
}
if (strchr(argv[1], '#') == NULL) {
(void) fprintf(stderr,
gettext("invalid bookmark name '%s' -- "
"must contain a '#'\n"), argv[1]);
goto usage;
}
if (argv[0][0] == '@') {
/*
* Snapshot name begins with @.
* Default to same fs as bookmark.
*/
(void) strncpy(snapname, argv[1], sizeof (snapname));
*strchr(snapname, '#') = '\0';
(void) strlcat(snapname, argv[0], sizeof (snapname));
} else {
(void) strncpy(snapname, argv[0], sizeof (snapname));
}
zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT);
if (zhp == NULL)
goto usage;
zfs_close(zhp);
nvl = fnvlist_alloc();
fnvlist_add_string(nvl, argv[1], snapname);
ret = lzc_bookmark(nvl, NULL);
fnvlist_free(nvl);
if (ret != 0) {
const char *err_msg;
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN,
"cannot create bookmark '%s'"), argv[1]);
switch (ret) {
case EXDEV:
err_msg = "bookmark is in a different pool";
break;
case EEXIST:
err_msg = "bookmark exists";
break;
case EINVAL:
err_msg = "invalid argument";
break;
case ENOTSUP:
err_msg = "bookmark feature not enabled";
break;
case ENOSPC:
err_msg = "out of space";
break;
default:
err_msg = "unknown error";
break;
}
(void) fprintf(stderr, "%s: %s\n", errbuf,
dgettext(TEXT_DOMAIN, err_msg));
}
return (ret != 0);
usage:
usage(B_FALSE);
return (-1);
}
static int
zfs_do_channel_program(int argc, char **argv)
{
int ret, fd;
char c;
char *progbuf, *filename, *poolname;
size_t progsize, progread;
nvlist_t *outnvl;
uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT;
uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT;
boolean_t sync_flag = B_TRUE;
zpool_handle_t *zhp;
/* check options */
while (-1 !=
(c = getopt(argc, argv, "nt:(instr-limit)m:(memory-limit)"))) {
switch (c) {
case 't':
case 'm': {
uint64_t arg;
char *endp;
errno = 0;
arg = strtoull(optarg, &endp, 0);
if (errno != 0 || *endp != '\0') {
(void) fprintf(stderr, gettext(
"invalid argument "
"'%s': expected integer\n"), optarg);
goto usage;
}
if (c == 't') {
if (arg > ZCP_MAX_INSTRLIMIT || arg == 0) {
(void) fprintf(stderr, gettext(
"Invalid instruction limit: "
"%s\n"), optarg);
return (1);
} else {
instrlimit = arg;
}
} else {
ASSERT3U(c, ==, 'm');
if (arg > ZCP_MAX_MEMLIMIT || arg == 0) {
(void) fprintf(stderr, gettext(
"Invalid memory limit: "
"%s\n"), optarg);
return (1);
} else {
memlimit = arg;
}
}
break;
}
case 'n': {
sync_flag = B_FALSE;
break;
}
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
goto usage;
}
}
argc -= optind;
argv += optind;
if (argc < 2) {
(void) fprintf(stderr,
gettext("invalid number of arguments\n"));
goto usage;
}
poolname = argv[0];
filename = argv[1];
if (strcmp(filename, "-") == 0) {
fd = 0;
filename = "standard input";
} else if ((fd = open(filename, O_RDONLY)) < 0) {
(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
filename, strerror(errno));
return (1);
}
if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
(void) fprintf(stderr, gettext("cannot open pool '%s'"),
poolname);
return (1);
}
zpool_close(zhp);
/*
* Read in the channel program, expanding the program buffer as
* necessary.
*/
progread = 0;
progsize = 1024;
progbuf = safe_malloc(progsize);
do {
ret = read(fd, progbuf + progread, progsize - progread);
progread += ret;
if (progread == progsize && ret > 0) {
progsize *= 2;
progbuf = safe_realloc(progbuf, progsize);
}
} while (ret > 0);
if (fd != 0)
(void) close(fd);
if (ret < 0) {
free(progbuf);
(void) fprintf(stderr,
gettext("cannot read '%s': %s\n"),
filename, strerror(errno));
return (1);
}
progbuf[progread] = '\0';
/*
* Any remaining arguments are passed as arguments to the lua script as
* a string array:
* {
* "argv" -> [ "arg 1", ... "arg n" ],
* }
*/
nvlist_t *argnvl = fnvlist_alloc();
fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2);
if (sync_flag) {
ret = lzc_channel_program(poolname, progbuf,
instrlimit, memlimit, argnvl, &outnvl);
} else {
ret = lzc_channel_program_nosync(poolname, progbuf,
instrlimit, memlimit, argnvl, &outnvl);
}
if (ret != 0) {
/*
* On error, report the error message handed back by lua if one
* exists. Otherwise, generate an appropriate error message,
* falling back on strerror() for an unexpected return code.
*/
char *errstring = NULL;
if (nvlist_exists(outnvl, ZCP_RET_ERROR)) {
(void) nvlist_lookup_string(outnvl,
ZCP_RET_ERROR, &errstring);
if (errstring == NULL)
errstring = strerror(ret);
} else {
switch (ret) {
case EINVAL:
errstring =
"Invalid instruction or memory limit.";
break;
case ENOMEM:
errstring = "Return value too large.";
break;
case ENOSPC:
errstring = "Memory limit exhausted.";
break;
#ifdef illumos
case ETIME:
#else
case ETIMEDOUT:
#endif
errstring = "Timed out.";
break;
case EPERM:
errstring = "Permission denied. Channel "
"programs must be run as root.";
break;
default:
errstring = strerror(ret);
}
}
(void) fprintf(stderr,
gettext("Channel program execution failed:\n%s\n"),
errstring);
} else {
(void) printf("Channel program fully executed ");
if (nvlist_empty(outnvl)) {
(void) printf("with no return value.\n");
} else {
(void) printf("with return value:\n");
dump_nvlist(outnvl, 4);
}
}
free(progbuf);
fnvlist_free(outnvl);
fnvlist_free(argnvl);
return (ret != 0);
usage:
usage(B_FALSE);
return (-1);
}
int
main(int argc, char **argv)
{
int ret = 0;
int i;
char *progname;
char *cmdname;
(void) setlocale(LC_ALL, "");
(void) textdomain(TEXT_DOMAIN);
opterr = 0;
if ((g_zfs = libzfs_init()) == NULL) {
(void) fprintf(stderr, gettext("internal error: failed to "
"initialize ZFS library\n"));
return (1);
}
zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
libzfs_print_on_error(g_zfs, B_TRUE);
if ((mnttab_file = fopen(MNTTAB, "r")) == NULL) {
(void) fprintf(stderr, gettext("internal error: unable to "
"open %s\n"), MNTTAB);
return (1);
}
/*
* This command also doubles as the /etc/fs mount and unmount program.
* Determine if we should take this behavior based on argv[0].
*/
progname = basename(argv[0]);
if (strcmp(progname, "mount") == 0) {
ret = manual_mount(argc, argv);
} else if (strcmp(progname, "umount") == 0) {
ret = manual_unmount(argc, argv);
} else {
/*
* Make sure the user has specified some command.
*/
if (argc < 2) {
(void) fprintf(stderr, gettext("missing command\n"));
usage(B_FALSE);
}
cmdname = argv[1];
/*
* The 'umount' command is an alias for 'unmount'
*/
if (strcmp(cmdname, "umount") == 0)
cmdname = "unmount";
/*
* The 'recv' command is an alias for 'receive'
*/
if (strcmp(cmdname, "recv") == 0)
cmdname = "receive";
/*
* The 'snap' command is an alias for 'snapshot'
*/
if (strcmp(cmdname, "snap") == 0)
cmdname = "snapshot";
/*
* Special case '-?'
*/
if (strcmp(cmdname, "-?") == 0)
usage(B_TRUE);
/*
* Run the appropriate command.
*/
libzfs_mnttab_cache(g_zfs, B_TRUE);
if (find_command_idx(cmdname, &i) == 0) {
current_command = &command_table[i];
ret = command_table[i].func(argc - 1, argv + 1);
} else if (strchr(cmdname, '=') != NULL) {
verify(find_command_idx("set", &i) == 0);
current_command = &command_table[i];
ret = command_table[i].func(argc, argv);
} else {
(void) fprintf(stderr, gettext("unrecognized "
"command '%s'\n"), cmdname);
usage(B_FALSE);
}
libzfs_mnttab_cache(g_zfs, B_FALSE);
}
(void) fclose(mnttab_file);
if (ret == 0 && log_history)
(void) zpool_log_history(g_zfs, history_str);
libzfs_fini(g_zfs);
/*
* The 'ZFS_ABORT' environment variable causes us to dump core on exit
* for the purposes of running ::findleaks.
*/
if (getenv("ZFS_ABORT") != NULL) {
(void) printf("dumping core by request\n");
abort();
}
return (ret);
}
Index: stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
===================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c (revision 332525)
@@ -1,5761 +1,5940 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2012 by Frederik Wessels. All rights reserved.
* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
* Copyright 2016 Nexenta Systems, Inc.
* Copyright (c) 2017 Datto Inc.
*/
#include <solaris.h>
#include <assert.h>
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <libintl.h>
#include <libuutil.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <priv.h>
#include <pwd.h>
#include <zone.h>
#include <sys/time.h>
#include <zfs_prop.h>
#include <sys/fs/zfs.h>
#include <sys/stat.h>
#include <libzfs.h>
#include "zpool_util.h"
#include "zfs_comutil.h"
#include "zfeature_common.h"
#include "statcommon.h"
static int zpool_do_create(int, char **);
static int zpool_do_destroy(int, char **);
static int zpool_do_add(int, char **);
static int zpool_do_remove(int, char **);
static int zpool_do_labelclear(int, char **);
static int zpool_do_list(int, char **);
static int zpool_do_iostat(int, char **);
static int zpool_do_status(int, char **);
static int zpool_do_online(int, char **);
static int zpool_do_offline(int, char **);
static int zpool_do_clear(int, char **);
static int zpool_do_reopen(int, char **);
static int zpool_do_reguid(int, char **);
static int zpool_do_attach(int, char **);
static int zpool_do_detach(int, char **);
static int zpool_do_replace(int, char **);
static int zpool_do_split(int, char **);
static int zpool_do_scrub(int, char **);
static int zpool_do_import(int, char **);
static int zpool_do_export(int, char **);
static int zpool_do_upgrade(int, char **);
static int zpool_do_history(int, char **);
static int zpool_do_get(int, char **);
static int zpool_do_set(int, char **);
/*
* These libumem hooks provide a reasonable set of defaults for the allocator's
* debugging facilities.
*/
#ifdef DEBUG
const char *
_umem_debug_init(void)
{
return ("default,verbose"); /* $UMEM_DEBUG setting */
}
const char *
_umem_logging_init(void)
{
return ("fail,contents"); /* $UMEM_LOGGING setting */
}
#endif
typedef enum {
HELP_ADD,
HELP_ATTACH,
HELP_CLEAR,
HELP_CREATE,
HELP_DESTROY,
HELP_DETACH,
HELP_EXPORT,
HELP_HISTORY,
HELP_IMPORT,
HELP_IOSTAT,
HELP_LABELCLEAR,
HELP_LIST,
HELP_OFFLINE,
HELP_ONLINE,
HELP_REPLACE,
HELP_REMOVE,
HELP_SCRUB,
HELP_STATUS,
HELP_UPGRADE,
HELP_GET,
HELP_SET,
HELP_SPLIT,
HELP_REGUID,
HELP_REOPEN
} zpool_help_t;
typedef struct zpool_command {
const char *name;
int (*func)(int, char **);
zpool_help_t usage;
} zpool_command_t;
/*
* Master command table. Each ZFS command has a name, associated function, and
* usage message. The usage messages need to be internationalized, so we have
* to have a function to return the usage message based on a command index.
*
* These commands are organized according to how they are displayed in the usage
* message. An empty command (one with a NULL name) indicates an empty line in
* the generic usage message.
*/
static zpool_command_t command_table[] = {
{ "create", zpool_do_create, HELP_CREATE },
{ "destroy", zpool_do_destroy, HELP_DESTROY },
{ NULL },
{ "add", zpool_do_add, HELP_ADD },
{ "remove", zpool_do_remove, HELP_REMOVE },
{ NULL },
{ "labelclear", zpool_do_labelclear, HELP_LABELCLEAR },
{ NULL },
{ "list", zpool_do_list, HELP_LIST },
{ "iostat", zpool_do_iostat, HELP_IOSTAT },
{ "status", zpool_do_status, HELP_STATUS },
{ NULL },
{ "online", zpool_do_online, HELP_ONLINE },
{ "offline", zpool_do_offline, HELP_OFFLINE },
{ "clear", zpool_do_clear, HELP_CLEAR },
{ "reopen", zpool_do_reopen, HELP_REOPEN },
{ NULL },
{ "attach", zpool_do_attach, HELP_ATTACH },
{ "detach", zpool_do_detach, HELP_DETACH },
{ "replace", zpool_do_replace, HELP_REPLACE },
{ "split", zpool_do_split, HELP_SPLIT },
{ NULL },
{ "scrub", zpool_do_scrub, HELP_SCRUB },
{ NULL },
{ "import", zpool_do_import, HELP_IMPORT },
{ "export", zpool_do_export, HELP_EXPORT },
{ "upgrade", zpool_do_upgrade, HELP_UPGRADE },
{ "reguid", zpool_do_reguid, HELP_REGUID },
{ NULL },
{ "history", zpool_do_history, HELP_HISTORY },
{ "get", zpool_do_get, HELP_GET },
{ "set", zpool_do_set, HELP_SET },
};
#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
static zpool_command_t *current_command;
static char history_str[HIS_MAX_RECORD_LEN];
static boolean_t log_history = B_TRUE;
static uint_t timestamp_fmt = NODATE;
static const char *
get_usage(zpool_help_t idx)
{
switch (idx) {
case HELP_ADD:
return (gettext("\tadd [-fn] <pool> <vdev> ...\n"));
case HELP_ATTACH:
return (gettext("\tattach [-f] <pool> <device> "
"<new-device>\n"));
case HELP_CLEAR:
return (gettext("\tclear [-nF] <pool> [device]\n"));
case HELP_CREATE:
return (gettext("\tcreate [-fnd] [-B] "
"[-o property=value] ... \n"
"\t [-O file-system-property=value] ... \n"
"\t [-m mountpoint] [-R root] <pool> <vdev> ...\n"));
case HELP_DESTROY:
return (gettext("\tdestroy [-f] <pool>\n"));
case HELP_DETACH:
return (gettext("\tdetach <pool> <device>\n"));
case HELP_EXPORT:
return (gettext("\texport [-f] <pool> ...\n"));
case HELP_HISTORY:
return (gettext("\thistory [-il] [<pool>] ...\n"));
case HELP_IMPORT:
return (gettext("\timport [-d dir] [-D]\n"
"\timport [-d dir | -c cachefile] [-F [-n]] <pool | id>\n"
"\timport [-o mntopts] [-o property=value] ... \n"
"\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
"[-R root] [-F [-n]] -a\n"
"\timport [-o mntopts] [-o property=value] ... \n"
"\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
"[-R root] [-F [-n]]\n"
"\t <pool | id> [newpool]\n"));
case HELP_IOSTAT:
return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval "
"[count]]\n"));
case HELP_LABELCLEAR:
return (gettext("\tlabelclear [-f] <vdev>\n"));
case HELP_LIST:
return (gettext("\tlist [-Hpv] [-o property[,...]] "
"[-T d|u] [pool] ... [interval [count]]\n"));
case HELP_OFFLINE:
return (gettext("\toffline [-t] <pool> <device> ...\n"));
case HELP_ONLINE:
return (gettext("\tonline [-e] <pool> <device> ...\n"));
case HELP_REPLACE:
return (gettext("\treplace [-f] <pool> <device> "
"[new-device]\n"));
case HELP_REMOVE:
- return (gettext("\tremove <pool> <device> ...\n"));
+ return (gettext("\tremove [-nps] <pool> <device> ...\n"));
case HELP_REOPEN:
return (gettext("\treopen <pool>\n"));
case HELP_SCRUB:
return (gettext("\tscrub [-s | -p] <pool> ...\n"));
case HELP_STATUS:
return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval "
"[count]]\n"));
case HELP_UPGRADE:
return (gettext("\tupgrade [-v]\n"
"\tupgrade [-V version] <-a | pool ...>\n"));
case HELP_GET:
return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] "
"<\"all\" | property[,...]> <pool> ...\n"));
case HELP_SET:
return (gettext("\tset <property=value> <pool> \n"));
case HELP_SPLIT:
return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n"
"\t [-o property=value] <pool> <newpool> "
"[<device> ...]\n"));
case HELP_REGUID:
return (gettext("\treguid <pool>\n"));
}
abort();
/* NOTREACHED */
}
/*
* Callback routine that will print out a pool property value.
*/
static int
print_prop_cb(int prop, void *cb)
{
FILE *fp = cb;
(void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop));
if (zpool_prop_readonly(prop))
(void) fprintf(fp, " NO ");
else
(void) fprintf(fp, " YES ");
if (zpool_prop_values(prop) == NULL)
(void) fprintf(fp, "-\n");
else
(void) fprintf(fp, "%s\n", zpool_prop_values(prop));
return (ZPROP_CONT);
}
/*
* Display usage message. If we're inside a command, display only the usage for
* that command. Otherwise, iterate over the entire command table and display
* a complete usage message.
*/
void
usage(boolean_t requested)
{
FILE *fp = requested ? stdout : stderr;
if (current_command == NULL) {
int i;
(void) fprintf(fp, gettext("usage: zpool command args ...\n"));
(void) fprintf(fp,
gettext("where 'command' is one of the following:\n\n"));
for (i = 0; i < NCOMMAND; i++) {
if (command_table[i].name == NULL)
(void) fprintf(fp, "\n");
else
(void) fprintf(fp, "%s",
get_usage(command_table[i].usage));
}
} else {
(void) fprintf(fp, gettext("usage:\n"));
(void) fprintf(fp, "%s", get_usage(current_command->usage));
}
if (current_command != NULL &&
((strcmp(current_command->name, "set") == 0) ||
(strcmp(current_command->name, "get") == 0) ||
(strcmp(current_command->name, "list") == 0))) {
(void) fprintf(fp,
gettext("\nthe following properties are supported:\n"));
(void) fprintf(fp, "\n\t%-15s %s %s\n\n",
"PROPERTY", "EDIT", "VALUES");
/* Iterate over all properties */
(void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE,
ZFS_TYPE_POOL);
(void) fprintf(fp, "\t%-15s ", "feature@...");
(void) fprintf(fp, "YES disabled | enabled | active\n");
(void) fprintf(fp, gettext("\nThe feature@ properties must be "
"appended with a feature name.\nSee zpool-features(7).\n"));
}
/*
* See comments at end of main().
*/
if (getenv("ZFS_ABORT") != NULL) {
(void) printf("dumping core by request\n");
abort();
}
exit(requested ? 0 : 2);
}
void
print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
boolean_t print_logs)
{
nvlist_t **child;
uint_t c, children;
char *vname;
if (name != NULL)
(void) printf("\t%*s%s\n", indent, "", name);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
return;
for (c = 0; c < children; c++) {
uint64_t is_log = B_FALSE;
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&is_log);
if ((is_log && !print_logs) || (!is_log && print_logs))
continue;
vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
print_vdev_tree(zhp, vname, child[c], indent + 2,
B_FALSE);
free(vname);
}
}
static boolean_t
prop_list_contains_feature(nvlist_t *proplist)
{
nvpair_t *nvp;
for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp;
nvp = nvlist_next_nvpair(proplist, nvp)) {
if (zpool_prop_feature(nvpair_name(nvp)))
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Add a property pair (name, string-value) into a property nvlist.
*/
static int
add_prop_list(const char *propname, char *propval, nvlist_t **props,
boolean_t poolprop)
{
zpool_prop_t prop = ZPROP_INVAL;
zfs_prop_t fprop;
nvlist_t *proplist;
const char *normnm;
char *strval;
if (*props == NULL &&
nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) {
(void) fprintf(stderr,
gettext("internal error: out of memory\n"));
return (1);
}
proplist = *props;
if (poolprop) {
const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION);
if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL &&
!zpool_prop_feature(propname)) {
(void) fprintf(stderr, gettext("property '%s' is "
"not a valid pool property\n"), propname);
return (2);
}
/*
* feature@ properties and version should not be specified
* at the same time.
*/
if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) &&
nvlist_exists(proplist, vname)) ||
(prop == ZPOOL_PROP_VERSION &&
prop_list_contains_feature(proplist))) {
(void) fprintf(stderr, gettext("'feature@' and "
"'version' properties cannot be specified "
"together\n"));
return (2);
}
if (zpool_prop_feature(propname))
normnm = propname;
else
normnm = zpool_prop_to_name(prop);
} else {
if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
normnm = zfs_prop_to_name(fprop);
} else {
normnm = propname;
}
}
if (nvlist_lookup_string(proplist, normnm, &strval) == 0 &&
prop != ZPOOL_PROP_CACHEFILE) {
(void) fprintf(stderr, gettext("property '%s' "
"specified multiple times\n"), propname);
return (2);
}
if (nvlist_add_string(proplist, normnm, propval) != 0) {
(void) fprintf(stderr, gettext("internal "
"error: out of memory\n"));
return (1);
}
return (0);
}
/*
* zpool add [-fn] <pool> <vdev> ...
*
* -f Force addition of devices, even if they appear in use
* -n Do not add the devices, but display the resulting layout if
* they were to be added.
*
* Adds the given vdevs to 'pool'. As with create, the bulk of this work is
* handled by get_vdev_spec(), which constructs the nvlist needed to pass to
* libzfs.
*/
int
zpool_do_add(int argc, char **argv)
{
boolean_t force = B_FALSE;
boolean_t dryrun = B_FALSE;
int c;
nvlist_t *nvroot;
char *poolname;
zpool_boot_label_t boot_type;
uint64_t boot_size;
int ret;
zpool_handle_t *zhp;
nvlist_t *config;
/* check options */
while ((c = getopt(argc, argv, "fn")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
break;
case 'n':
dryrun = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* get pool name and check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name argument\n"));
usage(B_FALSE);
}
if (argc < 2) {
(void) fprintf(stderr, gettext("missing vdev specification\n"));
usage(B_FALSE);
}
poolname = argv[0];
argc--;
argv++;
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
if ((config = zpool_get_config(zhp, NULL)) == NULL) {
(void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
poolname);
zpool_close(zhp);
return (1);
}
if (zpool_is_bootable(zhp))
boot_type = ZPOOL_COPY_BOOT_LABEL;
else
boot_type = ZPOOL_NO_BOOT_LABEL;
/* pass off to get_vdev_spec for processing */
boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL);
nvroot = make_root_vdev(zhp, force, !force, B_FALSE, dryrun,
boot_type, boot_size, argc, argv);
if (nvroot == NULL) {
zpool_close(zhp);
return (1);
}
if (dryrun) {
nvlist_t *poolnvroot;
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&poolnvroot) == 0);
(void) printf(gettext("would update '%s' to the following "
"configuration:\n"), zpool_get_name(zhp));
/* print original main pool and new tree */
print_vdev_tree(zhp, poolname, poolnvroot, 0, B_FALSE);
print_vdev_tree(zhp, NULL, nvroot, 0, B_FALSE);
/* Do the same for the logs */
if (num_logs(poolnvroot) > 0) {
print_vdev_tree(zhp, "logs", poolnvroot, 0, B_TRUE);
print_vdev_tree(zhp, NULL, nvroot, 0, B_TRUE);
} else if (num_logs(nvroot) > 0) {
print_vdev_tree(zhp, "logs", nvroot, 0, B_TRUE);
}
ret = 0;
} else {
ret = (zpool_add(zhp, nvroot) != 0);
}
nvlist_free(nvroot);
zpool_close(zhp);
return (ret);
}
/*
* zpool remove <pool> <vdev> ...
*
- * Removes the given vdev from the pool. Currently, this supports removing
- * spares, cache, and log devices from the pool.
+ * Removes the given vdev from the pool.
*/
int
zpool_do_remove(int argc, char **argv)
{
char *poolname;
int i, ret = 0;
zpool_handle_t *zhp;
+ boolean_t stop = B_FALSE;
+ boolean_t noop = B_FALSE;
+ boolean_t parsable = B_FALSE;
+ char c;
- argc--;
- argv++;
+ /* check options */
+ while ((c = getopt(argc, argv, "nps")) != -1) {
+ switch (c) {
+ case 'n':
+ noop = B_TRUE;
+ break;
+ case 'p':
+ parsable = B_TRUE;
+ break;
+ case 's':
+ stop = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
/* get pool name and check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name argument\n"));
usage(B_FALSE);
}
- if (argc < 2) {
- (void) fprintf(stderr, gettext("missing device\n"));
- usage(B_FALSE);
- }
poolname = argv[0];
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
- for (i = 1; i < argc; i++) {
- if (zpool_vdev_remove(zhp, argv[i]) != 0)
+ if (stop && noop) {
+ (void) fprintf(stderr, gettext("stop request ignored\n"));
+ return (0);
+ }
+
+ if (stop) {
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+ if (zpool_vdev_remove_cancel(zhp) != 0)
ret = 1;
+ } else {
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing device\n"));
+ usage(B_FALSE);
+ }
+
+ for (i = 1; i < argc; i++) {
+ if (noop) {
+ uint64_t size;
+
+ if (zpool_vdev_indirect_size(zhp, argv[i],
+ &size) != 0) {
+ ret = 1;
+ break;
+ }
+ if (parsable) {
+ (void) printf("%s %llu\n",
+ argv[i], size);
+ } else {
+ char valstr[32];
+ zfs_nicenum(size, valstr,
+ sizeof (valstr));
+ (void) printf("Memory that will be "
+ "used after removing %s: %s\n",
+ argv[i], valstr);
+ }
+ } else {
+ if (zpool_vdev_remove(zhp, argv[i]) != 0)
+ ret = 1;
+ }
+ }
}
return (ret);
}
/*
* zpool labelclear [-f] <vdev>
*
* -f Force clearing the label for the vdevs which are members of
* the exported or foreign pools.
*
* Verifies that the vdev is not active and zeros out the label information
* on the device.
*/
int
zpool_do_labelclear(int argc, char **argv)
{
char vdev[MAXPATHLEN];
char *name = NULL;
struct stat st;
int c, fd, ret = 0;
nvlist_t *config;
pool_state_t state;
boolean_t inuse = B_FALSE;
boolean_t force = B_FALSE;
/* check options */
while ((c = getopt(argc, argv, "f")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
break;
default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* get vdev name */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing vdev name\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
/*
* Check if we were given absolute path and use it as is.
* Otherwise if the provided vdev name doesn't point to a file,
* try prepending dsk path and appending s0.
*/
(void) strlcpy(vdev, argv[0], sizeof (vdev));
if (vdev[0] != '/' && stat(vdev, &st) != 0) {
char *s;
(void) snprintf(vdev, sizeof (vdev), "%s/%s",
#ifdef illumos
ZFS_DISK_ROOT, argv[0]);
if ((s = strrchr(argv[0], 's')) == NULL ||
!isdigit(*(s + 1)))
(void) strlcat(vdev, "s0", sizeof (vdev));
#else
"/dev", argv[0]);
#endif
if (stat(vdev, &st) != 0) {
(void) fprintf(stderr, gettext(
"failed to find device %s, try specifying absolute "
"path instead\n"), argv[0]);
return (1);
}
}
if ((fd = open(vdev, O_RDWR)) < 0) {
(void) fprintf(stderr, gettext("failed to open %s: %s\n"),
vdev, strerror(errno));
return (1);
}
if (zpool_read_label(fd, &config) != 0) {
(void) fprintf(stderr,
gettext("failed to read label from %s\n"), vdev);
return (1);
}
nvlist_free(config);
ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse);
if (ret != 0) {
(void) fprintf(stderr,
gettext("failed to check state for %s\n"), vdev);
return (1);
}
if (!inuse)
goto wipe_label;
switch (state) {
default:
case POOL_STATE_ACTIVE:
case POOL_STATE_SPARE:
case POOL_STATE_L2CACHE:
(void) fprintf(stderr, gettext(
"%s is a member (%s) of pool \"%s\"\n"),
vdev, zpool_pool_state_to_name(state), name);
ret = 1;
goto errout;
case POOL_STATE_EXPORTED:
if (force)
break;
(void) fprintf(stderr, gettext(
"use '-f' to override the following error:\n"
"%s is a member of exported pool \"%s\"\n"),
vdev, name);
ret = 1;
goto errout;
case POOL_STATE_POTENTIALLY_ACTIVE:
if (force)
break;
(void) fprintf(stderr, gettext(
"use '-f' to override the following error:\n"
"%s is a member of potentially active pool \"%s\"\n"),
vdev, name);
ret = 1;
goto errout;
case POOL_STATE_DESTROYED:
/* inuse should never be set for a destroyed pool */
assert(0);
break;
}
wipe_label:
ret = zpool_clear_label(fd);
if (ret != 0) {
(void) fprintf(stderr,
gettext("failed to clear label for %s\n"), vdev);
}
errout:
free(name);
(void) close(fd);
return (ret);
}
/*
* zpool create [-fnd] [-B] [-o property=value] ...
* [-O file-system-property=value] ...
* [-R root] [-m mountpoint] <pool> <dev> ...
*
* -B Create boot partition.
* -f Force creation, even if devices appear in use
* -n Do not create the pool, but display the resulting layout if it
* were to be created.
* -R Create a pool under an alternate root
* -m Set default mountpoint for the root dataset. By default it's
* '/<pool>'
* -o Set property=value.
* -d Don't automatically enable all supported pool features
* (individual features can be enabled with -o).
* -O Set fsproperty=value in the pool's root file system
*
* Creates the named pool according to the given vdev specification. The
* bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c. Once
* we get the nvlist back from get_vdev_spec(), we either print out the contents
* (if '-n' was specified), or pass it to libzfs to do the creation.
*/
#define SYSTEM256 (256 * 1024 * 1024)
int
zpool_do_create(int argc, char **argv)
{
boolean_t force = B_FALSE;
boolean_t dryrun = B_FALSE;
boolean_t enable_all_pool_feat = B_TRUE;
zpool_boot_label_t boot_type = ZPOOL_NO_BOOT_LABEL;
uint64_t boot_size = 0;
int c;
nvlist_t *nvroot = NULL;
char *poolname;
int ret = 1;
char *altroot = NULL;
char *mountpoint = NULL;
nvlist_t *fsprops = NULL;
nvlist_t *props = NULL;
char *propval;
/* check options */
while ((c = getopt(argc, argv, ":fndBR:m:o:O:")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
break;
case 'n':
dryrun = B_TRUE;
break;
case 'd':
enable_all_pool_feat = B_FALSE;
break;
case 'B':
#ifdef illumos
/*
* We should create the system partition.
* Also make sure the size is set.
*/
boot_type = ZPOOL_CREATE_BOOT_LABEL;
if (boot_size == 0)
boot_size = SYSTEM256;
break;
#else
(void) fprintf(stderr,
gettext("option '%c' is not supported\n"),
optopt);
goto badusage;
#endif
case 'R':
altroot = optarg;
if (add_prop_list(zpool_prop_to_name(
ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
goto errout;
if (nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
&propval) == 0)
break;
if (add_prop_list(zpool_prop_to_name(
ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
goto errout;
break;
case 'm':
/* Equivalent to -O mountpoint=optarg */
mountpoint = optarg;
break;
case 'o':
if ((propval = strchr(optarg, '=')) == NULL) {
(void) fprintf(stderr, gettext("missing "
"'=' for -o option\n"));
goto errout;
}
*propval = '\0';
propval++;
if (add_prop_list(optarg, propval, &props, B_TRUE))
goto errout;
/*
* Get bootsize value for make_root_vdev().
*/
if (zpool_name_to_prop(optarg) == ZPOOL_PROP_BOOTSIZE) {
if (zfs_nicestrtonum(g_zfs, propval,
&boot_size) < 0 || boot_size == 0) {
(void) fprintf(stderr,
gettext("bad boot partition size "
"'%s': %s\n"), propval,
libzfs_error_description(g_zfs));
goto errout;
}
}
/*
* If the user is creating a pool that doesn't support
* feature flags, don't enable any features.
*/
if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) {
char *end;
u_longlong_t ver;
ver = strtoull(propval, &end, 10);
if (*end == '\0' &&
ver < SPA_VERSION_FEATURES) {
enable_all_pool_feat = B_FALSE;
}
}
if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT)
altroot = propval;
break;
case 'O':
if ((propval = strchr(optarg, '=')) == NULL) {
(void) fprintf(stderr, gettext("missing "
"'=' for -O option\n"));
goto errout;
}
*propval = '\0';
propval++;
/*
* Mountpoints are checked and then added later.
* Uniquely among properties, they can be specified
* more than once, to avoid conflict with -m.
*/
if (0 == strcmp(optarg,
zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) {
mountpoint = propval;
} else if (add_prop_list(optarg, propval, &fsprops,
B_FALSE)) {
goto errout;
}
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
goto badusage;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
goto badusage;
}
}
argc -= optind;
argv += optind;
/* get pool name and check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name argument\n"));
goto badusage;
}
if (argc < 2) {
(void) fprintf(stderr, gettext("missing vdev specification\n"));
goto badusage;
}
poolname = argv[0];
/*
* As a special case, check for use of '/' in the name, and direct the
* user to use 'zfs create' instead.
*/
if (strchr(poolname, '/') != NULL) {
(void) fprintf(stderr, gettext("cannot create '%s': invalid "
"character '/' in pool name\n"), poolname);
(void) fprintf(stderr, gettext("use 'zfs create' to "
"create a dataset\n"));
goto errout;
}
/*
* Make sure the bootsize is set when ZPOOL_CREATE_BOOT_LABEL is used,
* and not set otherwise.
*/
if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
const char *propname;
char *strptr, *buf = NULL;
int rv;
propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE);
if (nvlist_lookup_string(props, propname, &strptr) != 0) {
(void) asprintf(&buf, "%" PRIu64, boot_size);
if (buf == NULL) {
(void) fprintf(stderr,
gettext("internal error: out of memory\n"));
goto errout;
}
rv = add_prop_list(propname, buf, &props, B_TRUE);
free(buf);
if (rv != 0)
goto errout;
}
} else {
const char *propname;
char *strptr;
propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE);
if (nvlist_lookup_string(props, propname, &strptr) == 0) {
(void) fprintf(stderr, gettext("error: setting boot "
"partition size requires option '-B'\n"));
goto errout;
}
}
/* pass off to get_vdev_spec for bulk processing */
nvroot = make_root_vdev(NULL, force, !force, B_FALSE, dryrun,
boot_type, boot_size, argc - 1, argv + 1);
if (nvroot == NULL)
goto errout;
/* make_root_vdev() allows 0 toplevel children if there are spares */
if (!zfs_allocatable_devs(nvroot)) {
(void) fprintf(stderr, gettext("invalid vdev "
"specification: at least one toplevel vdev must be "
"specified\n"));
goto errout;
}
if (altroot != NULL && altroot[0] != '/') {
(void) fprintf(stderr, gettext("invalid alternate root '%s': "
"must be an absolute path\n"), altroot);
goto errout;
}
/*
* Check the validity of the mountpoint and direct the user to use the
* '-m' mountpoint option if it looks like its in use.
* Ignore the checks if the '-f' option is given.
*/
if (!force && (mountpoint == NULL ||
(strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 &&
strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0))) {
char buf[MAXPATHLEN];
DIR *dirp;
if (mountpoint && mountpoint[0] != '/') {
(void) fprintf(stderr, gettext("invalid mountpoint "
"'%s': must be an absolute path, 'legacy', or "
"'none'\n"), mountpoint);
goto errout;
}
if (mountpoint == NULL) {
if (altroot != NULL)
(void) snprintf(buf, sizeof (buf), "%s/%s",
altroot, poolname);
else
(void) snprintf(buf, sizeof (buf), "/%s",
poolname);
} else {
if (altroot != NULL)
(void) snprintf(buf, sizeof (buf), "%s%s",
altroot, mountpoint);
else
(void) snprintf(buf, sizeof (buf), "%s",
mountpoint);
}
if ((dirp = opendir(buf)) == NULL && errno != ENOENT) {
(void) fprintf(stderr, gettext("mountpoint '%s' : "
"%s\n"), buf, strerror(errno));
(void) fprintf(stderr, gettext("use '-m' "
"option to provide a different default\n"));
goto errout;
} else if (dirp) {
int count = 0;
while (count < 3 && readdir(dirp) != NULL)
count++;
(void) closedir(dirp);
if (count > 2) {
(void) fprintf(stderr, gettext("mountpoint "
"'%s' exists and is not empty\n"), buf);
(void) fprintf(stderr, gettext("use '-m' "
"option to provide a "
"different default\n"));
goto errout;
}
}
}
/*
* Now that the mountpoint's validity has been checked, ensure that
* the property is set appropriately prior to creating the pool.
*/
if (mountpoint != NULL) {
ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
mountpoint, &fsprops, B_FALSE);
if (ret != 0)
goto errout;
}
ret = 1;
if (dryrun) {
/*
* For a dry run invocation, print out a basic message and run
* through all the vdevs in the list and print out in an
* appropriate hierarchy.
*/
(void) printf(gettext("would create '%s' with the "
"following layout:\n\n"), poolname);
print_vdev_tree(NULL, poolname, nvroot, 0, B_FALSE);
if (num_logs(nvroot) > 0)
print_vdev_tree(NULL, "logs", nvroot, 0, B_TRUE);
ret = 0;
} else {
/*
* Hand off to libzfs.
*/
if (enable_all_pool_feat) {
spa_feature_t i;
for (i = 0; i < SPA_FEATURES; i++) {
char propname[MAXPATHLEN];
zfeature_info_t *feat = &spa_feature_table[i];
(void) snprintf(propname, sizeof (propname),
"feature@%s", feat->fi_uname);
/*
* Skip feature if user specified it manually
* on the command line.
*/
if (nvlist_exists(props, propname))
continue;
ret = add_prop_list(propname,
ZFS_FEATURE_ENABLED, &props, B_TRUE);
if (ret != 0)
goto errout;
}
}
ret = 1;
if (zpool_create(g_zfs, poolname,
nvroot, props, fsprops) == 0) {
zfs_handle_t *pool = zfs_open(g_zfs, poolname,
ZFS_TYPE_FILESYSTEM);
if (pool != NULL) {
if (zfs_mount(pool, NULL, 0) == 0)
ret = zfs_shareall(pool);
zfs_close(pool);
}
} else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) {
(void) fprintf(stderr, gettext("pool name may have "
"been omitted\n"));
}
}
errout:
nvlist_free(nvroot);
nvlist_free(fsprops);
nvlist_free(props);
return (ret);
badusage:
nvlist_free(fsprops);
nvlist_free(props);
usage(B_FALSE);
return (2);
}
/*
* zpool destroy <pool>
*
* -f Forcefully unmount any datasets
*
* Destroy the given pool. Automatically unmounts any datasets in the pool.
*/
int
zpool_do_destroy(int argc, char **argv)
{
boolean_t force = B_FALSE;
int c;
char *pool;
zpool_handle_t *zhp;
int ret;
/* check options */
while ((c = getopt(argc, argv, "f")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool argument\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
pool = argv[0];
if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
/*
* As a special case, check for use of '/' in the name, and
* direct the user to use 'zfs destroy' instead.
*/
if (strchr(pool, '/') != NULL)
(void) fprintf(stderr, gettext("use 'zfs destroy' to "
"destroy a dataset\n"));
return (1);
}
if (zpool_disable_datasets(zhp, force) != 0) {
(void) fprintf(stderr, gettext("could not destroy '%s': "
"could not unmount datasets\n"), zpool_get_name(zhp));
return (1);
}
/* The history must be logged as part of the export */
log_history = B_FALSE;
ret = (zpool_destroy(zhp, history_str) != 0);
zpool_close(zhp);
return (ret);
}
/*
* zpool export [-f] <pool> ...
*
* -f Forcefully unmount datasets
*
* Export the given pools. By default, the command will attempt to cleanly
* unmount any active datasets within the pool. If the '-f' flag is specified,
* then the datasets will be forcefully unmounted.
*/
int
zpool_do_export(int argc, char **argv)
{
boolean_t force = B_FALSE;
boolean_t hardforce = B_FALSE;
int c;
zpool_handle_t *zhp;
int ret;
int i;
/* check options */
while ((c = getopt(argc, argv, "fF")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
break;
case 'F':
hardforce = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* check arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool argument\n"));
usage(B_FALSE);
}
ret = 0;
for (i = 0; i < argc; i++) {
if ((zhp = zpool_open_canfail(g_zfs, argv[i])) == NULL) {
ret = 1;
continue;
}
if (zpool_disable_datasets(zhp, force) != 0) {
ret = 1;
zpool_close(zhp);
continue;
}
/* The history must be logged as part of the export */
log_history = B_FALSE;
if (hardforce) {
if (zpool_export_force(zhp, history_str) != 0)
ret = 1;
} else if (zpool_export(zhp, force, history_str) != 0) {
ret = 1;
}
zpool_close(zhp);
}
return (ret);
}
/*
* Given a vdev configuration, determine the maximum width needed for the device
* name column.
*/
static int
max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max)
{
char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE);
nvlist_t **child;
uint_t c, children;
int ret;
if (strlen(name) + depth > max)
max = strlen(name) + depth;
free(name);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
&child, &children) == 0) {
for (c = 0; c < children; c++)
if ((ret = max_width(zhp, child[c], depth + 2,
max)) > max)
max = ret;
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) == 0) {
for (c = 0; c < children; c++)
if ((ret = max_width(zhp, child[c], depth + 2,
max)) > max)
max = ret;
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) == 0) {
for (c = 0; c < children; c++)
if ((ret = max_width(zhp, child[c], depth + 2,
max)) > max)
max = ret;
}
return (max);
}
typedef struct spare_cbdata {
uint64_t cb_guid;
zpool_handle_t *cb_zhp;
} spare_cbdata_t;
static boolean_t
find_vdev(nvlist_t *nv, uint64_t search)
{
uint64_t guid;
nvlist_t **child;
uint_t c, children;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
search == guid)
return (B_TRUE);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) == 0) {
for (c = 0; c < children; c++)
if (find_vdev(child[c], search))
return (B_TRUE);
}
return (B_FALSE);
}
static int
find_spare(zpool_handle_t *zhp, void *data)
{
spare_cbdata_t *cbp = data;
nvlist_t *config, *nvroot;
config = zpool_get_config(zhp, NULL);
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
if (find_vdev(nvroot, cbp->cb_guid)) {
cbp->cb_zhp = zhp;
return (1);
}
zpool_close(zhp);
return (0);
}
/*
* Print out configuration state as requested by status_callback.
*/
void
print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
int namewidth, int depth, boolean_t isspare)
{
nvlist_t **child;
uint_t c, vsc, children;
pool_scan_stat_t *ps = NULL;
vdev_stat_t *vs;
char rbuf[6], wbuf[6], cbuf[6];
char *vname;
uint64_t notpresent;
uint64_t ashift;
spare_cbdata_t cb;
const char *state;
+ char *type;
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
children = 0;
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+ if (strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+ return;
+
state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
if (isspare) {
/*
* For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for
* online drives.
*/
if (vs->vs_aux == VDEV_AUX_SPARED)
state = "INUSE";
else if (vs->vs_state == VDEV_STATE_HEALTHY)
state = "AVAIL";
}
(void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth,
name, state);
if (!isspare) {
zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
}
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
&notpresent) == 0 ||
vs->vs_state <= VDEV_STATE_CANT_OPEN) {
char *path;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0)
(void) printf(" was %s", path);
} else if (vs->vs_aux != 0) {
(void) printf(" ");
switch (vs->vs_aux) {
case VDEV_AUX_OPEN_FAILED:
(void) printf(gettext("cannot open"));
break;
case VDEV_AUX_BAD_GUID_SUM:
(void) printf(gettext("missing device"));
break;
case VDEV_AUX_NO_REPLICAS:
(void) printf(gettext("insufficient replicas"));
break;
case VDEV_AUX_VERSION_NEWER:
(void) printf(gettext("newer version"));
break;
case VDEV_AUX_UNSUP_FEAT:
(void) printf(gettext("unsupported feature(s)"));
break;
case VDEV_AUX_ASHIFT_TOO_BIG:
(void) printf(gettext("unsupported minimum blocksize"));
break;
case VDEV_AUX_SPARED:
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
&cb.cb_guid) == 0);
if (zpool_iter(g_zfs, find_spare, &cb) == 1) {
if (strcmp(zpool_get_name(cb.cb_zhp),
zpool_get_name(zhp)) == 0)
(void) printf(gettext("currently in "
"use"));
else
(void) printf(gettext("in use by "
"pool '%s'"),
zpool_get_name(cb.cb_zhp));
zpool_close(cb.cb_zhp);
} else {
(void) printf(gettext("currently in use"));
}
break;
case VDEV_AUX_ERR_EXCEEDED:
(void) printf(gettext("too many errors"));
break;
case VDEV_AUX_IO_FAILURE:
(void) printf(gettext("experienced I/O failures"));
break;
case VDEV_AUX_BAD_LOG:
(void) printf(gettext("bad intent log"));
break;
case VDEV_AUX_EXTERNAL:
(void) printf(gettext("external device fault"));
break;
case VDEV_AUX_SPLIT_POOL:
(void) printf(gettext("split into new pool"));
break;
default:
(void) printf(gettext("corrupted data"));
break;
}
} else if (children == 0 && !isspare &&
VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
vs->vs_configured_ashift < vs->vs_physical_ashift) {
(void) printf(
gettext(" block size: %dB configured, %dB native"),
1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift);
}
(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &c);
if (ps && ps->pss_state == DSS_SCANNING &&
vs->vs_scan_processed != 0 && children == 0) {
(void) printf(gettext(" (%s)"),
(ps->pss_func == POOL_SCAN_RESILVER) ?
"resilvering" : "repairing");
}
(void) printf("\n");
for (c = 0; c < children; c++) {
uint64_t islog = B_FALSE, ishole = B_FALSE;
/* Don't print logs or holes here */
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&islog);
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
&ishole);
if (islog || ishole)
continue;
vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
print_status_config(zhp, vname, child[c],
namewidth, depth + 2, isspare);
free(vname);
}
}
/*
* Print the configuration of an exported pool. Iterate over all vdevs in the
* pool, printing out the name and status for each one.
*/
void
print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
{
nvlist_t **child;
uint_t c, children;
vdev_stat_t *vs;
char *type, *vname;
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
if (strcmp(type, VDEV_TYPE_MISSING) == 0 ||
strcmp(type, VDEV_TYPE_HOLE) == 0)
return;
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
(void) printf("\t%*s%-*s", depth, "", namewidth - depth, name);
(void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux));
if (vs->vs_aux != 0) {
(void) printf(" ");
switch (vs->vs_aux) {
case VDEV_AUX_OPEN_FAILED:
(void) printf(gettext("cannot open"));
break;
case VDEV_AUX_BAD_GUID_SUM:
(void) printf(gettext("missing device"));
break;
case VDEV_AUX_NO_REPLICAS:
(void) printf(gettext("insufficient replicas"));
break;
case VDEV_AUX_VERSION_NEWER:
(void) printf(gettext("newer version"));
break;
case VDEV_AUX_UNSUP_FEAT:
(void) printf(gettext("unsupported feature(s)"));
break;
case VDEV_AUX_ERR_EXCEEDED:
(void) printf(gettext("too many errors"));
break;
default:
(void) printf(gettext("corrupted data"));
break;
}
}
(void) printf("\n");
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
return;
for (c = 0; c < children; c++) {
uint64_t is_log = B_FALSE;
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&is_log);
if (is_log)
continue;
vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE);
print_import_config(vname, child[c], namewidth, depth + 2);
free(vname);
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) == 0) {
(void) printf(gettext("\tcache\n"));
for (c = 0; c < children; c++) {
vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
(void) printf("\t %s\n", vname);
free(vname);
}
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
&child, &children) == 0) {
(void) printf(gettext("\tspares\n"));
for (c = 0; c < children; c++) {
vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
(void) printf("\t %s\n", vname);
free(vname);
}
}
}
/*
* Print log vdevs.
* Logs are recorded as top level vdevs in the main pool child array
* but with "is_log" set to 1. We use either print_status_config() or
* print_import_config() to print the top level logs then any log
* children (eg mirrored slogs) are printed recursively - which
* works because only the top level vdev is marked "is_log"
*/
static void
print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose)
{
uint_t c, children;
nvlist_t **child;
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child,
&children) != 0)
return;
(void) printf(gettext("\tlogs\n"));
for (c = 0; c < children; c++) {
uint64_t is_log = B_FALSE;
char *name;
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&is_log);
if (!is_log)
continue;
name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
if (verbose)
print_status_config(zhp, name, child[c], namewidth,
2, B_FALSE);
else
print_import_config(name, child[c], namewidth, 2);
free(name);
}
}
/*
* Display the status for the given pool.
*/
static void
show_import(nvlist_t *config)
{
uint64_t pool_state;
vdev_stat_t *vs;
char *name;
uint64_t guid;
char *msgid;
nvlist_t *nvroot;
int reason;
const char *health;
uint_t vsc;
int namewidth;
char *comment;
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
&name) == 0);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
&guid) == 0);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
&pool_state) == 0);
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
reason = zpool_import_status(config, &msgid);
(void) printf(gettext(" pool: %s\n"), name);
(void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid);
(void) printf(gettext(" state: %s"), health);
if (pool_state == POOL_STATE_DESTROYED)
(void) printf(gettext(" (DESTROYED)"));
(void) printf("\n");
switch (reason) {
case ZPOOL_STATUS_MISSING_DEV_R:
case ZPOOL_STATUS_MISSING_DEV_NR:
case ZPOOL_STATUS_BAD_GUID_SUM:
(void) printf(gettext(" status: One or more devices are "
"missing from the system.\n"));
break;
case ZPOOL_STATUS_CORRUPT_LABEL_R:
case ZPOOL_STATUS_CORRUPT_LABEL_NR:
(void) printf(gettext(" status: One or more devices contains "
"corrupted data.\n"));
break;
case ZPOOL_STATUS_CORRUPT_DATA:
(void) printf(
gettext(" status: The pool data is corrupted.\n"));
break;
case ZPOOL_STATUS_OFFLINE_DEV:
(void) printf(gettext(" status: One or more devices "
"are offlined.\n"));
break;
case ZPOOL_STATUS_CORRUPT_POOL:
(void) printf(gettext(" status: The pool metadata is "
"corrupted.\n"));
break;
case ZPOOL_STATUS_VERSION_OLDER:
(void) printf(gettext(" status: The pool is formatted using a "
"legacy on-disk version.\n"));
break;
case ZPOOL_STATUS_VERSION_NEWER:
(void) printf(gettext(" status: The pool is formatted using an "
"incompatible version.\n"));
break;
case ZPOOL_STATUS_FEAT_DISABLED:
(void) printf(gettext(" status: Some supported features are "
"not enabled on the pool.\n"));
break;
case ZPOOL_STATUS_UNSUP_FEAT_READ:
(void) printf(gettext("status: The pool uses the following "
"feature(s) not supported on this sytem:\n"));
zpool_print_unsup_feat(config);
break;
case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
(void) printf(gettext("status: The pool can only be accessed "
"in read-only mode on this system. It\n\tcannot be "
"accessed in read-write mode because it uses the "
"following\n\tfeature(s) not supported on this system:\n"));
zpool_print_unsup_feat(config);
break;
case ZPOOL_STATUS_HOSTID_MISMATCH:
(void) printf(gettext(" status: The pool was last accessed by "
"another system.\n"));
break;
case ZPOOL_STATUS_FAULTED_DEV_R:
case ZPOOL_STATUS_FAULTED_DEV_NR:
(void) printf(gettext(" status: One or more devices are "
"faulted.\n"));
break;
case ZPOOL_STATUS_BAD_LOG:
(void) printf(gettext(" status: An intent log record cannot be "
"read.\n"));
break;
case ZPOOL_STATUS_RESILVERING:
(void) printf(gettext(" status: One or more devices were being "
"resilvered.\n"));
break;
case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
(void) printf(gettext("status: One or more devices were "
"configured to use a non-native block size.\n"
"\tExpect reduced performance.\n"));
break;
default:
/*
* No other status can be seen when importing pools.
*/
assert(reason == ZPOOL_STATUS_OK);
}
/*
* Print out an action according to the overall state of the pool.
*/
if (vs->vs_state == VDEV_STATE_HEALTHY) {
if (reason == ZPOOL_STATUS_VERSION_OLDER ||
reason == ZPOOL_STATUS_FEAT_DISABLED) {
(void) printf(gettext(" action: The pool can be "
"imported using its name or numeric identifier, "
"though\n\tsome features will not be available "
"without an explicit 'zpool upgrade'.\n"));
} else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) {
(void) printf(gettext(" action: The pool can be "
"imported using its name or numeric "
"identifier and\n\tthe '-f' flag.\n"));
} else {
(void) printf(gettext(" action: The pool can be "
"imported using its name or numeric "
"identifier.\n"));
}
} else if (vs->vs_state == VDEV_STATE_DEGRADED) {
(void) printf(gettext(" action: The pool can be imported "
"despite missing or damaged devices. The\n\tfault "
"tolerance of the pool may be compromised if imported.\n"));
} else {
switch (reason) {
case ZPOOL_STATUS_VERSION_NEWER:
(void) printf(gettext(" action: The pool cannot be "
"imported. Access the pool on a system running "
"newer\n\tsoftware, or recreate the pool from "
"backup.\n"));
break;
case ZPOOL_STATUS_UNSUP_FEAT_READ:
(void) printf(gettext("action: The pool cannot be "
"imported. Access the pool on a system that "
"supports\n\tthe required feature(s), or recreate "
"the pool from backup.\n"));
break;
case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
(void) printf(gettext("action: The pool cannot be "
"imported in read-write mode. Import the pool "
"with\n"
"\t\"-o readonly=on\", access the pool on a system "
"that supports the\n\trequired feature(s), or "
"recreate the pool from backup.\n"));
break;
case ZPOOL_STATUS_MISSING_DEV_R:
case ZPOOL_STATUS_MISSING_DEV_NR:
case ZPOOL_STATUS_BAD_GUID_SUM:
(void) printf(gettext(" action: The pool cannot be "
"imported. Attach the missing\n\tdevices and try "
"again.\n"));
break;
default:
(void) printf(gettext(" action: The pool cannot be "
"imported due to damaged devices or data.\n"));
}
}
/* Print the comment attached to the pool. */
if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
(void) printf(gettext("comment: %s\n"), comment);
/*
* If the state is "closed" or "can't open", and the aux state
* is "corrupt data":
*/
if (((vs->vs_state == VDEV_STATE_CLOSED) ||
(vs->vs_state == VDEV_STATE_CANT_OPEN)) &&
(vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) {
if (pool_state == POOL_STATE_DESTROYED)
(void) printf(gettext("\tThe pool was destroyed, "
"but can be imported using the '-Df' flags.\n"));
else if (pool_state != POOL_STATE_EXPORTED)
(void) printf(gettext("\tThe pool may be active on "
"another system, but can be imported using\n\t"
"the '-f' flag.\n"));
}
if (msgid != NULL)
(void) printf(gettext(" see: http://illumos.org/msg/%s\n"),
msgid);
(void) printf(gettext(" config:\n\n"));
namewidth = max_width(NULL, nvroot, 0, 0);
if (namewidth < 10)
namewidth = 10;
print_import_config(name, nvroot, namewidth, 0);
if (num_logs(nvroot) > 0)
print_logs(NULL, nvroot, namewidth, B_FALSE);
if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
(void) printf(gettext("\n\tAdditional devices are known to "
"be part of this pool, though their\n\texact "
"configuration cannot be determined.\n"));
}
}
/*
* Perform the import for the given configuration. This passes the heavy
* lifting off to zpool_import_props(), and then mounts the datasets contained
* within the pool.
*/
static int
do_import(nvlist_t *config, const char *newname, const char *mntopts,
nvlist_t *props, int flags)
{
zpool_handle_t *zhp;
char *name;
uint64_t state;
uint64_t version;
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
&name) == 0);
verify(nvlist_lookup_uint64(config,
ZPOOL_CONFIG_POOL_STATE, &state) == 0);
verify(nvlist_lookup_uint64(config,
ZPOOL_CONFIG_VERSION, &version) == 0);
if (!SPA_VERSION_IS_SUPPORTED(version)) {
(void) fprintf(stderr, gettext("cannot import '%s': pool "
"is formatted using an unsupported ZFS version\n"), name);
return (1);
} else if (state != POOL_STATE_EXPORTED &&
!(flags & ZFS_IMPORT_ANY_HOST)) {
uint64_t hostid;
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID,
&hostid) == 0) {
if ((unsigned long)hostid != gethostid()) {
char *hostname;
uint64_t timestamp;
time_t t;
verify(nvlist_lookup_string(config,
ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
verify(nvlist_lookup_uint64(config,
ZPOOL_CONFIG_TIMESTAMP, &timestamp) == 0);
t = timestamp;
(void) fprintf(stderr, gettext("cannot import "
"'%s': pool may be in use from other "
"system, it was last accessed by %s "
"(hostid: 0x%lx) on %s"), name, hostname,
(unsigned long)hostid,
asctime(localtime(&t)));
(void) fprintf(stderr, gettext("use '-f' to "
"import anyway\n"));
return (1);
}
} else {
(void) fprintf(stderr, gettext("cannot import '%s': "
"pool may be in use from other system\n"), name);
(void) fprintf(stderr, gettext("use '-f' to import "
"anyway\n"));
return (1);
}
}
if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
return (1);
if (newname != NULL)
name = (char *)newname;
if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
return (1);
if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
!(flags & ZFS_IMPORT_ONLY) &&
zpool_enable_datasets(zhp, mntopts, 0) != 0) {
zpool_close(zhp);
return (1);
}
zpool_close(zhp);
return (0);
}
/*
* zpool import [-d dir] [-D]
* import [-o mntopts] [-o prop=value] ... [-R root] [-D]
* [-d dir | -c cachefile] [-f] -a
* import [-o mntopts] [-o prop=value] ... [-R root] [-D]
* [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool]
*
* -c Read pool information from a cachefile instead of searching
* devices.
*
* -d Scan in a specific directory, other than /dev/dsk. More than
* one directory can be specified using multiple '-d' options.
*
* -D Scan for previously destroyed pools or import all or only
* specified destroyed pools.
*
* -R Temporarily import the pool, with all mountpoints relative to
* the given root. The pool will remain exported when the machine
* is rebooted.
*
* -V Import even in the presence of faulted vdevs. This is an
* intentionally undocumented option for testing purposes, and
* treats the pool configuration as complete, leaving any bad
* vdevs in the FAULTED state. In other words, it does verbatim
* import.
*
* -f Force import, even if it appears that the pool is active.
*
* -F Attempt rewind if necessary.
*
* -n See if rewind would work, but don't actually rewind.
*
* -N Import the pool but don't mount datasets.
*
* -T Specify a starting txg to use for import. This option is
* intentionally undocumented option for testing purposes.
*
* -a Import all pools found.
*
* -o Set property=value and/or temporary mount options (without '=').
*
* The import command scans for pools to import, and import pools based on pool
* name and GUID. The pool can also be renamed as part of the import process.
*/
int
zpool_do_import(int argc, char **argv)
{
char **searchdirs = NULL;
int nsearch = 0;
int c;
int err = 0;
nvlist_t *pools = NULL;
boolean_t do_all = B_FALSE;
boolean_t do_destroyed = B_FALSE;
char *mntopts = NULL;
nvpair_t *elem;
nvlist_t *config;
uint64_t searchguid = 0;
char *searchname = NULL;
char *propval;
nvlist_t *found_config;
nvlist_t *policy = NULL;
nvlist_t *props = NULL;
boolean_t first;
int flags = ZFS_IMPORT_NORMAL;
uint32_t rewind_policy = ZPOOL_NO_REWIND;
boolean_t dryrun = B_FALSE;
boolean_t do_rewind = B_FALSE;
boolean_t xtreme_rewind = B_FALSE;
uint64_t pool_state, txg = -1ULL;
char *cachefile = NULL;
importargs_t idata = { 0 };
char *endptr;
/* check options */
while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:T:VX")) != -1) {
switch (c) {
case 'a':
do_all = B_TRUE;
break;
case 'c':
cachefile = optarg;
break;
case 'd':
if (searchdirs == NULL) {
searchdirs = safe_malloc(sizeof (char *));
} else {
char **tmp = safe_malloc((nsearch + 1) *
sizeof (char *));
bcopy(searchdirs, tmp, nsearch *
sizeof (char *));
free(searchdirs);
searchdirs = tmp;
}
searchdirs[nsearch++] = optarg;
break;
case 'D':
do_destroyed = B_TRUE;
break;
case 'f':
flags |= ZFS_IMPORT_ANY_HOST;
break;
case 'F':
do_rewind = B_TRUE;
break;
case 'm':
flags |= ZFS_IMPORT_MISSING_LOG;
break;
case 'n':
dryrun = B_TRUE;
break;
case 'N':
flags |= ZFS_IMPORT_ONLY;
break;
case 'o':
if ((propval = strchr(optarg, '=')) != NULL) {
*propval = '\0';
propval++;
if (add_prop_list(optarg, propval,
&props, B_TRUE))
goto error;
} else {
mntopts = optarg;
}
break;
case 'R':
if (add_prop_list(zpool_prop_to_name(
ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
goto error;
if (nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
&propval) == 0)
break;
if (add_prop_list(zpool_prop_to_name(
ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
goto error;
break;
case 'T':
errno = 0;
txg = strtoull(optarg, &endptr, 0);
if (errno != 0 || *endptr != '\0') {
(void) fprintf(stderr,
gettext("invalid txg value\n"));
usage(B_FALSE);
}
rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND;
break;
case 'V':
flags |= ZFS_IMPORT_VERBATIM;
break;
case 'X':
xtreme_rewind = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (cachefile && nsearch != 0) {
(void) fprintf(stderr, gettext("-c is incompatible with -d\n"));
usage(B_FALSE);
}
if ((dryrun || xtreme_rewind) && !do_rewind) {
(void) fprintf(stderr,
gettext("-n or -X only meaningful with -F\n"));
usage(B_FALSE);
}
if (dryrun)
rewind_policy = ZPOOL_TRY_REWIND;
else if (do_rewind)
rewind_policy = ZPOOL_DO_REWIND;
if (xtreme_rewind)
rewind_policy |= ZPOOL_EXTREME_REWIND;
/* In the future, we can capture further policy and include it here */
if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg) != 0 ||
nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
goto error;
if (searchdirs == NULL) {
searchdirs = safe_malloc(sizeof (char *));
searchdirs[0] = "/dev";
nsearch = 1;
}
/* check argument count */
if (do_all) {
if (argc != 0) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
} else {
if (argc > 2) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
/*
* Check for the SYS_CONFIG privilege. We do this explicitly
* here because otherwise any attempt to discover pools will
* silently fail.
*/
if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) {
(void) fprintf(stderr, gettext("cannot "
"discover pools: permission denied\n"));
free(searchdirs);
nvlist_free(policy);
return (1);
}
}
/*
* Depending on the arguments given, we do one of the following:
*
* <none> Iterate through all pools and display information about
* each one.
*
* -a Iterate through all pools and try to import each one.
*
* <id> Find the pool that corresponds to the given GUID/pool
* name and import that one.
*
* -D Above options applies only to destroyed pools.
*/
if (argc != 0) {
char *endptr;
errno = 0;
searchguid = strtoull(argv[0], &endptr, 10);
if (errno != 0 || *endptr != '\0') {
searchname = argv[0];
searchguid = 0;
}
found_config = NULL;
/*
* User specified a name or guid. Ensure it's unique.
*/
idata.unique = B_TRUE;
}
idata.path = searchdirs;
idata.paths = nsearch;
idata.poolname = searchname;
idata.guid = searchguid;
idata.cachefile = cachefile;
pools = zpool_search_import(g_zfs, &idata);
if (pools != NULL && idata.exists &&
(argc == 1 || strcmp(argv[0], argv[1]) == 0)) {
(void) fprintf(stderr, gettext("cannot import '%s': "
"a pool with that name already exists\n"),
argv[0]);
(void) fprintf(stderr, gettext("use the form '%s "
"<pool | id> <newpool>' to give it a new name\n"),
"zpool import");
err = 1;
} else if (pools == NULL && idata.exists) {
(void) fprintf(stderr, gettext("cannot import '%s': "
"a pool with that name is already created/imported,\n"),
argv[0]);
(void) fprintf(stderr, gettext("and no additional pools "
"with that name were found\n"));
err = 1;
} else if (pools == NULL) {
if (argc != 0) {
(void) fprintf(stderr, gettext("cannot import '%s': "
"no such pool available\n"), argv[0]);
}
err = 1;
}
if (err == 1) {
free(searchdirs);
nvlist_free(policy);
return (1);
}
/*
* At this point we have a list of import candidate configs. Even if
* we were searching by pool name or guid, we still need to
* post-process the list to deal with pool state and possible
* duplicate names.
*/
err = 0;
elem = NULL;
first = B_TRUE;
while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
verify(nvpair_value_nvlist(elem, &config) == 0);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
&pool_state) == 0);
if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
continue;
if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
continue;
verify(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY,
policy) == 0);
if (argc == 0) {
if (first)
first = B_FALSE;
else if (!do_all)
(void) printf("\n");
if (do_all) {
err |= do_import(config, NULL, mntopts,
props, flags);
} else {
show_import(config);
}
} else if (searchname != NULL) {
char *name;
/*
* We are searching for a pool based on name.
*/
verify(nvlist_lookup_string(config,
ZPOOL_CONFIG_POOL_NAME, &name) == 0);
if (strcmp(name, searchname) == 0) {
if (found_config != NULL) {
(void) fprintf(stderr, gettext(
"cannot import '%s': more than "
"one matching pool\n"), searchname);
(void) fprintf(stderr, gettext(
"import by numeric ID instead\n"));
err = B_TRUE;
}
found_config = config;
}
} else {
uint64_t guid;
/*
* Search for a pool by guid.
*/
verify(nvlist_lookup_uint64(config,
ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
if (guid == searchguid)
found_config = config;
}
}
/*
* If we were searching for a specific pool, verify that we found a
* pool, and then do the import.
*/
if (argc != 0 && err == 0) {
if (found_config == NULL) {
(void) fprintf(stderr, gettext("cannot import '%s': "
"no such pool available\n"), argv[0]);
err = B_TRUE;
} else {
err |= do_import(found_config, argc == 1 ? NULL :
argv[1], mntopts, props, flags);
}
}
/*
* If we were just looking for pools, report an error if none were
* found.
*/
if (argc == 0 && first)
(void) fprintf(stderr,
gettext("no pools available to import\n"));
error:
nvlist_free(props);
nvlist_free(pools);
nvlist_free(policy);
free(searchdirs);
return (err ? 1 : 0);
}
typedef struct iostat_cbdata {
boolean_t cb_verbose;
int cb_namewidth;
int cb_iteration;
zpool_list_t *cb_list;
} iostat_cbdata_t;
static void
print_iostat_separator(iostat_cbdata_t *cb)
{
int i = 0;
for (i = 0; i < cb->cb_namewidth; i++)
(void) printf("-");
(void) printf(" ----- ----- ----- ----- ----- -----\n");
}
static void
print_iostat_header(iostat_cbdata_t *cb)
{
(void) printf("%*s capacity operations bandwidth\n",
cb->cb_namewidth, "");
(void) printf("%-*s alloc free read write read write\n",
cb->cb_namewidth, "pool");
print_iostat_separator(cb);
}
/*
* Display a single statistic.
*/
static void
print_one_stat(uint64_t value)
{
char buf[64];
zfs_nicenum(value, buf, sizeof (buf));
(void) printf(" %5s", buf);
}
/*
* Print out all the statistics for the given vdev. This can either be the
* toplevel configuration, or called recursively. If 'name' is NULL, then this
* is a verbose output, and we don't want to display the toplevel pool stats.
*/
void
print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
nvlist_t *newnv, iostat_cbdata_t *cb, int depth)
{
nvlist_t **oldchild, **newchild;
uint_t c, children;
vdev_stat_t *oldvs, *newvs;
vdev_stat_t zerovs = { 0 };
uint64_t tdelta;
double scale;
char *vname;
+ if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
+ return;
+
if (oldnv != NULL) {
verify(nvlist_lookup_uint64_array(oldnv,
ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0);
} else {
oldvs = &zerovs;
}
verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&newvs, &c) == 0);
if (strlen(name) + depth > cb->cb_namewidth)
(void) printf("%*s%s", depth, "", name);
else
(void) printf("%*s%s%*s", depth, "", name,
(int)(cb->cb_namewidth - strlen(name) - depth), "");
tdelta = newvs->vs_timestamp - oldvs->vs_timestamp;
if (tdelta == 0)
scale = 1.0;
else
scale = (double)NANOSEC / tdelta;
/* only toplevel vdevs have capacity stats */
if (newvs->vs_space == 0) {
(void) printf(" - -");
} else {
print_one_stat(newvs->vs_alloc);
print_one_stat(newvs->vs_space - newvs->vs_alloc);
}
print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] -
oldvs->vs_ops[ZIO_TYPE_READ])));
print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] -
oldvs->vs_ops[ZIO_TYPE_WRITE])));
print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] -
oldvs->vs_bytes[ZIO_TYPE_READ])));
print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] -
oldvs->vs_bytes[ZIO_TYPE_WRITE])));
(void) printf("\n");
if (!cb->cb_verbose)
return;
if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN,
&newchild, &children) != 0)
return;
if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
&oldchild, &c) != 0)
return;
for (c = 0; c < children; c++) {
uint64_t ishole = B_FALSE, islog = B_FALSE;
(void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE,
&ishole);
(void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG,
&islog);
if (ishole || islog)
continue;
vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE);
print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
newchild[c], cb, depth + 2);
free(vname);
}
/*
* Log device section
*/
if (num_logs(newnv) > 0) {
(void) printf("%-*s - - - - - "
"-\n", cb->cb_namewidth, "logs");
for (c = 0; c < children; c++) {
uint64_t islog = B_FALSE;
(void) nvlist_lookup_uint64(newchild[c],
ZPOOL_CONFIG_IS_LOG, &islog);
if (islog) {
vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
B_FALSE);
print_vdev_stats(zhp, vname, oldnv ?
oldchild[c] : NULL, newchild[c],
cb, depth + 2);
free(vname);
}
}
}
/*
* Include level 2 ARC devices in iostat output
*/
if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE,
&newchild, &children) != 0)
return;
if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE,
&oldchild, &c) != 0)
return;
if (children > 0) {
(void) printf("%-*s - - - - - "
"-\n", cb->cb_namewidth, "cache");
for (c = 0; c < children; c++) {
vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
B_FALSE);
print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
newchild[c], cb, depth + 2);
free(vname);
}
}
}
static int
refresh_iostat(zpool_handle_t *zhp, void *data)
{
iostat_cbdata_t *cb = data;
boolean_t missing;
/*
* If the pool has disappeared, remove it from the list and continue.
*/
if (zpool_refresh_stats(zhp, &missing) != 0)
return (-1);
if (missing)
pool_list_remove(cb->cb_list, zhp);
return (0);
}
/*
* Callback to print out the iostats for the given pool.
*/
int
print_iostat(zpool_handle_t *zhp, void *data)
{
iostat_cbdata_t *cb = data;
nvlist_t *oldconfig, *newconfig;
nvlist_t *oldnvroot, *newnvroot;
newconfig = zpool_get_config(zhp, &oldconfig);
if (cb->cb_iteration == 1)
oldconfig = NULL;
verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE,
&newnvroot) == 0);
if (oldconfig == NULL)
oldnvroot = NULL;
else
verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE,
&oldnvroot) == 0);
/*
* Print out the statistics for the pool.
*/
print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0);
if (cb->cb_verbose)
print_iostat_separator(cb);
return (0);
}
int
get_namewidth(zpool_handle_t *zhp, void *data)
{
iostat_cbdata_t *cb = data;
nvlist_t *config, *nvroot;
if ((config = zpool_get_config(zhp, NULL)) != NULL) {
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
if (!cb->cb_verbose)
cb->cb_namewidth = strlen(zpool_get_name(zhp));
else
cb->cb_namewidth = max_width(zhp, nvroot, 0,
cb->cb_namewidth);
}
/*
* The width must fall into the range [10,38]. The upper limit is the
* maximum we can have and still fit in 80 columns.
*/
if (cb->cb_namewidth < 10)
cb->cb_namewidth = 10;
if (cb->cb_namewidth > 38)
cb->cb_namewidth = 38;
return (0);
}
/*
* Parse the input string, get the 'interval' and 'count' value if there is one.
*/
static void
get_interval_count(int *argcp, char **argv, unsigned long *iv,
unsigned long *cnt)
{
unsigned long interval = 0, count = 0;
int argc = *argcp, errno;
/*
* Determine if the last argument is an integer or a pool name
*/
if (argc > 0 && isdigit(argv[argc - 1][0])) {
char *end;
errno = 0;
interval = strtoul(argv[argc - 1], &end, 10);
if (*end == '\0' && errno == 0) {
if (interval == 0) {
(void) fprintf(stderr, gettext("interval "
"cannot be zero\n"));
usage(B_FALSE);
}
/*
* Ignore the last parameter
*/
argc--;
} else {
/*
* If this is not a valid number, just plow on. The
* user will get a more informative error message later
* on.
*/
interval = 0;
}
}
/*
* If the last argument is also an integer, then we have both a count
* and an interval.
*/
if (argc > 0 && isdigit(argv[argc - 1][0])) {
char *end;
errno = 0;
count = interval;
interval = strtoul(argv[argc - 1], &end, 10);
if (*end == '\0' && errno == 0) {
if (interval == 0) {
(void) fprintf(stderr, gettext("interval "
"cannot be zero\n"));
usage(B_FALSE);
}
/*
* Ignore the last parameter
*/
argc--;
} else {
interval = 0;
}
}
*iv = interval;
*cnt = count;
*argcp = argc;
}
static void
get_timestamp_arg(char c)
{
if (c == 'u')
timestamp_fmt = UDATE;
else if (c == 'd')
timestamp_fmt = DDATE;
else
usage(B_FALSE);
}
/*
* zpool iostat [-v] [-T d|u] [pool] ... [interval [count]]
*
* -v Display statistics for individual vdevs
* -T Display a timestamp in date(1) or Unix format
*
* This command can be tricky because we want to be able to deal with pool
* creation/destruction as well as vdev configuration changes. The bulk of this
* processing is handled by the pool_list_* routines in zpool_iter.c. We rely
* on pool_list_update() to detect the addition of new pools. Configuration
* changes are all handled within libzfs.
*/
int
zpool_do_iostat(int argc, char **argv)
{
int c;
int ret;
int npools;
unsigned long interval = 0, count = 0;
zpool_list_t *list;
boolean_t verbose = B_FALSE;
iostat_cbdata_t cb;
/* check options */
while ((c = getopt(argc, argv, "T:v")) != -1) {
switch (c) {
case 'T':
get_timestamp_arg(*optarg);
break;
case 'v':
verbose = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
get_interval_count(&argc, argv, &interval, &count);
/*
* Construct the list of all interesting pools.
*/
ret = 0;
if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL)
return (1);
if (pool_list_count(list) == 0 && argc != 0) {
pool_list_free(list);
return (1);
}
if (pool_list_count(list) == 0 && interval == 0) {
pool_list_free(list);
(void) fprintf(stderr, gettext("no pools available\n"));
return (1);
}
/*
* Enter the main iostat loop.
*/
cb.cb_list = list;
cb.cb_verbose = verbose;
cb.cb_iteration = 0;
cb.cb_namewidth = 0;
for (;;) {
pool_list_update(list);
if ((npools = pool_list_count(list)) == 0)
break;
/*
* Refresh all statistics. This is done as an explicit step
* before calculating the maximum name width, so that any
* configuration changes are properly accounted for.
*/
(void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb);
/*
* Iterate over all pools to determine the maximum width
* for the pool / device name column across all pools.
*/
cb.cb_namewidth = 0;
(void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
if (timestamp_fmt != NODATE)
print_timestamp(timestamp_fmt);
/*
* If it's the first time, or verbose mode, print the header.
*/
if (++cb.cb_iteration == 1 || verbose)
print_iostat_header(&cb);
(void) pool_list_iter(list, B_FALSE, print_iostat, &cb);
/*
* If there's more than one pool, and we're not in verbose mode
* (which prints a separator for us), then print a separator.
*/
if (npools > 1 && !verbose)
print_iostat_separator(&cb);
if (verbose)
(void) printf("\n");
/*
* Flush the output so that redirection to a file isn't buffered
* indefinitely.
*/
(void) fflush(stdout);
if (interval == 0)
break;
if (count != 0 && --count == 0)
break;
(void) sleep(interval);
}
pool_list_free(list);
return (ret);
}
typedef struct list_cbdata {
boolean_t cb_verbose;
int cb_namewidth;
boolean_t cb_scripted;
zprop_list_t *cb_proplist;
boolean_t cb_literal;
} list_cbdata_t;
/*
* Given a list of columns to display, output appropriate headers for each one.
*/
static void
print_header(list_cbdata_t *cb)
{
zprop_list_t *pl = cb->cb_proplist;
char headerbuf[ZPOOL_MAXPROPLEN];
const char *header;
boolean_t first = B_TRUE;
boolean_t right_justify;
size_t width = 0;
for (; pl != NULL; pl = pl->pl_next) {
width = pl->pl_width;
if (first && cb->cb_verbose) {
/*
* Reset the width to accommodate the verbose listing
* of devices.
*/
width = cb->cb_namewidth;
}
if (!first)
(void) printf(" ");
else
first = B_FALSE;
right_justify = B_FALSE;
if (pl->pl_prop != ZPROP_INVAL) {
header = zpool_prop_column_name(pl->pl_prop);
right_justify = zpool_prop_align_right(pl->pl_prop);
} else {
int i;
for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
headerbuf[i] = toupper(pl->pl_user_prop[i]);
headerbuf[i] = '\0';
header = headerbuf;
}
if (pl->pl_next == NULL && !right_justify)
(void) printf("%s", header);
else if (right_justify)
(void) printf("%*s", width, header);
else
(void) printf("%-*s", width, header);
}
(void) printf("\n");
}
/*
* Given a pool and a list of properties, print out all the properties according
* to the described layout.
*/
static void
print_pool(zpool_handle_t *zhp, list_cbdata_t *cb)
{
zprop_list_t *pl = cb->cb_proplist;
boolean_t first = B_TRUE;
char property[ZPOOL_MAXPROPLEN];
char *propstr;
boolean_t right_justify;
size_t width;
for (; pl != NULL; pl = pl->pl_next) {
width = pl->pl_width;
if (first && cb->cb_verbose) {
/*
* Reset the width to accommodate the verbose listing
* of devices.
*/
width = cb->cb_namewidth;
}
if (!first) {
if (cb->cb_scripted)
(void) printf("\t");
else
(void) printf(" ");
} else {
first = B_FALSE;
}
right_justify = B_FALSE;
if (pl->pl_prop != ZPROP_INVAL) {
if (zpool_get_prop(zhp, pl->pl_prop, property,
sizeof (property), NULL, cb->cb_literal) != 0)
propstr = "-";
else
propstr = property;
right_justify = zpool_prop_align_right(pl->pl_prop);
} else if ((zpool_prop_feature(pl->pl_user_prop) ||
zpool_prop_unsupported(pl->pl_user_prop)) &&
zpool_prop_get_feature(zhp, pl->pl_user_prop, property,
sizeof (property)) == 0) {
propstr = property;
} else {
propstr = "-";
}
/*
* If this is being called in scripted mode, or if this is the
* last column and it is left-justified, don't include a width
* format specifier.
*/
if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
(void) printf("%s", propstr);
else if (right_justify)
(void) printf("%*s", width, propstr);
else
(void) printf("%-*s", width, propstr);
}
(void) printf("\n");
}
static void
print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted,
boolean_t valid)
{
char propval[64];
boolean_t fixed;
size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL);
switch (prop) {
case ZPOOL_PROP_EXPANDSZ:
if (value == 0)
(void) strlcpy(propval, "-", sizeof (propval));
else
zfs_nicenum(value, propval, sizeof (propval));
break;
case ZPOOL_PROP_FRAGMENTATION:
if (value == ZFS_FRAG_INVALID) {
(void) strlcpy(propval, "-", sizeof (propval));
} else {
(void) snprintf(propval, sizeof (propval), "%llu%%",
value);
}
break;
case ZPOOL_PROP_CAPACITY:
(void) snprintf(propval, sizeof (propval), "%llu%%", value);
break;
default:
zfs_nicenum(value, propval, sizeof (propval));
}
if (!valid)
(void) strlcpy(propval, "-", sizeof (propval));
if (scripted)
(void) printf("\t%s", propval);
else
(void) printf(" %*s", width, propval);
}
void
print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
list_cbdata_t *cb, int depth)
{
nvlist_t **child;
vdev_stat_t *vs;
uint_t c, children;
char *vname;
boolean_t scripted = cb->cb_scripted;
uint64_t islog = B_FALSE;
boolean_t haslog = B_FALSE;
char *dashes = "%-*s - - - - - -\n";
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
if (name != NULL) {
boolean_t toplevel = (vs->vs_space != 0);
uint64_t cap;
+ if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
+ return;
+
if (scripted)
(void) printf("\t%s", name);
else if (strlen(name) + depth > cb->cb_namewidth)
(void) printf("%*s%s", depth, "", name);
else
(void) printf("%*s%s%*s", depth, "", name,
(int)(cb->cb_namewidth - strlen(name) - depth), "");
/*
* Print the properties for the individual vdevs. Some
* properties are only applicable to toplevel vdevs. The
* 'toplevel' boolean value is passed to the print_one_column()
* to indicate that the value is valid.
*/
print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted,
toplevel);
print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, scripted,
toplevel);
print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc,
scripted, toplevel);
print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted,
B_TRUE);
print_one_column(ZPOOL_PROP_FRAGMENTATION,
vs->vs_fragmentation, scripted,
(vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel));
cap = (vs->vs_space == 0) ? 0 :
(vs->vs_alloc * 100 / vs->vs_space);
print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel);
(void) printf("\n");
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
return;
for (c = 0; c < children; c++) {
uint64_t ishole = B_FALSE;
if (nvlist_lookup_uint64(child[c],
ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole)
continue;
if (nvlist_lookup_uint64(child[c],
ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) {
haslog = B_TRUE;
continue;
}
vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
print_list_stats(zhp, vname, child[c], cb, depth + 2);
free(vname);
}
if (haslog == B_TRUE) {
/* LINTED E_SEC_PRINTF_VAR_FMT */
(void) printf(dashes, cb->cb_namewidth, "log");
for (c = 0; c < children; c++) {
if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&islog) != 0 || !islog)
continue;
vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
print_list_stats(zhp, vname, child[c], cb, depth + 2);
free(vname);
}
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) == 0 && children > 0) {
/* LINTED E_SEC_PRINTF_VAR_FMT */
(void) printf(dashes, cb->cb_namewidth, "cache");
for (c = 0; c < children; c++) {
vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
print_list_stats(zhp, vname, child[c], cb, depth + 2);
free(vname);
}
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child,
&children) == 0 && children > 0) {
/* LINTED E_SEC_PRINTF_VAR_FMT */
(void) printf(dashes, cb->cb_namewidth, "spare");
for (c = 0; c < children; c++) {
vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
print_list_stats(zhp, vname, child[c], cb, depth + 2);
free(vname);
}
}
}
/*
* Generic callback function to list a pool.
*/
int
list_callback(zpool_handle_t *zhp, void *data)
{
list_cbdata_t *cbp = data;
nvlist_t *config;
nvlist_t *nvroot;
config = zpool_get_config(zhp, NULL);
print_pool(zhp, cbp);
if (!cbp->cb_verbose)
return (0);
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
print_list_stats(zhp, NULL, nvroot, cbp, 0);
return (0);
}
/*
* zpool list [-Hp] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
*
* -H Scripted mode. Don't display headers, and separate properties
* by a single tab.
* -o List of properties to display. Defaults to
* "name,size,allocated,free,expandsize,fragmentation,capacity,"
* "dedupratio,health,altroot"
* -p Diplay values in parsable (exact) format.
* -T Display a timestamp in date(1) or Unix format
*
* List all pools in the system, whether or not they're healthy. Output space
* statistics for each one, as well as health status summary.
*/
int
zpool_do_list(int argc, char **argv)
{
int c;
int ret;
list_cbdata_t cb = { 0 };
static char default_props[] =
"name,size,allocated,free,expandsize,fragmentation,capacity,"
"dedupratio,health,altroot";
char *props = default_props;
unsigned long interval = 0, count = 0;
zpool_list_t *list;
boolean_t first = B_TRUE;
/* check options */
while ((c = getopt(argc, argv, ":Ho:pT:v")) != -1) {
switch (c) {
case 'H':
cb.cb_scripted = B_TRUE;
break;
case 'o':
props = optarg;
break;
case 'p':
cb.cb_literal = B_TRUE;
break;
case 'T':
get_timestamp_arg(*optarg);
break;
case 'v':
cb.cb_verbose = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
get_interval_count(&argc, argv, &interval, &count);
if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0)
usage(B_FALSE);
for (;;) {
if ((list = pool_list_get(argc, argv, &cb.cb_proplist,
&ret)) == NULL)
return (1);
if (pool_list_count(list) == 0)
break;
cb.cb_namewidth = 0;
(void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
if (timestamp_fmt != NODATE)
print_timestamp(timestamp_fmt);
if (!cb.cb_scripted && (first || cb.cb_verbose)) {
print_header(&cb);
first = B_FALSE;
}
ret = pool_list_iter(list, B_TRUE, list_callback, &cb);
if (interval == 0)
break;
if (count != 0 && --count == 0)
break;
pool_list_free(list);
(void) sleep(interval);
}
if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) {
(void) printf(gettext("no pools available\n"));
ret = 0;
}
pool_list_free(list);
zprop_free_list(cb.cb_proplist);
return (ret);
}
static int
zpool_do_attach_or_replace(int argc, char **argv, int replacing)
{
boolean_t force = B_FALSE;
int c;
nvlist_t *nvroot;
char *poolname, *old_disk, *new_disk;
zpool_handle_t *zhp;
zpool_boot_label_t boot_type;
uint64_t boot_size;
int ret;
/* check options */
while ((c = getopt(argc, argv, "f")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* get pool name and check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name argument\n"));
usage(B_FALSE);
}
poolname = argv[0];
if (argc < 2) {
(void) fprintf(stderr,
gettext("missing <device> specification\n"));
usage(B_FALSE);
}
old_disk = argv[1];
if (argc < 3) {
if (!replacing) {
(void) fprintf(stderr,
gettext("missing <new_device> specification\n"));
usage(B_FALSE);
}
new_disk = old_disk;
argc -= 1;
argv += 1;
} else {
new_disk = argv[2];
argc -= 2;
argv += 2;
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
if (zpool_get_config(zhp, NULL) == NULL) {
(void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
poolname);
zpool_close(zhp);
return (1);
}
if (zpool_is_bootable(zhp))
boot_type = ZPOOL_COPY_BOOT_LABEL;
else
boot_type = ZPOOL_NO_BOOT_LABEL;
boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL);
nvroot = make_root_vdev(zhp, force, B_FALSE, replacing, B_FALSE,
boot_type, boot_size, argc, argv);
if (nvroot == NULL) {
zpool_close(zhp);
return (1);
}
ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing);
nvlist_free(nvroot);
zpool_close(zhp);
return (ret);
}
/*
* zpool replace [-f] <pool> <device> <new_device>
*
* -f Force attach, even if <new_device> appears to be in use.
*
* Replace <device> with <new_device>.
*/
/* ARGSUSED */
int
zpool_do_replace(int argc, char **argv)
{
return (zpool_do_attach_or_replace(argc, argv, B_TRUE));
}
/*
* zpool attach [-f] <pool> <device> <new_device>
*
* -f Force attach, even if <new_device> appears to be in use.
*
* Attach <new_device> to the mirror containing <device>. If <device> is not
* part of a mirror, then <device> will be transformed into a mirror of
* <device> and <new_device>. In either case, <new_device> will begin life
* with a DTL of [0, now], and will immediately begin to resilver itself.
*/
int
zpool_do_attach(int argc, char **argv)
{
return (zpool_do_attach_or_replace(argc, argv, B_FALSE));
}
/*
* zpool detach [-f] <pool> <device>
*
* -f Force detach of <device>, even if DTLs argue against it
* (not supported yet)
*
* Detach a device from a mirror. The operation will be refused if <device>
* is the last device in the mirror, or if the DTLs indicate that this device
* has the only valid copy of some data.
*/
/* ARGSUSED */
int
zpool_do_detach(int argc, char **argv)
{
int c;
char *poolname, *path;
zpool_handle_t *zhp;
int ret;
/* check options */
while ((c = getopt(argc, argv, "f")) != -1) {
switch (c) {
case 'f':
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* get pool name and check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name argument\n"));
usage(B_FALSE);
}
if (argc < 2) {
(void) fprintf(stderr,
gettext("missing <device> specification\n"));
usage(B_FALSE);
}
poolname = argv[0];
path = argv[1];
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
ret = zpool_vdev_detach(zhp, path);
zpool_close(zhp);
return (ret);
}
/*
* zpool split [-n] [-o prop=val] ...
* [-o mntopt] ...
* [-R altroot] <pool> <newpool> [<device> ...]
*
* -n Do not split the pool, but display the resulting layout if
* it were to be split.
* -o Set property=value, or set mount options.
* -R Mount the split-off pool under an alternate root.
*
* Splits the named pool and gives it the new pool name. Devices to be split
* off may be listed, provided that no more than one device is specified
* per top-level vdev mirror. The newly split pool is left in an exported
* state unless -R is specified.
*
* Restrictions: the top-level of the pool pool must only be made up of
* mirrors; all devices in the pool must be healthy; no device may be
* undergoing a resilvering operation.
*/
int
zpool_do_split(int argc, char **argv)
{
char *srcpool, *newpool, *propval;
char *mntopts = NULL;
splitflags_t flags;
int c, ret = 0;
zpool_handle_t *zhp;
nvlist_t *config, *props = NULL;
flags.dryrun = B_FALSE;
flags.import = B_FALSE;
/* check options */
while ((c = getopt(argc, argv, ":R:no:")) != -1) {
switch (c) {
case 'R':
flags.import = B_TRUE;
if (add_prop_list(
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg,
&props, B_TRUE) != 0) {
nvlist_free(props);
usage(B_FALSE);
}
break;
case 'n':
flags.dryrun = B_TRUE;
break;
case 'o':
if ((propval = strchr(optarg, '=')) != NULL) {
*propval = '\0';
propval++;
if (add_prop_list(optarg, propval,
&props, B_TRUE) != 0) {
nvlist_free(props);
usage(B_FALSE);
}
} else {
mntopts = optarg;
}
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
break;
}
}
if (!flags.import && mntopts != NULL) {
(void) fprintf(stderr, gettext("setting mntopts is only "
"valid when importing the pool\n"));
usage(B_FALSE);
}
argc -= optind;
argv += optind;
if (argc < 1) {
(void) fprintf(stderr, gettext("Missing pool name\n"));
usage(B_FALSE);
}
if (argc < 2) {
(void) fprintf(stderr, gettext("Missing new pool name\n"));
usage(B_FALSE);
}
srcpool = argv[0];
newpool = argv[1];
argc -= 2;
argv += 2;
if ((zhp = zpool_open(g_zfs, srcpool)) == NULL)
return (1);
config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv);
if (config == NULL) {
ret = 1;
} else {
if (flags.dryrun) {
(void) printf(gettext("would create '%s' with the "
"following layout:\n\n"), newpool);
print_vdev_tree(NULL, newpool, config, 0, B_FALSE);
}
nvlist_free(config);
}
zpool_close(zhp);
if (ret != 0 || flags.dryrun || !flags.import)
return (ret);
/*
* The split was successful. Now we need to open the new
* pool and import it.
*/
if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL)
return (1);
if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
zpool_enable_datasets(zhp, mntopts, 0) != 0) {
ret = 1;
(void) fprintf(stderr, gettext("Split was successful, but "
"the datasets could not all be mounted\n"));
(void) fprintf(stderr, gettext("Try doing '%s' with a "
"different altroot\n"), "zpool import");
}
zpool_close(zhp);
return (ret);
}
/*
* zpool online <pool> <device> ...
*/
int
zpool_do_online(int argc, char **argv)
{
int c, i;
char *poolname;
zpool_handle_t *zhp;
int ret = 0;
vdev_state_t newstate;
int flags = 0;
/* check options */
while ((c = getopt(argc, argv, "et")) != -1) {
switch (c) {
case 'e':
flags |= ZFS_ONLINE_EXPAND;
break;
case 't':
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* get pool name and check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name\n"));
usage(B_FALSE);
}
if (argc < 2) {
(void) fprintf(stderr, gettext("missing device name\n"));
usage(B_FALSE);
}
poolname = argv[0];
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
for (i = 1; i < argc; i++) {
if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) {
if (newstate != VDEV_STATE_HEALTHY) {
(void) printf(gettext("warning: device '%s' "
"onlined, but remains in faulted state\n"),
argv[i]);
if (newstate == VDEV_STATE_FAULTED)
(void) printf(gettext("use 'zpool "
"clear' to restore a faulted "
"device\n"));
else
(void) printf(gettext("use 'zpool "
"replace' to replace devices "
"that are no longer present\n"));
}
} else {
ret = 1;
}
}
zpool_close(zhp);
return (ret);
}
/*
* zpool offline [-ft] <pool> <device> ...
*
* -f Force the device into the offline state, even if doing
* so would appear to compromise pool availability.
* (not supported yet)
*
* -t Only take the device off-line temporarily. The offline
* state will not be persistent across reboots.
*/
/* ARGSUSED */
int
zpool_do_offline(int argc, char **argv)
{
int c, i;
char *poolname;
zpool_handle_t *zhp;
int ret = 0;
boolean_t istmp = B_FALSE;
/* check options */
while ((c = getopt(argc, argv, "ft")) != -1) {
switch (c) {
case 't':
istmp = B_TRUE;
break;
case 'f':
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* get pool name and check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name\n"));
usage(B_FALSE);
}
if (argc < 2) {
(void) fprintf(stderr, gettext("missing device name\n"));
usage(B_FALSE);
}
poolname = argv[0];
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
for (i = 1; i < argc; i++) {
if (zpool_vdev_offline(zhp, argv[i], istmp) != 0)
ret = 1;
}
zpool_close(zhp);
return (ret);
}
/*
* zpool clear <pool> [device]
*
* Clear all errors associated with a pool or a particular device.
*/
int
zpool_do_clear(int argc, char **argv)
{
int c;
int ret = 0;
boolean_t dryrun = B_FALSE;
boolean_t do_rewind = B_FALSE;
boolean_t xtreme_rewind = B_FALSE;
uint32_t rewind_policy = ZPOOL_NO_REWIND;
nvlist_t *policy = NULL;
zpool_handle_t *zhp;
char *pool, *device;
/* check options */
while ((c = getopt(argc, argv, "FnX")) != -1) {
switch (c) {
case 'F':
do_rewind = B_TRUE;
break;
case 'n':
dryrun = B_TRUE;
break;
case 'X':
xtreme_rewind = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name\n"));
usage(B_FALSE);
}
if (argc > 2) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
if ((dryrun || xtreme_rewind) && !do_rewind) {
(void) fprintf(stderr,
gettext("-n or -X only meaningful with -F\n"));
usage(B_FALSE);
}
if (dryrun)
rewind_policy = ZPOOL_TRY_REWIND;
else if (do_rewind)
rewind_policy = ZPOOL_DO_REWIND;
if (xtreme_rewind)
rewind_policy |= ZPOOL_EXTREME_REWIND;
/* In future, further rewind policy choices can be passed along here */
if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
return (1);
pool = argv[0];
device = argc == 2 ? argv[1] : NULL;
if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
nvlist_free(policy);
return (1);
}
if (zpool_clear(zhp, device, policy) != 0)
ret = 1;
zpool_close(zhp);
nvlist_free(policy);
return (ret);
}
/*
* zpool reguid <pool>
*/
int
zpool_do_reguid(int argc, char **argv)
{
int c;
char *poolname;
zpool_handle_t *zhp;
int ret = 0;
/* check options */
while ((c = getopt(argc, argv, "")) != -1) {
switch (c) {
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
/* get pool name and check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
poolname = argv[0];
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
ret = zpool_reguid(zhp);
zpool_close(zhp);
return (ret);
}
/*
* zpool reopen <pool>
*
* Reopen the pool so that the kernel can update the sizes of all vdevs.
*/
int
zpool_do_reopen(int argc, char **argv)
{
int c;
int ret = 0;
zpool_handle_t *zhp;
char *pool;
/* check options */
while ((c = getopt(argc, argv, "")) != -1) {
switch (c) {
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc--;
argv++;
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
pool = argv[0];
if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL)
return (1);
ret = zpool_reopen(zhp);
zpool_close(zhp);
return (ret);
}
typedef struct scrub_cbdata {
int cb_type;
int cb_argc;
char **cb_argv;
pool_scrub_cmd_t cb_scrub_cmd;
} scrub_cbdata_t;
int
scrub_callback(zpool_handle_t *zhp, void *data)
{
scrub_cbdata_t *cb = data;
int err;
/*
* Ignore faulted pools.
*/
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
(void) fprintf(stderr, gettext("cannot scrub '%s': pool is "
"currently unavailable\n"), zpool_get_name(zhp));
return (1);
}
err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
return (err != 0);
}
/*
* zpool scrub [-s | -p] <pool> ...
*
* -s Stop. Stops any in-progress scrub.
* -p Pause. Pause in-progress scrub.
*/
int
zpool_do_scrub(int argc, char **argv)
{
int c;
scrub_cbdata_t cb;
cb.cb_type = POOL_SCAN_SCRUB;
cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
/* check options */
while ((c = getopt(argc, argv, "sp")) != -1) {
switch (c) {
case 's':
cb.cb_type = POOL_SCAN_NONE;
break;
case 'p':
cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
if (cb.cb_type == POOL_SCAN_NONE &&
cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) {
(void) fprintf(stderr, gettext("invalid option combination: "
"-s and -p are mutually exclusive\n"));
usage(B_FALSE);
}
cb.cb_argc = argc;
cb.cb_argv = argv;
argc -= optind;
argv += optind;
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name argument\n"));
usage(B_FALSE);
}
return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
}
typedef struct status_cbdata {
int cb_count;
boolean_t cb_allpools;
boolean_t cb_verbose;
boolean_t cb_explain;
boolean_t cb_first;
boolean_t cb_dedup_stats;
} status_cbdata_t;
/*
* Print out detailed scrub status.
*/
-void
+static void
print_scan_status(pool_scan_stat_t *ps)
{
time_t start, end, pause;
uint64_t elapsed, mins_left, hours_left;
uint64_t pass_exam, examined, total;
uint_t rate;
double fraction_done;
char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
(void) printf(gettext(" scan: "));
/* If there's never been a scan, there's not much to say. */
if (ps == NULL || ps->pss_func == POOL_SCAN_NONE ||
ps->pss_func >= POOL_SCAN_FUNCS) {
(void) printf(gettext("none requested\n"));
return;
}
start = ps->pss_start_time;
end = ps->pss_end_time;
pause = ps->pss_pass_scrub_pause;
zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
assert(ps->pss_func == POOL_SCAN_SCRUB ||
ps->pss_func == POOL_SCAN_RESILVER);
/*
* Scan is finished or canceled.
*/
if (ps->pss_state == DSS_FINISHED) {
uint64_t minutes_taken = (end - start) / 60;
char *fmt = NULL;
if (ps->pss_func == POOL_SCAN_SCRUB) {
fmt = gettext("scrub repaired %s in %lluh%um with "
"%llu errors on %s");
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
fmt = gettext("resilvered %s in %lluh%um with "
"%llu errors on %s");
}
/* LINTED */
(void) printf(fmt, processed_buf,
(u_longlong_t)(minutes_taken / 60),
(uint_t)(minutes_taken % 60),
(u_longlong_t)ps->pss_errors,
ctime((time_t *)&end));
return;
} else if (ps->pss_state == DSS_CANCELED) {
if (ps->pss_func == POOL_SCAN_SCRUB) {
(void) printf(gettext("scrub canceled on %s"),
ctime(&end));
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
(void) printf(gettext("resilver canceled on %s"),
ctime(&end));
}
return;
}
assert(ps->pss_state == DSS_SCANNING);
/*
* Scan is in progress.
*/
if (ps->pss_func == POOL_SCAN_SCRUB) {
if (pause == 0) {
(void) printf(gettext("scrub in progress since %s"),
ctime(&start));
} else {
char buf[32];
struct tm *p = localtime(&pause);
(void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p);
(void) printf(gettext("scrub paused since %s\n"), buf);
(void) printf(gettext("\tscrub started on %s"),
ctime(&start));
}
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
(void) printf(gettext("resilver in progress since %s"),
ctime(&start));
}
examined = ps->pss_examined ? ps->pss_examined : 1;
total = ps->pss_to_examine;
fraction_done = (double)examined / total;
/* elapsed time for this pass */
elapsed = time(NULL) - ps->pss_pass_start;
elapsed -= ps->pss_pass_scrub_spent_paused;
elapsed = elapsed ? elapsed : 1;
pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
rate = pass_exam / elapsed;
rate = rate ? rate : 1;
mins_left = ((total - examined) / rate) / 60;
hours_left = mins_left / 60;
zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
zfs_nicenum(total, total_buf, sizeof (total_buf));
/*
* do not print estimated time if hours_left is more than 30 days
* or we have a paused scrub
*/
if (pause == 0) {
zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
(void) printf(gettext("\t%s scanned out of %s at %s/s"),
examined_buf, total_buf, rate_buf);
if (hours_left < (30 * 24)) {
(void) printf(gettext(", %lluh%um to go\n"),
(u_longlong_t)hours_left, (uint_t)(mins_left % 60));
} else {
(void) printf(gettext(
", (scan is slow, no estimated time)\n"));
}
} else {
(void) printf(gettext("\t%s scanned out of %s\n"),
examined_buf, total_buf);
}
if (ps->pss_func == POOL_SCAN_RESILVER) {
(void) printf(gettext(" %s resilvered, %.2f%% done\n"),
processed_buf, 100 * fraction_done);
} else if (ps->pss_func == POOL_SCAN_SCRUB) {
(void) printf(gettext(" %s repaired, %.2f%% done\n"),
processed_buf, 100 * fraction_done);
}
}
+/*
+ * Print out detailed removal status.
+ */
static void
+print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
+{
+ char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
+ time_t start, end;
+ nvlist_t *config, *nvroot;
+ nvlist_t **child;
+ uint_t children;
+ char *vdev_name;
+
+ if (prs == NULL || prs->prs_state == DSS_NONE)
+ return;
+
+ /*
+ * Determine name of vdev.
+ */
+ config = zpool_get_config(zhp, NULL);
+ nvroot = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE);
+ verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0);
+ assert(prs->prs_removing_vdev < children);
+ vdev_name = zpool_vdev_name(g_zfs, zhp,
+ child[prs->prs_removing_vdev], B_TRUE);
+
+ (void) printf(gettext("remove: "));
+
+ start = prs->prs_start_time;
+ end = prs->prs_end_time;
+ zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf));
+
+ /*
+ * Removal is finished or canceled.
+ */
+ if (prs->prs_state == DSS_FINISHED) {
+ uint64_t minutes_taken = (end - start) / 60;
+
+ (void) printf(gettext("Removal of vdev %llu copied %s "
+ "in %lluh%um, completed on %s"),
+ (longlong_t)prs->prs_removing_vdev,
+ copied_buf,
+ (u_longlong_t)(minutes_taken / 60),
+ (uint_t)(minutes_taken % 60),
+ ctime((time_t *)&end));
+ } else if (prs->prs_state == DSS_CANCELED) {
+ (void) printf(gettext("Removal of %s canceled on %s"),
+ vdev_name, ctime(&end));
+ } else {
+ uint64_t copied, total, elapsed, mins_left, hours_left;
+ double fraction_done;
+ uint_t rate;
+
+ assert(prs->prs_state == DSS_SCANNING);
+
+ /*
+ * Removal is in progress.
+ */
+ (void) printf(gettext(
+ "Evacuation of %s in progress since %s"),
+ vdev_name, ctime(&start));
+
+ copied = prs->prs_copied > 0 ? prs->prs_copied : 1;
+ total = prs->prs_to_copy;
+ fraction_done = (double)copied / total;
+
+ /* elapsed time for this pass */
+ elapsed = time(NULL) - prs->prs_start_time;
+ elapsed = elapsed > 0 ? elapsed : 1;
+ rate = copied / elapsed;
+ rate = rate > 0 ? rate : 1;
+ mins_left = ((total - copied) / rate) / 60;
+ hours_left = mins_left / 60;
+
+ zfs_nicenum(copied, examined_buf, sizeof (examined_buf));
+ zfs_nicenum(total, total_buf, sizeof (total_buf));
+ zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
+
+ /*
+ * do not print estimated time if hours_left is more than
+ * 30 days
+ */
+ (void) printf(gettext(" %s copied out of %s at %s/s, "
+ "%.2f%% done"),
+ examined_buf, total_buf, rate_buf, 100 * fraction_done);
+ if (hours_left < (30 * 24)) {
+ (void) printf(gettext(", %lluh%um to go\n"),
+ (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
+ } else {
+ (void) printf(gettext(
+ ", (copy is slow, no estimated time)\n"));
+ }
+ }
+
+ if (prs->prs_mapping_memory > 0) {
+ char mem_buf[7];
+ zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf));
+ (void) printf(gettext(" %s memory used for "
+ "removed device mappings\n"),
+ mem_buf);
+ }
+}
+
+static void
print_error_log(zpool_handle_t *zhp)
{
nvlist_t *nverrlist = NULL;
nvpair_t *elem;
char *pathname;
size_t len = MAXPATHLEN * 2;
if (zpool_get_errlog(zhp, &nverrlist) != 0) {
(void) printf("errors: List of errors unavailable "
"(insufficient privileges)\n");
return;
}
(void) printf("errors: Permanent errors have been "
"detected in the following files:\n\n");
pathname = safe_malloc(len);
elem = NULL;
while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) {
nvlist_t *nv;
uint64_t dsobj, obj;
verify(nvpair_value_nvlist(elem, &nv) == 0);
verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET,
&dsobj) == 0);
verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT,
&obj) == 0);
zpool_obj_to_path(zhp, dsobj, obj, pathname, len);
(void) printf("%7s %s\n", "", pathname);
}
free(pathname);
nvlist_free(nverrlist);
}
static void
print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares,
int namewidth)
{
uint_t i;
char *name;
if (nspares == 0)
return;
(void) printf(gettext("\tspares\n"));
for (i = 0; i < nspares; i++) {
name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE);
print_status_config(zhp, name, spares[i],
namewidth, 2, B_TRUE);
free(name);
}
}
static void
print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache,
int namewidth)
{
uint_t i;
char *name;
if (nl2cache == 0)
return;
(void) printf(gettext("\tcache\n"));
for (i = 0; i < nl2cache; i++) {
name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE);
print_status_config(zhp, name, l2cache[i],
namewidth, 2, B_FALSE);
free(name);
}
}
static void
print_dedup_stats(nvlist_t *config)
{
ddt_histogram_t *ddh;
ddt_stat_t *dds;
ddt_object_t *ddo;
uint_t c;
/*
* If the pool was faulted then we may not have been able to
* obtain the config. Otherwise, if we have anything in the dedup
* table continue processing the stats.
*/
if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS,
(uint64_t **)&ddo, &c) != 0)
return;
(void) printf("\n");
(void) printf(gettext(" dedup: "));
if (ddo->ddo_count == 0) {
(void) printf(gettext("no DDT entries\n"));
return;
}
(void) printf("DDT entries %llu, size %llu on disk, %llu in core\n",
(u_longlong_t)ddo->ddo_count,
(u_longlong_t)ddo->ddo_dspace,
(u_longlong_t)ddo->ddo_mspace);
verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS,
(uint64_t **)&dds, &c) == 0);
verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM,
(uint64_t **)&ddh, &c) == 0);
zpool_dump_ddt(dds, ddh);
}
/*
* Display a summary of pool status. Displays a summary such as:
*
* pool: tank
* status: DEGRADED
* reason: One or more devices ...
* see: http://illumos.org/msg/ZFS-xxxx-01
* config:
* mirror DEGRADED
* c1t0d0 OK
* c2t0d0 UNAVAIL
*
* When given the '-v' option, we print out the complete config. If the '-e'
* option is specified, then we print out error rate information as well.
*/
int
status_callback(zpool_handle_t *zhp, void *data)
{
status_cbdata_t *cbp = data;
nvlist_t *config, *nvroot;
char *msgid;
int reason;
const char *health;
uint_t c;
vdev_stat_t *vs;
config = zpool_get_config(zhp, NULL);
reason = zpool_get_status(zhp, &msgid);
cbp->cb_count++;
/*
* If we were given 'zpool status -x', only report those pools with
* problems.
*/
if (cbp->cb_explain &&
(reason == ZPOOL_STATUS_OK ||
reason == ZPOOL_STATUS_VERSION_OLDER ||
reason == ZPOOL_STATUS_NON_NATIVE_ASHIFT ||
reason == ZPOOL_STATUS_FEAT_DISABLED)) {
if (!cbp->cb_allpools) {
(void) printf(gettext("pool '%s' is healthy\n"),
zpool_get_name(zhp));
if (cbp->cb_first)
cbp->cb_first = B_FALSE;
}
return (0);
}
if (cbp->cb_first)
cbp->cb_first = B_FALSE;
else
(void) printf("\n");
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
+ nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
(void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp));
(void) printf(gettext(" state: %s\n"), health);
switch (reason) {
case ZPOOL_STATUS_MISSING_DEV_R:
(void) printf(gettext("status: One or more devices could not "
"be opened. Sufficient replicas exist for\n\tthe pool to "
"continue functioning in a degraded state.\n"));
(void) printf(gettext("action: Attach the missing device and "
"online it using 'zpool online'.\n"));
break;
case ZPOOL_STATUS_MISSING_DEV_NR:
(void) printf(gettext("status: One or more devices could not "
"be opened. There are insufficient\n\treplicas for the "
"pool to continue functioning.\n"));
(void) printf(gettext("action: Attach the missing device and "
"online it using 'zpool online'.\n"));
break;
case ZPOOL_STATUS_CORRUPT_LABEL_R:
(void) printf(gettext("status: One or more devices could not "
"be used because the label is missing or\n\tinvalid. "
"Sufficient replicas exist for the pool to continue\n\t"
"functioning in a degraded state.\n"));
(void) printf(gettext("action: Replace the device using "
"'zpool replace'.\n"));
break;
case ZPOOL_STATUS_CORRUPT_LABEL_NR:
(void) printf(gettext("status: One or more devices could not "
"be used because the label is missing \n\tor invalid. "
"There are insufficient replicas for the pool to "
"continue\n\tfunctioning.\n"));
zpool_explain_recover(zpool_get_handle(zhp),
zpool_get_name(zhp), reason, config);
break;
case ZPOOL_STATUS_FAILING_DEV:
(void) printf(gettext("status: One or more devices has "
"experienced an unrecoverable error. An\n\tattempt was "
"made to correct the error. Applications are "
"unaffected.\n"));
(void) printf(gettext("action: Determine if the device needs "
"to be replaced, and clear the errors\n\tusing "
"'zpool clear' or replace the device with 'zpool "
"replace'.\n"));
break;
case ZPOOL_STATUS_OFFLINE_DEV:
(void) printf(gettext("status: One or more devices has "
"been taken offline by the administrator.\n\tSufficient "
"replicas exist for the pool to continue functioning in "
"a\n\tdegraded state.\n"));
(void) printf(gettext("action: Online the device using "
"'zpool online' or replace the device with\n\t'zpool "
"replace'.\n"));
break;
case ZPOOL_STATUS_REMOVED_DEV:
(void) printf(gettext("status: One or more devices has "
"been removed by the administrator.\n\tSufficient "
"replicas exist for the pool to continue functioning in "
"a\n\tdegraded state.\n"));
(void) printf(gettext("action: Online the device using "
"'zpool online' or replace the device with\n\t'zpool "
"replace'.\n"));
break;
case ZPOOL_STATUS_RESILVERING:
(void) printf(gettext("status: One or more devices is "
"currently being resilvered. The pool will\n\tcontinue "
"to function, possibly in a degraded state.\n"));
(void) printf(gettext("action: Wait for the resilver to "
"complete.\n"));
break;
case ZPOOL_STATUS_CORRUPT_DATA:
(void) printf(gettext("status: One or more devices has "
"experienced an error resulting in data\n\tcorruption. "
"Applications may be affected.\n"));
(void) printf(gettext("action: Restore the file in question "
"if possible. Otherwise restore the\n\tentire pool from "
"backup.\n"));
break;
case ZPOOL_STATUS_CORRUPT_POOL:
(void) printf(gettext("status: The pool metadata is corrupted "
"and the pool cannot be opened.\n"));
zpool_explain_recover(zpool_get_handle(zhp),
zpool_get_name(zhp), reason, config);
break;
case ZPOOL_STATUS_VERSION_OLDER:
(void) printf(gettext("status: The pool is formatted using a "
"legacy on-disk format. The pool can\n\tstill be used, "
"but some features are unavailable.\n"));
(void) printf(gettext("action: Upgrade the pool using 'zpool "
"upgrade'. Once this is done, the\n\tpool will no longer "
"be accessible on software that does not support feature\n"
"\tflags.\n"));
break;
case ZPOOL_STATUS_VERSION_NEWER:
(void) printf(gettext("status: The pool has been upgraded to a "
"newer, incompatible on-disk version.\n\tThe pool cannot "
"be accessed on this system.\n"));
(void) printf(gettext("action: Access the pool from a system "
"running more recent software, or\n\trestore the pool from "
"backup.\n"));
break;
case ZPOOL_STATUS_FEAT_DISABLED:
(void) printf(gettext("status: Some supported features are not "
"enabled on the pool. The pool can\n\tstill be used, but "
"some features are unavailable.\n"));
(void) printf(gettext("action: Enable all features using "
"'zpool upgrade'. Once this is done,\n\tthe pool may no "
"longer be accessible by software that does not support\n\t"
"the features. See zpool-features(7) for details.\n"));
break;
case ZPOOL_STATUS_UNSUP_FEAT_READ:
(void) printf(gettext("status: The pool cannot be accessed on "
"this system because it uses the\n\tfollowing feature(s) "
"not supported on this system:\n"));
zpool_print_unsup_feat(config);
(void) printf("\n");
(void) printf(gettext("action: Access the pool from a system "
"that supports the required feature(s),\n\tor restore the "
"pool from backup.\n"));
break;
case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
(void) printf(gettext("status: The pool can only be accessed "
"in read-only mode on this system. It\n\tcannot be "
"accessed in read-write mode because it uses the "
"following\n\tfeature(s) not supported on this system:\n"));
zpool_print_unsup_feat(config);
(void) printf("\n");
(void) printf(gettext("action: The pool cannot be accessed in "
"read-write mode. Import the pool with\n"
"\t\"-o readonly=on\", access the pool from a system that "
"supports the\n\trequired feature(s), or restore the "
"pool from backup.\n"));
break;
case ZPOOL_STATUS_FAULTED_DEV_R:
(void) printf(gettext("status: One or more devices are "
"faulted in response to persistent errors.\n\tSufficient "
"replicas exist for the pool to continue functioning "
"in a\n\tdegraded state.\n"));
(void) printf(gettext("action: Replace the faulted device, "
"or use 'zpool clear' to mark the device\n\trepaired.\n"));
break;
case ZPOOL_STATUS_FAULTED_DEV_NR:
(void) printf(gettext("status: One or more devices are "
"faulted in response to persistent errors. There are "
"insufficient replicas for the pool to\n\tcontinue "
"functioning.\n"));
(void) printf(gettext("action: Destroy and re-create the pool "
"from a backup source. Manually marking the device\n"
"\trepaired using 'zpool clear' may allow some data "
"to be recovered.\n"));
break;
case ZPOOL_STATUS_IO_FAILURE_WAIT:
case ZPOOL_STATUS_IO_FAILURE_CONTINUE:
(void) printf(gettext("status: One or more devices are "
"faulted in response to IO failures.\n"));
(void) printf(gettext("action: Make sure the affected devices "
"are connected, then run 'zpool clear'.\n"));
break;
case ZPOOL_STATUS_BAD_LOG:
(void) printf(gettext("status: An intent log record "
"could not be read.\n"
"\tWaiting for adminstrator intervention to fix the "
"faulted pool.\n"));
(void) printf(gettext("action: Either restore the affected "
"device(s) and run 'zpool online',\n"
"\tor ignore the intent log records by running "
"'zpool clear'.\n"));
break;
case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
(void) printf(gettext("status: One or more devices are "
"configured to use a non-native block size.\n"
"\tExpect reduced performance.\n"));
(void) printf(gettext("action: Replace affected devices with "
"devices that support the\n\tconfigured block size, or "
"migrate data to a properly configured\n\tpool.\n"));
break;
default:
/*
* The remaining errors can't actually be generated, yet.
*/
assert(reason == ZPOOL_STATUS_OK);
}
if (msgid != NULL)
(void) printf(gettext(" see: http://illumos.org/msg/%s\n"),
msgid);
if (config != NULL) {
int namewidth;
uint64_t nerr;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
pool_scan_stat_t *ps = NULL;
+ pool_removal_stat_t *prs = NULL;
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
print_scan_status(ps);
+
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
+ print_removal_status(zhp, prs);
namewidth = max_width(zhp, nvroot, 0, 0);
if (namewidth < 10)
namewidth = 10;
(void) printf(gettext("config:\n\n"));
(void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth,
"NAME", "STATE", "READ", "WRITE", "CKSUM");
print_status_config(zhp, zpool_get_name(zhp), nvroot,
namewidth, 0, B_FALSE);
if (num_logs(nvroot) > 0)
print_logs(zhp, nvroot, namewidth, B_TRUE);
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache) == 0)
print_l2cache(zhp, l2cache, nl2cache, namewidth);
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) == 0)
print_spares(zhp, spares, nspares, namewidth);
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
&nerr) == 0) {
nvlist_t *nverrlist = NULL;
/*
* If the approximate error count is small, get a
* precise count by fetching the entire log and
* uniquifying the results.
*/
if (nerr > 0 && nerr < 100 && !cbp->cb_verbose &&
zpool_get_errlog(zhp, &nverrlist) == 0) {
nvpair_t *elem;
elem = NULL;
nerr = 0;
while ((elem = nvlist_next_nvpair(nverrlist,
elem)) != NULL) {
nerr++;
}
}
nvlist_free(nverrlist);
(void) printf("\n");
if (nerr == 0)
(void) printf(gettext("errors: No known data "
"errors\n"));
else if (!cbp->cb_verbose)
(void) printf(gettext("errors: %llu data "
"errors, use '-v' for a list\n"),
(u_longlong_t)nerr);
else
print_error_log(zhp);
}
if (cbp->cb_dedup_stats)
print_dedup_stats(config);
} else {
(void) printf(gettext("config: The configuration cannot be "
"determined.\n"));
}
return (0);
}
/*
* zpool status [-vx] [-T d|u] [pool] ... [interval [count]]
*
* -v Display complete error logs
* -x Display only pools with potential problems
* -D Display dedup status (undocumented)
* -T Display a timestamp in date(1) or Unix format
*
* Describes the health status of all pools or some subset.
*/
int
zpool_do_status(int argc, char **argv)
{
int c;
int ret;
unsigned long interval = 0, count = 0;
status_cbdata_t cb = { 0 };
/* check options */
while ((c = getopt(argc, argv, "vxDT:")) != -1) {
switch (c) {
case 'v':
cb.cb_verbose = B_TRUE;
break;
case 'x':
cb.cb_explain = B_TRUE;
break;
case 'D':
cb.cb_dedup_stats = B_TRUE;
break;
case 'T':
get_timestamp_arg(*optarg);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
get_interval_count(&argc, argv, &interval, &count);
if (argc == 0)
cb.cb_allpools = B_TRUE;
cb.cb_first = B_TRUE;
for (;;) {
if (timestamp_fmt != NODATE)
print_timestamp(timestamp_fmt);
ret = for_each_pool(argc, argv, B_TRUE, NULL,
status_callback, &cb);
if (argc == 0 && cb.cb_count == 0)
(void) printf(gettext("no pools available\n"));
else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
(void) printf(gettext("all pools are healthy\n"));
if (ret != 0)
return (ret);
if (interval == 0)
break;
if (count != 0 && --count == 0)
break;
(void) sleep(interval);
}
return (0);
}
typedef struct upgrade_cbdata {
boolean_t cb_first;
boolean_t cb_unavail;
char cb_poolname[ZFS_MAX_DATASET_NAME_LEN];
int cb_argc;
uint64_t cb_version;
char **cb_argv;
} upgrade_cbdata_t;
#ifdef __FreeBSD__
static int
is_root_pool(zpool_handle_t *zhp)
{
static struct statfs sfs;
static char *poolname = NULL;
static boolean_t stated = B_FALSE;
char *slash;
if (!stated) {
stated = B_TRUE;
if (statfs("/", &sfs) == -1) {
(void) fprintf(stderr,
"Unable to stat root file system: %s.\n",
strerror(errno));
return (0);
}
if (strcmp(sfs.f_fstypename, "zfs") != 0)
return (0);
poolname = sfs.f_mntfromname;
if ((slash = strchr(poolname, '/')) != NULL)
*slash = '\0';
}
return (poolname != NULL && strcmp(poolname, zpool_get_name(zhp)) == 0);
}
static void
root_pool_upgrade_check(zpool_handle_t *zhp, char *poolname, int size)
{
if (poolname[0] == '\0' && is_root_pool(zhp))
(void) strlcpy(poolname, zpool_get_name(zhp), size);
}
#endif /* FreeBSD */
static int
upgrade_version(zpool_handle_t *zhp, uint64_t version)
{
int ret;
nvlist_t *config;
uint64_t oldversion;
config = zpool_get_config(zhp, NULL);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&oldversion) == 0);
assert(SPA_VERSION_IS_SUPPORTED(oldversion));
assert(oldversion < version);
ret = zpool_upgrade(zhp, version);
if (ret != 0)
return (ret);
if (version >= SPA_VERSION_FEATURES) {
(void) printf(gettext("Successfully upgraded "
"'%s' from version %llu to feature flags.\n"),
zpool_get_name(zhp), oldversion);
} else {
(void) printf(gettext("Successfully upgraded "
"'%s' from version %llu to version %llu.\n"),
zpool_get_name(zhp), oldversion, version);
}
return (0);
}
static int
upgrade_enable_all(zpool_handle_t *zhp, int *countp)
{
int i, ret, count;
boolean_t firstff = B_TRUE;
nvlist_t *enabled = zpool_get_features(zhp);
count = 0;
for (i = 0; i < SPA_FEATURES; i++) {
const char *fname = spa_feature_table[i].fi_uname;
const char *fguid = spa_feature_table[i].fi_guid;
if (!nvlist_exists(enabled, fguid)) {
char *propname;
verify(-1 != asprintf(&propname, "feature@%s", fname));
ret = zpool_set_prop(zhp, propname,
ZFS_FEATURE_ENABLED);
if (ret != 0) {
free(propname);
return (ret);
}
count++;
if (firstff) {
(void) printf(gettext("Enabled the "
"following features on '%s':\n"),
zpool_get_name(zhp));
firstff = B_FALSE;
}
(void) printf(gettext(" %s\n"), fname);
free(propname);
}
}
if (countp != NULL)
*countp = count;
return (0);
}
static int
upgrade_cb(zpool_handle_t *zhp, void *arg)
{
upgrade_cbdata_t *cbp = arg;
nvlist_t *config;
uint64_t version;
boolean_t printnl = B_FALSE;
int ret;
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
(void) fprintf(stderr, gettext("cannot upgrade '%s': pool is "
"currently unavailable.\n\n"), zpool_get_name(zhp));
cbp->cb_unavail = B_TRUE;
/* Allow iteration to continue. */
return (0);
}
config = zpool_get_config(zhp, NULL);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&version) == 0);
assert(SPA_VERSION_IS_SUPPORTED(version));
if (version < cbp->cb_version) {
cbp->cb_first = B_FALSE;
ret = upgrade_version(zhp, cbp->cb_version);
if (ret != 0)
return (ret);
#ifdef __FreeBSD__
root_pool_upgrade_check(zhp, cbp->cb_poolname,
sizeof(cbp->cb_poolname));
#endif /* __FreeBSD__ */
printnl = B_TRUE;
#ifdef illumos
/*
* If they did "zpool upgrade -a", then we could
* be doing ioctls to different pools. We need
* to log this history once to each pool, and bypass
* the normal history logging that happens in main().
*/
(void) zpool_log_history(g_zfs, history_str);
log_history = B_FALSE;
#endif
}
if (cbp->cb_version >= SPA_VERSION_FEATURES) {
int count;
ret = upgrade_enable_all(zhp, &count);
if (ret != 0)
return (ret);
if (count > 0) {
cbp->cb_first = B_FALSE;
printnl = B_TRUE;
#ifdef __FreeBSD__
root_pool_upgrade_check(zhp, cbp->cb_poolname,
sizeof(cbp->cb_poolname));
#endif /* __FreeBSD__ */
/*
* If they did "zpool upgrade -a", then we could
* be doing ioctls to different pools. We need
* to log this history once to each pool, and bypass
* the normal history logging that happens in main().
*/
(void) zpool_log_history(g_zfs, history_str);
log_history = B_FALSE;
}
}
if (printnl) {
(void) printf(gettext("\n"));
}
return (0);
}
static int
upgrade_list_unavail(zpool_handle_t *zhp, void *arg)
{
upgrade_cbdata_t *cbp = arg;
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
if (cbp->cb_first) {
(void) fprintf(stderr, gettext("The following pools "
"are unavailable and cannot be upgraded as this "
"time.\n\n"));
(void) fprintf(stderr, gettext("POOL\n"));
(void) fprintf(stderr, gettext("------------\n"));
cbp->cb_first = B_FALSE;
}
(void) printf(gettext("%s\n"), zpool_get_name(zhp));
cbp->cb_unavail = B_TRUE;
}
return (0);
}
static int
upgrade_list_older_cb(zpool_handle_t *zhp, void *arg)
{
upgrade_cbdata_t *cbp = arg;
nvlist_t *config;
uint64_t version;
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
/*
* This will have been reported by upgrade_list_unavail so
* just allow iteration to continue.
*/
cbp->cb_unavail = B_TRUE;
return (0);
}
config = zpool_get_config(zhp, NULL);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&version) == 0);
assert(SPA_VERSION_IS_SUPPORTED(version));
if (version < SPA_VERSION_FEATURES) {
if (cbp->cb_first) {
(void) printf(gettext("The following pools are "
"formatted with legacy version numbers and can\n"
"be upgraded to use feature flags. After "
"being upgraded, these pools\nwill no "
"longer be accessible by software that does not "
"support feature\nflags.\n\n"));
(void) printf(gettext("VER POOL\n"));
(void) printf(gettext("--- ------------\n"));
cbp->cb_first = B_FALSE;
}
(void) printf("%2llu %s\n", (u_longlong_t)version,
zpool_get_name(zhp));
}
return (0);
}
static int
upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
{
upgrade_cbdata_t *cbp = arg;
nvlist_t *config;
uint64_t version;
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
/*
* This will have been reported by upgrade_list_unavail so
* just allow iteration to continue.
*/
cbp->cb_unavail = B_TRUE;
return (0);
}
config = zpool_get_config(zhp, NULL);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&version) == 0);
if (version >= SPA_VERSION_FEATURES) {
int i;
boolean_t poolfirst = B_TRUE;
nvlist_t *enabled = zpool_get_features(zhp);
for (i = 0; i < SPA_FEATURES; i++) {
const char *fguid = spa_feature_table[i].fi_guid;
const char *fname = spa_feature_table[i].fi_uname;
if (!nvlist_exists(enabled, fguid)) {
if (cbp->cb_first) {
(void) printf(gettext("\nSome "
"supported features are not "
"enabled on the following pools. "
"Once a\nfeature is enabled the "
"pool may become incompatible with "
"software\nthat does not support "
"the feature. See "
"zpool-features(7) for "
"details.\n\n"));
(void) printf(gettext("POOL "
"FEATURE\n"));
(void) printf(gettext("------"
"---------\n"));
cbp->cb_first = B_FALSE;
}
if (poolfirst) {
(void) printf(gettext("%s\n"),
zpool_get_name(zhp));
poolfirst = B_FALSE;
}
(void) printf(gettext(" %s\n"), fname);
}
}
}
return (0);
}
/* ARGSUSED */
static int
upgrade_one(zpool_handle_t *zhp, void *data)
{
boolean_t printnl = B_FALSE;
upgrade_cbdata_t *cbp = data;
uint64_t cur_version;
int ret;
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
(void) fprintf(stderr, gettext("cannot upgrade '%s': pool is "
"is currently unavailable.\n\n"), zpool_get_name(zhp));
cbp->cb_unavail = B_TRUE;
return (1);
}
if (strcmp("log", zpool_get_name(zhp)) == 0) {
(void) printf(gettext("'log' is now a reserved word\n"
"Pool 'log' must be renamed using export and import"
" to upgrade.\n\n"));
return (1);
}
cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
if (cur_version > cbp->cb_version) {
(void) printf(gettext("Pool '%s' is already formatted "
"using more current version '%llu'.\n\n"),
zpool_get_name(zhp), cur_version);
return (0);
}
if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) {
(void) printf(gettext("Pool '%s' is already formatted "
"using version %llu.\n\n"), zpool_get_name(zhp),
cbp->cb_version);
return (0);
}
if (cur_version != cbp->cb_version) {
printnl = B_TRUE;
ret = upgrade_version(zhp, cbp->cb_version);
if (ret != 0)
return (ret);
#ifdef __FreeBSD__
root_pool_upgrade_check(zhp, cbp->cb_poolname,
sizeof(cbp->cb_poolname));
#endif /* __FreeBSD__ */
}
if (cbp->cb_version >= SPA_VERSION_FEATURES) {
int count = 0;
ret = upgrade_enable_all(zhp, &count);
if (ret != 0)
return (ret);
if (count != 0) {
printnl = B_TRUE;
#ifdef __FreeBSD__
root_pool_upgrade_check(zhp, cbp->cb_poolname,
sizeof(cbp->cb_poolname));
#endif /* __FreeBSD __*/
} else if (cur_version == SPA_VERSION) {
(void) printf(gettext("Pool '%s' already has all "
"supported features enabled.\n\n"),
zpool_get_name(zhp));
}
}
if (printnl) {
(void) printf(gettext("\n"));
}
return (0);
}
/*
* zpool upgrade
* zpool upgrade -v
* zpool upgrade [-V version] <-a | pool ...>
*
* With no arguments, display downrev'd ZFS pool available for upgrade.
* Individual pools can be upgraded by specifying the pool, and '-a' will
* upgrade all pools.
*/
int
zpool_do_upgrade(int argc, char **argv)
{
int c;
upgrade_cbdata_t cb = { 0 };
int ret = 0;
boolean_t showversions = B_FALSE;
boolean_t upgradeall = B_FALSE;
char *end;
/* check options */
while ((c = getopt(argc, argv, ":avV:")) != -1) {
switch (c) {
case 'a':
upgradeall = B_TRUE;
break;
case 'v':
showversions = B_TRUE;
break;
case 'V':
cb.cb_version = strtoll(optarg, &end, 10);
if (*end != '\0' ||
!SPA_VERSION_IS_SUPPORTED(cb.cb_version)) {
(void) fprintf(stderr,
gettext("invalid version '%s'\n"), optarg);
usage(B_FALSE);
}
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
cb.cb_argc = argc;
cb.cb_argv = argv;
argc -= optind;
argv += optind;
if (cb.cb_version == 0) {
cb.cb_version = SPA_VERSION;
} else if (!upgradeall && argc == 0) {
(void) fprintf(stderr, gettext("-V option is "
"incompatible with other arguments\n"));
usage(B_FALSE);
}
if (showversions) {
if (upgradeall || argc != 0) {
(void) fprintf(stderr, gettext("-v option is "
"incompatible with other arguments\n"));
usage(B_FALSE);
}
} else if (upgradeall) {
if (argc != 0) {
(void) fprintf(stderr, gettext("-a option should not "
"be used along with a pool name\n"));
usage(B_FALSE);
}
}
(void) printf(gettext("This system supports ZFS pool feature "
"flags.\n\n"));
if (showversions) {
int i;
(void) printf(gettext("The following features are "
"supported:\n\n"));
(void) printf(gettext("FEAT DESCRIPTION\n"));
(void) printf("----------------------------------------------"
"---------------\n");
for (i = 0; i < SPA_FEATURES; i++) {
zfeature_info_t *fi = &spa_feature_table[i];
const char *ro =
(fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
" (read-only compatible)" : "";
(void) printf("%-37s%s\n", fi->fi_uname, ro);
(void) printf(" %s\n", fi->fi_desc);
}
(void) printf("\n");
(void) printf(gettext("The following legacy versions are also "
"supported:\n\n"));
(void) printf(gettext("VER DESCRIPTION\n"));
(void) printf("--- -----------------------------------------"
"---------------\n");
(void) printf(gettext(" 1 Initial ZFS version\n"));
(void) printf(gettext(" 2 Ditto blocks "
"(replicated metadata)\n"));
(void) printf(gettext(" 3 Hot spares and double parity "
"RAID-Z\n"));
(void) printf(gettext(" 4 zpool history\n"));
(void) printf(gettext(" 5 Compression using the gzip "
"algorithm\n"));
(void) printf(gettext(" 6 bootfs pool property\n"));
(void) printf(gettext(" 7 Separate intent log devices\n"));
(void) printf(gettext(" 8 Delegated administration\n"));
(void) printf(gettext(" 9 refquota and refreservation "
"properties\n"));
(void) printf(gettext(" 10 Cache devices\n"));
(void) printf(gettext(" 11 Improved scrub performance\n"));
(void) printf(gettext(" 12 Snapshot properties\n"));
(void) printf(gettext(" 13 snapused property\n"));
(void) printf(gettext(" 14 passthrough-x aclinherit\n"));
(void) printf(gettext(" 15 user/group space accounting\n"));
(void) printf(gettext(" 16 stmf property support\n"));
(void) printf(gettext(" 17 Triple-parity RAID-Z\n"));
(void) printf(gettext(" 18 Snapshot user holds\n"));
(void) printf(gettext(" 19 Log device removal\n"));
(void) printf(gettext(" 20 Compression using zle "
"(zero-length encoding)\n"));
(void) printf(gettext(" 21 Deduplication\n"));
(void) printf(gettext(" 22 Received properties\n"));
(void) printf(gettext(" 23 Slim ZIL\n"));
(void) printf(gettext(" 24 System attributes\n"));
(void) printf(gettext(" 25 Improved scrub stats\n"));
(void) printf(gettext(" 26 Improved snapshot deletion "
"performance\n"));
(void) printf(gettext(" 27 Improved snapshot creation "
"performance\n"));
(void) printf(gettext(" 28 Multiple vdev replacements\n"));
(void) printf(gettext("\nFor more information on a particular "
"version, including supported releases,\n"));
(void) printf(gettext("see the ZFS Administration Guide.\n\n"));
} else if (argc == 0 && upgradeall) {
cb.cb_first = B_TRUE;
ret = zpool_iter(g_zfs, upgrade_cb, &cb);
if (ret == 0 && cb.cb_first) {
if (cb.cb_version == SPA_VERSION) {
(void) printf(gettext("All %spools are already "
"formatted using feature flags.\n\n"),
cb.cb_unavail ? gettext("available ") : "");
(void) printf(gettext("Every %sfeature flags "
"pool already has all supported features "
"enabled.\n"),
cb.cb_unavail ? gettext("available ") : "");
} else {
(void) printf(gettext("All pools are already "
"formatted with version %llu or higher.\n"),
cb.cb_version);
}
}
} else if (argc == 0) {
cb.cb_first = B_TRUE;
ret = zpool_iter(g_zfs, upgrade_list_unavail, &cb);
assert(ret == 0);
if (!cb.cb_first) {
(void) fprintf(stderr, "\n");
}
cb.cb_first = B_TRUE;
ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb);
assert(ret == 0);
if (cb.cb_first) {
(void) printf(gettext("All %spools are formatted using "
"feature flags.\n\n"), cb.cb_unavail ?
gettext("available ") : "");
} else {
(void) printf(gettext("\nUse 'zpool upgrade -v' "
"for a list of available legacy versions.\n"));
}
cb.cb_first = B_TRUE;
ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb);
assert(ret == 0);
if (cb.cb_first) {
(void) printf(gettext("Every %sfeature flags pool has "
"all supported features enabled.\n"),
cb.cb_unavail ? gettext("available ") : "");
} else {
(void) printf(gettext("\n"));
}
} else {
ret = for_each_pool(argc, argv, B_TRUE, NULL,
upgrade_one, &cb);
}
if (cb.cb_poolname[0] != '\0') {
(void) printf(
"If you boot from pool '%s', don't forget to update boot code.\n"
"Assuming you use GPT partitioning and da0 is your boot disk\n"
"the following command will do it:\n"
"\n"
"\tgpart bootcode -b /boot/pmbr -p /boot/gptzfsboot -i 1 da0\n\n",
cb.cb_poolname);
}
return (ret);
}
typedef struct hist_cbdata {
boolean_t first;
boolean_t longfmt;
boolean_t internal;
} hist_cbdata_t;
/*
* Print out the command history for a specific pool.
*/
static int
get_history_one(zpool_handle_t *zhp, void *data)
{
nvlist_t *nvhis;
nvlist_t **records;
uint_t numrecords;
int ret, i;
hist_cbdata_t *cb = (hist_cbdata_t *)data;
cb->first = B_FALSE;
(void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp));
if ((ret = zpool_get_history(zhp, &nvhis)) != 0)
return (ret);
verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD,
&records, &numrecords) == 0);
for (i = 0; i < numrecords; i++) {
nvlist_t *rec = records[i];
char tbuf[30] = "";
if (nvlist_exists(rec, ZPOOL_HIST_TIME)) {
time_t tsec;
struct tm t;
tsec = fnvlist_lookup_uint64(records[i],
ZPOOL_HIST_TIME);
(void) localtime_r(&tsec, &t);
(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
}
if (nvlist_exists(rec, ZPOOL_HIST_CMD)) {
(void) printf("%s %s", tbuf,
fnvlist_lookup_string(rec, ZPOOL_HIST_CMD));
} else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) {
int ievent =
fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT);
if (!cb->internal)
continue;
if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) {
(void) printf("%s unrecognized record:\n",
tbuf);
dump_nvlist(rec, 4);
continue;
}
(void) printf("%s [internal %s txg:%lld] %s", tbuf,
zfs_history_event_names[ievent],
fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG),
fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR));
} else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) {
if (!cb->internal)
continue;
(void) printf("%s [txg:%lld] %s", tbuf,
fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG),
fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME));
if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) {
(void) printf(" %s (%llu)",
fnvlist_lookup_string(rec,
ZPOOL_HIST_DSNAME),
fnvlist_lookup_uint64(rec,
ZPOOL_HIST_DSID));
}
(void) printf(" %s", fnvlist_lookup_string(rec,
ZPOOL_HIST_INT_STR));
} else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) {
if (!cb->internal)
continue;
(void) printf("%s ioctl %s\n", tbuf,
fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL));
if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) {
(void) printf(" input:\n");
dump_nvlist(fnvlist_lookup_nvlist(rec,
ZPOOL_HIST_INPUT_NVL), 8);
}
if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) {
(void) printf(" output:\n");
dump_nvlist(fnvlist_lookup_nvlist(rec,
ZPOOL_HIST_OUTPUT_NVL), 8);
}
if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) {
(void) printf(" errno: %lld\n",
fnvlist_lookup_int64(rec,
ZPOOL_HIST_ERRNO));
}
} else {
if (!cb->internal)
continue;
(void) printf("%s unrecognized record:\n", tbuf);
dump_nvlist(rec, 4);
}
if (!cb->longfmt) {
(void) printf("\n");
continue;
}
(void) printf(" [");
if (nvlist_exists(rec, ZPOOL_HIST_WHO)) {
uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO);
struct passwd *pwd = getpwuid(who);
(void) printf("user %d ", (int)who);
if (pwd != NULL)
(void) printf("(%s) ", pwd->pw_name);
}
if (nvlist_exists(rec, ZPOOL_HIST_HOST)) {
(void) printf("on %s",
fnvlist_lookup_string(rec, ZPOOL_HIST_HOST));
}
if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) {
(void) printf(":%s",
fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE));
}
(void) printf("]");
(void) printf("\n");
}
(void) printf("\n");
nvlist_free(nvhis);
return (ret);
}
/*
* zpool history <pool>
*
* Displays the history of commands that modified pools.
*/
int
zpool_do_history(int argc, char **argv)
{
hist_cbdata_t cbdata = { 0 };
int ret;
int c;
cbdata.first = B_TRUE;
/* check options */
while ((c = getopt(argc, argv, "li")) != -1) {
switch (c) {
case 'l':
cbdata.longfmt = B_TRUE;
break;
case 'i':
cbdata.internal = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one,
&cbdata);
if (argc == 0 && cbdata.first == B_TRUE) {
(void) printf(gettext("no pools available\n"));
return (0);
}
return (ret);
}
static int
get_callback(zpool_handle_t *zhp, void *data)
{
zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data;
char value[MAXNAMELEN];
zprop_source_t srctype;
zprop_list_t *pl;
for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {
/*
* Skip the special fake placeholder. This will also skip
* over the name property when 'all' is specified.
*/
if (pl->pl_prop == ZPOOL_PROP_NAME &&
pl == cbp->cb_proplist)
continue;
if (pl->pl_prop == ZPROP_INVAL &&
(zpool_prop_feature(pl->pl_user_prop) ||
zpool_prop_unsupported(pl->pl_user_prop))) {
srctype = ZPROP_SRC_LOCAL;
if (zpool_prop_get_feature(zhp, pl->pl_user_prop,
value, sizeof (value)) == 0) {
zprop_print_one_property(zpool_get_name(zhp),
cbp, pl->pl_user_prop, value, srctype,
NULL, NULL);
}
} else {
if (zpool_get_prop(zhp, pl->pl_prop, value,
sizeof (value), &srctype, cbp->cb_literal) != 0)
continue;
zprop_print_one_property(zpool_get_name(zhp), cbp,
zpool_prop_to_name(pl->pl_prop), value, srctype,
NULL, NULL);
}
}
return (0);
}
/*
* zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> <pool> ...
*
* -H Scripted mode. Don't display headers, and separate properties
* by a single tab.
* -o List of columns to display. Defaults to
* "name,property,value,source".
* -p Diplay values in parsable (exact) format.
*
* Get properties of pools in the system. Output space statistics
* for each one as well as other attributes.
*/
int
zpool_do_get(int argc, char **argv)
{
zprop_get_cbdata_t cb = { 0 };
zprop_list_t fake_name = { 0 };
int ret;
int c, i;
char *value;
cb.cb_first = B_TRUE;
/*
* Set up default columns and sources.
*/
cb.cb_sources = ZPROP_SRC_ALL;
cb.cb_columns[0] = GET_COL_NAME;
cb.cb_columns[1] = GET_COL_PROPERTY;
cb.cb_columns[2] = GET_COL_VALUE;
cb.cb_columns[3] = GET_COL_SOURCE;
cb.cb_type = ZFS_TYPE_POOL;
/* check options */
while ((c = getopt(argc, argv, ":Hpo:")) != -1) {
switch (c) {
case 'p':
cb.cb_literal = B_TRUE;
break;
case 'H':
cb.cb_scripted = B_TRUE;
break;
case 'o':
bzero(&cb.cb_columns, sizeof (cb.cb_columns));
i = 0;
while (*optarg != '\0') {
static char *col_subopts[] =
{ "name", "property", "value", "source",
"all", NULL };
if (i == ZFS_GET_NCOLS) {
(void) fprintf(stderr, gettext("too "
"many fields given to -o "
"option\n"));
usage(B_FALSE);
}
switch (getsubopt(&optarg, col_subopts,
&value)) {
case 0:
cb.cb_columns[i++] = GET_COL_NAME;
break;
case 1:
cb.cb_columns[i++] = GET_COL_PROPERTY;
break;
case 2:
cb.cb_columns[i++] = GET_COL_VALUE;
break;
case 3:
cb.cb_columns[i++] = GET_COL_SOURCE;
break;
case 4:
if (i > 0) {
(void) fprintf(stderr,
gettext("\"all\" conflicts "
"with specific fields "
"given to -o option\n"));
usage(B_FALSE);
}
cb.cb_columns[0] = GET_COL_NAME;
cb.cb_columns[1] = GET_COL_PROPERTY;
cb.cb_columns[2] = GET_COL_VALUE;
cb.cb_columns[3] = GET_COL_SOURCE;
i = ZFS_GET_NCOLS;
break;
default:
(void) fprintf(stderr,
gettext("invalid column name "
"'%s'\n"), suboptarg);
usage(B_FALSE);
}
}
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (argc < 1) {
(void) fprintf(stderr, gettext("missing property "
"argument\n"));
usage(B_FALSE);
}
if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist,
ZFS_TYPE_POOL) != 0)
usage(B_FALSE);
argc--;
argv++;
if (cb.cb_proplist != NULL) {
fake_name.pl_prop = ZPOOL_PROP_NAME;
fake_name.pl_width = strlen(gettext("NAME"));
fake_name.pl_next = cb.cb_proplist;
cb.cb_proplist = &fake_name;
}
ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
get_callback, &cb);
if (cb.cb_proplist == &fake_name)
zprop_free_list(fake_name.pl_next);
else
zprop_free_list(cb.cb_proplist);
return (ret);
}
typedef struct set_cbdata {
char *cb_propname;
char *cb_value;
boolean_t cb_any_successful;
} set_cbdata_t;
int
set_callback(zpool_handle_t *zhp, void *data)
{
int error;
set_cbdata_t *cb = (set_cbdata_t *)data;
error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value);
if (!error)
cb->cb_any_successful = B_TRUE;
return (error);
}
int
zpool_do_set(int argc, char **argv)
{
set_cbdata_t cb = { 0 };
int error;
if (argc > 1 && argv[1][0] == '-') {
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
argv[1][1]);
usage(B_FALSE);
}
if (argc < 2) {
(void) fprintf(stderr, gettext("missing property=value "
"argument\n"));
usage(B_FALSE);
}
if (argc < 3) {
(void) fprintf(stderr, gettext("missing pool name\n"));
usage(B_FALSE);
}
if (argc > 3) {
(void) fprintf(stderr, gettext("too many pool names\n"));
usage(B_FALSE);
}
cb.cb_propname = argv[1];
cb.cb_value = strchr(cb.cb_propname, '=');
if (cb.cb_value == NULL) {
(void) fprintf(stderr, gettext("missing value in "
"property=value argument\n"));
usage(B_FALSE);
}
*(cb.cb_value) = '\0';
cb.cb_value++;
error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL,
set_callback, &cb);
return (error);
}
static int
find_command_idx(char *command, int *idx)
{
int i;
for (i = 0; i < NCOMMAND; i++) {
if (command_table[i].name == NULL)
continue;
if (strcmp(command, command_table[i].name) == 0) {
*idx = i;
return (0);
}
}
return (1);
}
int
main(int argc, char **argv)
{
int ret = 0;
int i;
char *cmdname;
(void) setlocale(LC_ALL, "");
(void) textdomain(TEXT_DOMAIN);
if ((g_zfs = libzfs_init()) == NULL) {
(void) fprintf(stderr, gettext("internal error: failed to "
"initialize ZFS library\n"));
return (1);
}
libzfs_print_on_error(g_zfs, B_TRUE);
opterr = 0;
/*
* Make sure the user has specified some command.
*/
if (argc < 2) {
(void) fprintf(stderr, gettext("missing command\n"));
usage(B_FALSE);
}
cmdname = argv[1];
/*
* Special case '-?'
*/
if (strcmp(cmdname, "-?") == 0)
usage(B_TRUE);
zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
/*
* Run the appropriate command.
*/
if (find_command_idx(cmdname, &i) == 0) {
current_command = &command_table[i];
ret = command_table[i].func(argc - 1, argv + 1);
} else if (strchr(cmdname, '=')) {
verify(find_command_idx("set", &i) == 0);
current_command = &command_table[i];
ret = command_table[i].func(argc, argv);
} else if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
/*
* 'freeze' is a vile debugging abomination, so we treat
* it as such.
*/
zfs_cmd_t zc = { 0 };
(void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name));
return (!!zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc));
} else {
(void) fprintf(stderr, gettext("unrecognized "
"command '%s'\n"), cmdname);
usage(B_FALSE);
}
if (ret == 0 && log_history)
(void) zpool_log_history(g_zfs, history_str);
libzfs_fini(g_zfs);
/*
* The 'ZFS_ABORT' environment variable causes us to dump core on exit
* for the purposes of running ::findleaks.
*/
if (getenv("ZFS_ABORT") != NULL) {
(void) printf("dumping core by request\n");
abort();
}
return (ret);
}
Index: stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c
===================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c (revision 332525)
@@ -1,6425 +1,6496 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
*/
/*
* The objective of this program is to provide a DMU/ZAP/SPA stress test
* that runs entirely in userland, is easy to use, and easy to extend.
*
* The overall design of the ztest program is as follows:
*
* (1) For each major functional area (e.g. adding vdevs to a pool,
* creating and destroying datasets, reading and writing objects, etc)
* we have a simple routine to test that functionality. These
* individual routines do not have to do anything "stressful".
*
* (2) We turn these simple functionality tests into a stress test by
* running them all in parallel, with as many threads as desired,
* and spread across as many datasets, objects, and vdevs as desired.
*
* (3) While all this is happening, we inject faults into the pool to
* verify that self-healing data really works.
*
* (4) Every time we open a dataset, we change its checksum and compression
* functions. Thus even individual objects vary from block to block
* in which checksum they use and whether they're compressed.
*
* (5) To verify that we never lose on-disk consistency after a crash,
* we run the entire test in a child of the main process.
* At random times, the child self-immolates with a SIGKILL.
* This is the software equivalent of pulling the power cord.
* The parent then runs the test again, using the existing
* storage pool, as many times as desired. If backwards compatibility
* testing is enabled ztest will sometimes run the "older" version
* of ztest after a SIGKILL.
*
* (6) To verify that we don't have future leaks or temporal incursions,
* many of the functional tests record the transaction group number
* as part of their data. When reading old data, they verify that
* the transaction group number is less than the current, open txg.
* If you add a new test, please do this if applicable.
*
* When run with no arguments, ztest runs for about five minutes and
* produces no output if successful. To get a little bit of information,
* specify -V. To get more information, specify -VV, and so on.
*
* To turn this into an overnight stress test, use -T to specify run time.
*
* You can ask more more vdevs [-v], datasets [-d], or threads [-t]
* to increase the pool capacity, fanout, and overall stress level.
*
* Use the -k option to set the desired frequency of kills.
*
* When ztest invokes itself it passes all relevant information through a
* temporary file which is mmap-ed in the child process. This allows shared
* memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
* stored at offset 0 of this file and contains information on the size and
* number of shared structures in the file. The information stored in this file
* must remain backwards compatible with older versions of ztest so that
* ztest can invoke them during backwards compatibility testing (-B).
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <umem.h>
#include <dlfcn.h>
#include <ctype.h>
#include <math.h>
#include <errno.h>
#include <sys/fs/zfs.h>
#include <libnvpair.h>
#include <libcmdutils.h>
static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;
typedef struct ztest_shared_hdr {
uint64_t zh_hdr_size;
uint64_t zh_opts_size;
uint64_t zh_size;
uint64_t zh_stats_size;
uint64_t zh_stats_count;
uint64_t zh_ds_size;
uint64_t zh_ds_count;
} ztest_shared_hdr_t;
static ztest_shared_hdr_t *ztest_shared_hdr;
typedef struct ztest_shared_opts {
char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
char zo_alt_ztest[MAXNAMELEN];
char zo_alt_libpath[MAXNAMELEN];
uint64_t zo_vdevs;
uint64_t zo_vdevtime;
size_t zo_vdev_size;
int zo_ashift;
int zo_mirrors;
int zo_raidz;
int zo_raidz_parity;
int zo_datasets;
int zo_threads;
uint64_t zo_passtime;
uint64_t zo_killrate;
int zo_verbose;
int zo_init;
uint64_t zo_time;
uint64_t zo_maxloops;
uint64_t zo_metaslab_gang_bang;
} ztest_shared_opts_t;
static const ztest_shared_opts_t ztest_opts_defaults = {
.zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
.zo_dir = { '/', 't', 'm', 'p', '\0' },
.zo_alt_ztest = { '\0' },
.zo_alt_libpath = { '\0' },
.zo_vdevs = 5,
.zo_ashift = SPA_MINBLOCKSHIFT,
.zo_mirrors = 2,
.zo_raidz = 4,
.zo_raidz_parity = 1,
.zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */
.zo_datasets = 7,
.zo_threads = 23,
.zo_passtime = 60, /* 60 seconds */
.zo_killrate = 70, /* 70% kill rate */
.zo_verbose = 0,
.zo_init = 1,
.zo_time = 300, /* 5 minutes */
.zo_maxloops = 50, /* max loops during spa_freeze() */
.zo_metaslab_gang_bang = 32 << 10
};
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern boolean_t zfs_abd_scatter_enabled;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
typedef struct ztest_shared_ds {
uint64_t zd_seq;
} ztest_shared_ds_t;
static ztest_shared_ds_t *ztest_shared_ds;
#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])
#define BT_MAGIC 0x123456789abcdefULL
#define MAXFAULTS() \
(MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
enum ztest_io_type {
ZTEST_IO_WRITE_TAG,
ZTEST_IO_WRITE_PATTERN,
ZTEST_IO_WRITE_ZEROES,
ZTEST_IO_TRUNCATE,
ZTEST_IO_SETATTR,
ZTEST_IO_REWRITE,
ZTEST_IO_TYPES
};
typedef struct ztest_block_tag {
uint64_t bt_magic;
uint64_t bt_objset;
uint64_t bt_object;
uint64_t bt_offset;
uint64_t bt_gen;
uint64_t bt_txg;
uint64_t bt_crtxg;
} ztest_block_tag_t;
typedef struct bufwad {
uint64_t bw_index;
uint64_t bw_txg;
uint64_t bw_data;
} bufwad_t;
/*
* XXX -- fix zfs range locks to be generic so we can use them here.
*/
typedef enum {
RL_READER,
RL_WRITER,
RL_APPEND
} rl_type_t;
typedef struct rll {
void *rll_writer;
int rll_readers;
mutex_t rll_lock;
cond_t rll_cv;
} rll_t;
typedef struct rl {
uint64_t rl_object;
uint64_t rl_offset;
uint64_t rl_size;
rll_t *rl_lock;
} rl_t;
#define ZTEST_RANGE_LOCKS 64
#define ZTEST_OBJECT_LOCKS 64
/*
* Object descriptor. Used as a template for object lookup/create/remove.
*/
typedef struct ztest_od {
uint64_t od_dir;
uint64_t od_object;
dmu_object_type_t od_type;
dmu_object_type_t od_crtype;
uint64_t od_blocksize;
uint64_t od_crblocksize;
uint64_t od_gen;
uint64_t od_crgen;
char od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;
/*
* Per-dataset state.
*/
typedef struct ztest_ds {
ztest_shared_ds_t *zd_shared;
objset_t *zd_os;
rwlock_t zd_zilog_lock;
zilog_t *zd_zilog;
ztest_od_t *zd_od; /* debugging aid */
char zd_name[ZFS_MAX_DATASET_NAME_LEN];
mutex_t zd_dirobj_lock;
rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;
/*
* Per-iteration state.
*/
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
typedef struct ztest_info {
ztest_func_t *zi_func; /* test function */
uint64_t zi_iters; /* iterations per execution */
uint64_t *zi_interval; /* execute every <interval> seconds */
} ztest_info_t;
typedef struct ztest_shared_callstate {
uint64_t zc_count; /* per-pass count */
uint64_t zc_time; /* per-pass time */
uint64_t zc_next; /* next time to call this function */
} ztest_shared_callstate_t;
static ztest_shared_callstate_t *ztest_shared_callstate;
#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])
/*
* Note: these aren't static because we want dladdr() to work.
*/
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_ddt_repair;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_spa_rename;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
+ztest_func_t ztest_device_removal;
+ztest_func_t ztest_remap_blocks;
uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
uint64_t zopt_often = 1ULL * NANOSEC; /* every second */
uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */
uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */
ztest_info_t ztest_info[] = {
{ ztest_dmu_read_write, 1, &zopt_always },
{ ztest_dmu_write_parallel, 10, &zopt_always },
{ ztest_dmu_object_alloc_free, 1, &zopt_always },
{ ztest_dmu_commit_callbacks, 1, &zopt_always },
{ ztest_zap, 30, &zopt_always },
{ ztest_zap_parallel, 100, &zopt_always },
{ ztest_split_pool, 1, &zopt_always },
{ ztest_zil_commit, 1, &zopt_incessant },
{ ztest_zil_remount, 1, &zopt_sometimes },
{ ztest_dmu_read_write_zcopy, 1, &zopt_often },
{ ztest_dmu_objset_create_destroy, 1, &zopt_often },
{ ztest_dsl_prop_get_set, 1, &zopt_often },
{ ztest_spa_prop_get_set, 1, &zopt_sometimes },
#if 0
{ ztest_dmu_prealloc, 1, &zopt_sometimes },
#endif
{ ztest_fzap, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
{ ztest_spa_create_destroy, 1, &zopt_sometimes },
{ ztest_fault_inject, 1, &zopt_sometimes },
{ ztest_ddt_repair, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
{ ztest_reguid, 1, &zopt_rarely },
{ ztest_spa_rename, 1, &zopt_rarely },
{ ztest_scrub, 1, &zopt_rarely },
{ ztest_spa_upgrade, 1, &zopt_rarely },
{ ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
{ ztest_vdev_attach_detach, 1, &zopt_sometimes },
{ ztest_vdev_LUN_growth, 1, &zopt_rarely },
{ ztest_vdev_add_remove, 1,
&ztest_opts.zo_vdevtime },
{ ztest_vdev_aux_add_remove, 1,
&ztest_opts.zo_vdevtime },
+ { ztest_device_removal, 1, &zopt_sometimes },
+ { ztest_remap_blocks, 1, &zopt_sometimes }
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
/*
* The following struct is used to hold a list of uncalled commit callbacks.
* The callbacks are ordered by txg number.
*/
typedef struct ztest_cb_list {
mutex_t zcl_callbacks_lock;
list_t zcl_callbacks;
} ztest_cb_list_t;
/*
* Stuff we need to share writably between parent and child.
*/
typedef struct ztest_shared {
boolean_t zs_do_init;
hrtime_t zs_proc_start;
hrtime_t zs_proc_stop;
hrtime_t zs_thread_start;
hrtime_t zs_thread_stop;
hrtime_t zs_thread_kill;
uint64_t zs_enospc_count;
uint64_t zs_vdev_next_leaf;
uint64_t zs_vdev_aux;
uint64_t zs_alloc;
uint64_t zs_space;
uint64_t zs_splits;
uint64_t zs_mirrors;
uint64_t zs_metaslab_sz;
uint64_t zs_metaslab_df_alloc_threshold;
uint64_t zs_guid;
} ztest_shared_t;
#define ID_PARALLEL -1ULL
static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
ztest_shared_t *ztest_shared;
static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;
static mutex_t ztest_vdev_lock;
/*
* The ztest_name_lock protects the pool and dataset namespace used by
* the individual tests. To modify the namespace, consumers must grab
* this lock as writer. Grabbing the lock as reader will ensure that the
* namespace does not change while the lock is held.
*/
static rwlock_t ztest_name_lock;
static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;
/* Global commit callback list */
static ztest_cb_list_t zcl;
enum ztest_object {
ZTEST_META_DNODE = 0,
ZTEST_DIROBJ,
ZTEST_OBJECTS
};
static void usage(boolean_t) __NORETURN;
/*
* These libumem hooks provide a reasonable set of defaults for the allocator's
* debugging facilities.
*/
const char *
_umem_debug_init()
{
return ("default,verbose"); /* $UMEM_DEBUG setting */
}
const char *
_umem_logging_init(void)
{
return ("fail,contents"); /* $UMEM_LOGGING setting */
}
#define FATAL_MSG_SZ 1024
char *fatal_msg;
static void
fatal(int do_perror, char *message, ...)
{
va_list args;
int save_errno = errno;
char buf[FATAL_MSG_SZ];
(void) fflush(stdout);
va_start(args, message);
(void) sprintf(buf, "ztest: ");
/* LINTED */
(void) vsprintf(buf + strlen(buf), message, args);
va_end(args);
if (do_perror) {
(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
": %s", strerror(save_errno));
}
(void) fprintf(stderr, "%s\n", buf);
fatal_msg = buf; /* to ease debugging */
if (ztest_dump_core)
abort();
exit(3);
}
static int
str2shift(const char *buf)
{
const char *ends = "BKMGTPEZ";
int i;
if (buf[0] == '\0')
return (0);
for (i = 0; i < strlen(ends); i++) {
if (toupper(buf[0]) == ends[i])
break;
}
if (i == strlen(ends)) {
(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
buf);
usage(B_FALSE);
}
if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
return (10*i);
}
(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
usage(B_FALSE);
/* NOTREACHED */
}
static uint64_t
nicenumtoull(const char *buf)
{
char *end;
uint64_t val;
val = strtoull(buf, &end, 0);
if (end == buf) {
(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
usage(B_FALSE);
} else if (end[0] == '.') {
double fval = strtod(buf, &end);
fval *= pow(2, str2shift(end));
if (fval > UINT64_MAX) {
(void) fprintf(stderr, "ztest: value too large: %s\n",
buf);
usage(B_FALSE);
}
val = (uint64_t)fval;
} else {
int shift = str2shift(end);
if (shift >= 64 || (val << shift) >> shift != val) {
(void) fprintf(stderr, "ztest: value too large: %s\n",
buf);
usage(B_FALSE);
}
val <<= shift;
}
return (val);
}
static void
usage(boolean_t requested)
{
const ztest_shared_opts_t *zo = &ztest_opts_defaults;
char nice_vdev_size[NN_NUMBUF_SZ];
char nice_gang_bang[NN_NUMBUF_SZ];
FILE *fp = requested ? stdout : stderr;
nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size));
nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang,
sizeof (nice_gang_bang));
(void) fprintf(fp, "Usage: %s\n"
"\t[-v vdevs (default: %llu)]\n"
"\t[-s size_of_each_vdev (default: %s)]\n"
"\t[-a alignment_shift (default: %d)] use 0 for random\n"
"\t[-m mirror_copies (default: %d)]\n"
"\t[-r raidz_disks (default: %d)]\n"
"\t[-R raidz_parity (default: %d)]\n"
"\t[-d datasets (default: %d)]\n"
"\t[-t threads (default: %d)]\n"
"\t[-g gang_block_threshold (default: %s)]\n"
"\t[-i init_count (default: %d)] initialize pool i times\n"
"\t[-k kill_percentage (default: %llu%%)]\n"
"\t[-p pool_name (default: %s)]\n"
"\t[-f dir (default: %s)] file directory for vdev files\n"
"\t[-V] verbose (use multiple times for ever more blather)\n"
"\t[-E] use existing pool instead of creating new one\n"
"\t[-T time (default: %llu sec)] total run time\n"
"\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
"\t[-P passtime (default: %llu sec)] time per pass\n"
"\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
"\t[-o variable=value] ... set global variable to an unsigned\n"
"\t 32-bit integer value\n"
"\t[-h] (print help)\n"
"",
zo->zo_pool,
(u_longlong_t)zo->zo_vdevs, /* -v */
nice_vdev_size, /* -s */
zo->zo_ashift, /* -a */
zo->zo_mirrors, /* -m */
zo->zo_raidz, /* -r */
zo->zo_raidz_parity, /* -R */
zo->zo_datasets, /* -d */
zo->zo_threads, /* -t */
nice_gang_bang, /* -g */
zo->zo_init, /* -i */
(u_longlong_t)zo->zo_killrate, /* -k */
zo->zo_pool, /* -p */
zo->zo_dir, /* -f */
(u_longlong_t)zo->zo_time, /* -T */
(u_longlong_t)zo->zo_maxloops, /* -F */
(u_longlong_t)zo->zo_passtime);
exit(requested ? 0 : 1);
}
static void
process_options(int argc, char **argv)
{
char *path;
ztest_shared_opts_t *zo = &ztest_opts;
int opt;
uint64_t value;
char altdir[MAXNAMELEN] = { 0 };
bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
while ((opt = getopt(argc, argv,
"v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:o:")) != EOF) {
value = 0;
switch (opt) {
case 'v':
case 's':
case 'a':
case 'm':
case 'r':
case 'R':
case 'd':
case 't':
case 'g':
case 'i':
case 'k':
case 'T':
case 'P':
case 'F':
value = nicenumtoull(optarg);
}
switch (opt) {
case 'v':
zo->zo_vdevs = value;
break;
case 's':
zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
break;
case 'a':
zo->zo_ashift = value;
break;
case 'm':
zo->zo_mirrors = value;
break;
case 'r':
zo->zo_raidz = MAX(1, value);
break;
case 'R':
zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
break;
case 'd':
zo->zo_datasets = MAX(1, value);
break;
case 't':
zo->zo_threads = MAX(1, value);
break;
case 'g':
zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1,
value);
break;
case 'i':
zo->zo_init = value;
break;
case 'k':
zo->zo_killrate = value;
break;
case 'p':
(void) strlcpy(zo->zo_pool, optarg,
sizeof (zo->zo_pool));
break;
case 'f':
path = realpath(optarg, NULL);
if (path == NULL) {
(void) fprintf(stderr, "error: %s: %s\n",
optarg, strerror(errno));
usage(B_FALSE);
} else {
(void) strlcpy(zo->zo_dir, path,
sizeof (zo->zo_dir));
}
break;
case 'V':
zo->zo_verbose++;
break;
case 'E':
zo->zo_init = 0;
break;
case 'T':
zo->zo_time = value;
break;
case 'P':
zo->zo_passtime = MAX(1, value);
break;
case 'F':
zo->zo_maxloops = MAX(1, value);
break;
case 'B':
(void) strlcpy(altdir, optarg, sizeof (altdir));
break;
case 'o':
if (set_global_var(optarg) != 0)
usage(B_FALSE);
break;
case 'h':
usage(B_TRUE);
break;
case '?':
default:
usage(B_FALSE);
break;
}
}
zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
zo->zo_vdevtime =
(zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
UINT64_MAX >> 2);
if (strlen(altdir) > 0) {
char *cmd;
char *realaltdir;
char *bin;
char *ztest;
char *isa;
int isalen;
cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
VERIFY(NULL != realpath(getexecname(), cmd));
if (0 != access(altdir, F_OK)) {
ztest_dump_core = B_FALSE;
fatal(B_TRUE, "invalid alternate ztest path: %s",
altdir);
}
VERIFY(NULL != realpath(altdir, realaltdir));
/*
* 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
* We want to extract <isa> to determine if we should use
* 32 or 64 bit binaries.
*/
bin = strstr(cmd, "/usr/bin/");
ztest = strstr(bin, "/ztest");
isa = bin + 9;
isalen = ztest - isa;
(void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
"%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
(void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
"%s/usr/lib/%.*s", realaltdir, isalen, isa);
if (0 != access(zo->zo_alt_ztest, X_OK)) {
ztest_dump_core = B_FALSE;
fatal(B_TRUE, "invalid alternate ztest: %s",
zo->zo_alt_ztest);
} else if (0 != access(zo->zo_alt_libpath, X_OK)) {
ztest_dump_core = B_FALSE;
fatal(B_TRUE, "invalid alternate lib directory %s",
zo->zo_alt_libpath);
}
umem_free(cmd, MAXPATHLEN);
umem_free(realaltdir, MAXPATHLEN);
}
}
static void
ztest_kill(ztest_shared_t *zs)
{
zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
/*
* Before we kill off ztest, make sure that the config is updated.
- * See comment above spa_config_sync().
+ * See comment above spa_write_cachefile().
*/
mutex_enter(&spa_namespace_lock);
- spa_config_sync(ztest_spa, B_FALSE, B_FALSE);
+ spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE);
mutex_exit(&spa_namespace_lock);
zfs_dbgmsg_print(FTAG);
(void) kill(getpid(), SIGKILL);
}
static uint64_t
ztest_random(uint64_t range)
{
uint64_t r;
ASSERT3S(ztest_fd_rand, >=, 0);
if (range == 0)
return (0);
if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
fatal(1, "short read from /dev/urandom");
return (r % range);
}
/* ARGSUSED */
static void
ztest_record_enospc(const char *s)
{
ztest_shared->zs_enospc_count++;
}
static uint64_t
ztest_get_ashift(void)
{
if (ztest_opts.zo_ashift == 0)
return (SPA_MINBLOCKSHIFT + ztest_random(5));
return (ztest_opts.zo_ashift);
}
static nvlist_t *
make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
{
char pathbuf[MAXPATHLEN];
uint64_t vdev;
nvlist_t *file;
if (ashift == 0)
ashift = ztest_get_ashift();
if (path == NULL) {
path = pathbuf;
if (aux != NULL) {
vdev = ztest_shared->zs_vdev_aux;
(void) snprintf(path, sizeof (pathbuf),
ztest_aux_template, ztest_opts.zo_dir,
pool == NULL ? ztest_opts.zo_pool : pool,
aux, vdev);
} else {
vdev = ztest_shared->zs_vdev_next_leaf++;
(void) snprintf(path, sizeof (pathbuf),
ztest_dev_template, ztest_opts.zo_dir,
pool == NULL ? ztest_opts.zo_pool : pool, vdev);
}
}
if (size != 0) {
int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd == -1)
fatal(1, "can't open %s", path);
if (ftruncate(fd, size) != 0)
fatal(1, "can't ftruncate %s", path);
(void) close(fd);
}
VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
return (file);
}
static nvlist_t *
make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
uint64_t ashift, int r)
{
nvlist_t *raidz, **child;
int c;
if (r < 2)
return (make_vdev_file(path, aux, pool, size, ashift));
child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);
for (c = 0; c < r; c++)
child[c] = make_vdev_file(path, aux, pool, size, ashift);
VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_RAIDZ) == 0);
VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
ztest_opts.zo_raidz_parity) == 0);
VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
child, r) == 0);
for (c = 0; c < r; c++)
nvlist_free(child[c]);
umem_free(child, r * sizeof (nvlist_t *));
return (raidz);
}
static nvlist_t *
make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
uint64_t ashift, int r, int m)
{
nvlist_t *mirror, **child;
int c;
if (m < 1)
return (make_vdev_raidz(path, aux, pool, size, ashift, r));
child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
for (c = 0; c < m; c++)
child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_MIRROR) == 0);
VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
child, m) == 0);
for (c = 0; c < m; c++)
nvlist_free(child[c]);
umem_free(child, m * sizeof (nvlist_t *));
return (mirror);
}
static nvlist_t *
make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
int log, int r, int m, int t)
{
nvlist_t *root, **child;
int c;
ASSERT(t > 0);
child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
for (c = 0; c < t; c++) {
child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
r, m);
VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
log) == 0);
}
VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
child, t) == 0);
for (c = 0; c < t; c++)
nvlist_free(child[c]);
umem_free(child, t * sizeof (nvlist_t *));
return (root);
}
/*
* Find a random spa version. Returns back a random spa version in the
* range [initial_version, SPA_VERSION_FEATURES].
*/
static uint64_t
ztest_random_spa_version(uint64_t initial_version)
{
uint64_t version = initial_version;
if (version <= SPA_VERSION_BEFORE_FEATURES) {
version = version +
ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
}
if (version > SPA_VERSION_BEFORE_FEATURES)
version = SPA_VERSION_FEATURES;
ASSERT(SPA_VERSION_IS_SUPPORTED(version));
return (version);
}
static int
ztest_random_blocksize(void)
{
uint64_t block_shift;
/*
* Choose a block size >= the ashift.
* If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
*/
int maxbs = SPA_OLD_MAXBLOCKSHIFT;
if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
maxbs = 20;
block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}
static int
ztest_random_ibshift(void)
{
return (DN_MIN_INDBLKSHIFT +
ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}
static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
uint64_t top;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *tvd;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
do {
top = ztest_random(rvd->vdev_children);
tvd = rvd->vdev_child[top];
- } while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
+ } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);
return (top);
}
static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
uint64_t value;
do {
value = zfs_prop_random_value(prop, ztest_random(-1ULL));
} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);
return (value);
}
static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
boolean_t inherit)
{
const char *propname = zfs_prop_to_name(prop);
const char *valname;
char setpoint[MAXPATHLEN];
uint64_t curval;
int error;
error = dsl_prop_set_int(osname, propname,
(inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
return (error);
}
ASSERT0(error);
VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));
if (ztest_opts.zo_verbose >= 6) {
VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
(void) printf("%s %s = %s at '%s'\n",
osname, propname, valname, setpoint);
}
return (error);
}
static int
ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
{
spa_t *spa = ztest_spa;
nvlist_t *props = NULL;
int error;
VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
error = spa_prop_set(spa, props);
nvlist_free(props);
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
return (error);
}
ASSERT0(error);
return (error);
}
static void
ztest_rll_init(rll_t *rll)
{
rll->rll_writer = NULL;
rll->rll_readers = 0;
VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
}
static void
ztest_rll_destroy(rll_t *rll)
{
ASSERT(rll->rll_writer == NULL);
ASSERT(rll->rll_readers == 0);
VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
VERIFY(cond_destroy(&rll->rll_cv) == 0);
}
static void
ztest_rll_lock(rll_t *rll, rl_type_t type)
{
VERIFY(mutex_lock(&rll->rll_lock) == 0);
if (type == RL_READER) {
while (rll->rll_writer != NULL)
(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
rll->rll_readers++;
} else {
while (rll->rll_writer != NULL || rll->rll_readers)
(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
rll->rll_writer = curthread;
}
VERIFY(mutex_unlock(&rll->rll_lock) == 0);
}
static void
ztest_rll_unlock(rll_t *rll)
{
VERIFY(mutex_lock(&rll->rll_lock) == 0);
if (rll->rll_writer) {
ASSERT(rll->rll_readers == 0);
rll->rll_writer = NULL;
} else {
ASSERT(rll->rll_readers != 0);
ASSERT(rll->rll_writer == NULL);
rll->rll_readers--;
}
if (rll->rll_writer == NULL && rll->rll_readers == 0)
VERIFY(cond_broadcast(&rll->rll_cv) == 0);
VERIFY(mutex_unlock(&rll->rll_lock) == 0);
}
static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
ztest_rll_lock(rll, type);
}
static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
ztest_rll_unlock(rll);
}
static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
uint64_t size, rl_type_t type)
{
uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
rl_t *rl;
rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
rl->rl_object = object;
rl->rl_offset = offset;
rl->rl_size = size;
rl->rl_lock = rll;
ztest_rll_lock(rll, type);
return (rl);
}
static void
ztest_range_unlock(rl_t *rl)
{
rll_t *rll = rl->rl_lock;
ztest_rll_unlock(rll);
umem_free(rl, sizeof (*rl));
}
static void
ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
{
zd->zd_os = os;
zd->zd_zilog = dmu_objset_zil(os);
zd->zd_shared = szd;
dmu_objset_name(os, zd->zd_name);
if (zd->zd_shared != NULL)
zd->zd_shared->zd_seq = 0;
VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0);
VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
ztest_rll_init(&zd->zd_object_lock[l]);
for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
ztest_rll_init(&zd->zd_range_lock[l]);
}
static void
ztest_zd_fini(ztest_ds_t *zd)
{
VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
ztest_rll_destroy(&zd->zd_object_lock[l]);
for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
ztest_rll_destroy(&zd->zd_range_lock[l]);
}
#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
uint64_t txg;
int error;
/*
* Attempt to assign tx to some transaction group.
*/
error = dmu_tx_assign(tx, txg_how);
if (error) {
if (error == ERESTART) {
ASSERT(txg_how == TXG_NOWAIT);
dmu_tx_wait(tx);
} else {
ASSERT3U(error, ==, ENOSPC);
ztest_record_enospc(tag);
}
dmu_tx_abort(tx);
return (0);
}
txg = dmu_tx_get_txg(tx);
ASSERT(txg != 0);
return (txg);
}
static void
ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
{
uint64_t *ip = buf;
uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
while (ip < ip_end)
*ip++ = value;
}
static boolean_t
ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
{
uint64_t *ip = buf;
uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
uint64_t diff = 0;
while (ip < ip_end)
diff |= (value - *ip++);
return (diff == 0);
}
static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
bt->bt_magic = BT_MAGIC;
bt->bt_objset = dmu_objset_id(os);
bt->bt_object = object;
bt->bt_offset = offset;
bt->bt_gen = gen;
bt->bt_txg = txg;
bt->bt_crtxg = crtxg;
}
static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
ASSERT3U(bt->bt_object, ==, object);
ASSERT3U(bt->bt_offset, ==, offset);
ASSERT3U(bt->bt_gen, <=, gen);
ASSERT3U(bt->bt_txg, <=, txg);
ASSERT3U(bt->bt_crtxg, ==, crtxg);
}
static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
dmu_object_info_t doi;
ztest_block_tag_t *bt;
dmu_object_info_from_db(db, &doi);
ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
return (bt);
}
/*
* ZIL logging ops
*/
#define lrz_type lr_mode
#define lrz_blocksize lr_uid
#define lrz_ibshift lr_gid
#define lrz_bonustype lr_rdev
#define lrz_bonuslen lr_crtime[1]
static void
ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
{
char *name = (void *)(lr + 1); /* name follows lr */
size_t namesize = strlen(name) + 1;
itx_t *itx;
if (zil_replaying(zd->zd_zilog, tx))
return;
itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
sizeof (*lr) + namesize - sizeof (lr_t));
zil_itx_assign(zd->zd_zilog, itx, tx);
}
static void
ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
{
char *name = (void *)(lr + 1); /* name follows lr */
size_t namesize = strlen(name) + 1;
itx_t *itx;
if (zil_replaying(zd->zd_zilog, tx))
return;
itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
sizeof (*lr) + namesize - sizeof (lr_t));
itx->itx_oid = object;
zil_itx_assign(zd->zd_zilog, itx, tx);
}
static void
ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
{
itx_t *itx;
itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
if (zil_replaying(zd->zd_zilog, tx))
return;
if (lr->lr_length > ZIL_MAX_LOG_DATA)
write_state = WR_INDIRECT;
itx = zil_itx_create(TX_WRITE,
sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
if (write_state == WR_COPIED &&
dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
write_state = WR_NEED_COPY;
}
itx->itx_private = zd;
itx->itx_wr_state = write_state;
itx->itx_sync = (ztest_random(8) == 0);
bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
sizeof (*lr) - sizeof (lr_t));
zil_itx_assign(zd->zd_zilog, itx, tx);
}
static void
ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
{
itx_t *itx;
if (zil_replaying(zd->zd_zilog, tx))
return;
itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
sizeof (*lr) - sizeof (lr_t));
itx->itx_sync = B_FALSE;
zil_itx_assign(zd->zd_zilog, itx, tx);
}
static void
ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
{
itx_t *itx;
if (zil_replaying(zd->zd_zilog, tx))
return;
itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
sizeof (*lr) - sizeof (lr_t));
itx->itx_sync = B_FALSE;
zil_itx_assign(zd->zd_zilog, itx, tx);
}
/*
* ZIL replay ops
*/
static int
ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap)
{
ztest_ds_t *zd = arg1;
lr_create_t *lr = arg2;
char *name = (void *)(lr + 1); /* name follows lr */
objset_t *os = zd->zd_os;
ztest_block_tag_t *bbt;
dmu_buf_t *db;
dmu_tx_t *tx;
uint64_t txg;
int error = 0;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
ASSERT(lr->lr_doid == ZTEST_DIROBJ);
ASSERT(name[0] != '\0');
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
} else {
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
}
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0)
return (ENOSPC);
ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
if (lr->lr_foid == 0) {
lr->lr_foid = zap_create(os,
lr->lrz_type, lr->lrz_bonustype,
lr->lrz_bonuslen, tx);
} else {
error = zap_create_claim(os, lr->lr_foid,
lr->lrz_type, lr->lrz_bonustype,
lr->lrz_bonuslen, tx);
}
} else {
if (lr->lr_foid == 0) {
lr->lr_foid = dmu_object_alloc(os,
lr->lrz_type, 0, lr->lrz_bonustype,
lr->lrz_bonuslen, tx);
} else {
error = dmu_object_claim(os, lr->lr_foid,
lr->lrz_type, 0, lr->lrz_bonustype,
lr->lrz_bonuslen, tx);
}
}
if (error) {
ASSERT3U(error, ==, EEXIST);
ASSERT(zd->zd_zilog->zl_replay);
dmu_tx_commit(tx);
return (error);
}
ASSERT(lr->lr_foid != 0);
if (lr->lrz_type != DMU_OT_ZAP_OTHER)
VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
lr->lrz_blocksize, lr->lrz_ibshift, tx));
VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
bbt = ztest_bt_bonus(db);
dmu_buf_will_dirty(db, tx);
ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
dmu_buf_rele(db, FTAG);
VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
&lr->lr_foid, tx));
(void) ztest_log_create(zd, tx, lr);
dmu_tx_commit(tx);
return (0);
}
static int
ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
{
ztest_ds_t *zd = arg1;
lr_remove_t *lr = arg2;
char *name = (void *)(lr + 1); /* name follows lr */
objset_t *os = zd->zd_os;
dmu_object_info_t doi;
dmu_tx_t *tx;
uint64_t object, txg;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
ASSERT(lr->lr_doid == ZTEST_DIROBJ);
ASSERT(name[0] != '\0');
VERIFY3U(0, ==,
zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
ASSERT(object != 0);
ztest_object_lock(zd, object, RL_WRITER);
VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0) {
ztest_object_unlock(zd, object);
return (ENOSPC);
}
if (doi.doi_type == DMU_OT_ZAP_OTHER) {
VERIFY3U(0, ==, zap_destroy(os, object, tx));
} else {
VERIFY3U(0, ==, dmu_object_free(os, object, tx));
}
VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
(void) ztest_log_remove(zd, tx, lr, object);
dmu_tx_commit(tx);
ztest_object_unlock(zd, object);
return (0);
}
static int
ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
ztest_ds_t *zd = arg1;
lr_write_t *lr = arg2;
objset_t *os = zd->zd_os;
void *data = lr + 1; /* data follows lr */
uint64_t offset, length;
ztest_block_tag_t *bt = data;
ztest_block_tag_t *bbt;
uint64_t gen, txg, lrtxg, crtxg;
dmu_object_info_t doi;
dmu_tx_t *tx;
dmu_buf_t *db;
arc_buf_t *abuf = NULL;
rl_t *rl;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
offset = lr->lr_offset;
length = lr->lr_length;
/* If it's a dmu_sync() block, write the whole block */
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
if (length < blocksize) {
offset -= offset % blocksize;
length = blocksize;
}
}
if (bt->bt_magic == BSWAP_64(BT_MAGIC))
byteswap_uint64_array(bt, sizeof (*bt));
if (bt->bt_magic != BT_MAGIC)
bt = NULL;
ztest_object_lock(zd, lr->lr_foid, RL_READER);
rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
dmu_object_info_from_db(db, &doi);
bbt = ztest_bt_bonus(db);
ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
gen = bbt->bt_gen;
crtxg = bbt->bt_crtxg;
lrtxg = lr->lr_common.lrc_txg;
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
P2PHASE(offset, length) == 0)
abuf = dmu_request_arcbuf(db, length);
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0) {
if (abuf != NULL)
dmu_return_arcbuf(abuf);
dmu_buf_rele(db, FTAG);
ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC);
}
if (bt != NULL) {
/*
* Usually, verify the old data before writing new data --
* but not always, because we also want to verify correct
* behavior when the data was not recently read into cache.
*/
ASSERT(offset % doi.doi_data_block_size == 0);
if (ztest_random(4) != 0) {
int prefetch = ztest_random(2) ?
DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
ztest_block_tag_t rbt;
VERIFY(dmu_read(os, lr->lr_foid, offset,
sizeof (rbt), &rbt, prefetch) == 0);
if (rbt.bt_magic == BT_MAGIC) {
ztest_bt_verify(&rbt, os, lr->lr_foid,
offset, gen, txg, crtxg);
}
}
/*
* Writes can appear to be newer than the bonus buffer because
* the ztest_get_data() callback does a dmu_read() of the
* open-context data, which may be different than the data
* as it was when the write was generated.
*/
if (zd->zd_zilog->zl_replay) {
ztest_bt_verify(bt, os, lr->lr_foid, offset,
MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
bt->bt_crtxg);
}
/*
* Set the bt's gen/txg to the bonus buffer's gen/txg
* so that all of the usual ASSERTs will work.
*/
ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
}
if (abuf == NULL) {
dmu_write(os, lr->lr_foid, offset, length, data, tx);
} else {
bcopy(data, abuf->b_data, length);
dmu_assign_arcbuf(db, offset, abuf, tx);
}
(void) ztest_log_write(zd, tx, lr);
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid);
return (0);
}
static int
ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
{
ztest_ds_t *zd = arg1;
lr_truncate_t *lr = arg2;
objset_t *os = zd->zd_os;
dmu_tx_t *tx;
uint64_t txg;
rl_t *rl;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
ztest_object_lock(zd, lr->lr_foid, RL_READER);
rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
RL_WRITER);
tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0) {
ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC);
}
VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
lr->lr_length, tx) == 0);
(void) ztest_log_truncate(zd, tx, lr);
dmu_tx_commit(tx);
ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid);
return (0);
}
static int
ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
{
ztest_ds_t *zd = arg1;
lr_setattr_t *lr = arg2;
objset_t *os = zd->zd_os;
dmu_tx_t *tx;
dmu_buf_t *db;
ztest_block_tag_t *bbt;
uint64_t txg, lrtxg, crtxg;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, lr->lr_foid);
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0) {
dmu_buf_rele(db, FTAG);
ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC);
}
bbt = ztest_bt_bonus(db);
ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
crtxg = bbt->bt_crtxg;
lrtxg = lr->lr_common.lrc_txg;
if (zd->zd_zilog->zl_replay) {
ASSERT(lr->lr_size != 0);
ASSERT(lr->lr_mode != 0);
ASSERT(lrtxg != 0);
} else {
/*
* Randomly change the size and increment the generation.
*/
lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
sizeof (*bbt);
lr->lr_mode = bbt->bt_gen + 1;
ASSERT(lrtxg == 0);
}
/*
* Verify that the current bonus buffer is not newer than our txg.
*/
ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
MAX(txg, lrtxg), crtxg);
dmu_buf_will_dirty(db, tx);
ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
ASSERT3U(lr->lr_size, <=, db->db_size);
VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
bbt = ztest_bt_bonus(db);
ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
dmu_buf_rele(db, FTAG);
(void) ztest_log_setattr(zd, tx, lr);
dmu_tx_commit(tx);
ztest_object_unlock(zd, lr->lr_foid);
return (0);
}
zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
NULL, /* 0 no such transaction type */
ztest_replay_create, /* TX_CREATE */
NULL, /* TX_MKDIR */
NULL, /* TX_MKXATTR */
NULL, /* TX_SYMLINK */
ztest_replay_remove, /* TX_REMOVE */
NULL, /* TX_RMDIR */
NULL, /* TX_LINK */
NULL, /* TX_RENAME */
ztest_replay_write, /* TX_WRITE */
ztest_replay_truncate, /* TX_TRUNCATE */
ztest_replay_setattr, /* TX_SETATTR */
NULL, /* TX_ACL */
NULL, /* TX_CREATE_ACL */
NULL, /* TX_CREATE_ATTR */
NULL, /* TX_CREATE_ACL_ATTR */
NULL, /* TX_MKDIR_ACL */
NULL, /* TX_MKDIR_ATTR */
NULL, /* TX_MKDIR_ACL_ATTR */
NULL, /* TX_WRITE2 */
};
/*
* ZIL get_data callbacks
*/
static void
ztest_get_done(zgd_t *zgd, int error)
{
ztest_ds_t *zd = zgd->zgd_private;
uint64_t object = zgd->zgd_rl->rl_object;
if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);
ztest_range_unlock(zgd->zgd_rl);
ztest_object_unlock(zd, object);
if (error == 0 && zgd->zgd_bp)
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
umem_free(zgd, sizeof (*zgd));
}
static int
ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
zio_t *zio)
{
ztest_ds_t *zd = arg;
objset_t *os = zd->zd_os;
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
uint64_t txg = lr->lr_common.lrc_txg;
uint64_t crtxg;
dmu_object_info_t doi;
dmu_buf_t *db;
zgd_t *zgd;
int error;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
ztest_object_lock(zd, object, RL_READER);
error = dmu_bonus_hold(os, object, FTAG, &db);
if (error) {
ztest_object_unlock(zd, object);
return (error);
}
crtxg = ztest_bt_bonus(db)->bt_crtxg;
if (crtxg == 0 || crtxg > txg) {
dmu_buf_rele(db, FTAG);
ztest_object_unlock(zd, object);
return (ENOENT);
}
dmu_object_info_from_db(db, &doi);
dmu_buf_rele(db, FTAG);
db = NULL;
zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
zgd->zgd_lwb = lwb;
zgd->zgd_private = zd;
if (buf != NULL) { /* immediate write */
zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
RL_READER);
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
ASSERT(error == 0);
} else {
size = doi.doi_data_block_size;
if (ISP2(size)) {
offset = P2ALIGN(offset, size);
} else {
ASSERT(offset < size);
offset = 0;
}
zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
RL_READER);
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
zgd->zgd_db = db;
zgd->zgd_bp = bp;
ASSERT(db->db_offset == offset);
ASSERT(db->db_size == size);
error = dmu_sync(zio, lr->lr_common.lrc_txg,
ztest_get_done, zgd);
if (error == 0)
return (0);
}
}
ztest_get_done(zgd, error);
return (error);
}
static void *
ztest_lr_alloc(size_t lrsize, char *name)
{
char *lr;
size_t namesize = name ? strlen(name) + 1 : 0;
lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
if (name)
bcopy(name, lr + lrsize, namesize);
return (lr);
}
void
ztest_lr_free(void *lr, size_t lrsize, char *name)
{
size_t namesize = name ? strlen(name) + 1 : 0;
umem_free(lr, lrsize + namesize);
}
/*
* Lookup a bunch of objects. Returns the number of objects not found.
*/
static int
ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
{
int missing = 0;
int error;
ASSERT(_mutex_held(&zd->zd_dirobj_lock));
for (int i = 0; i < count; i++, od++) {
od->od_object = 0;
error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
sizeof (uint64_t), 1, &od->od_object);
if (error) {
ASSERT(error == ENOENT);
ASSERT(od->od_object == 0);
missing++;
} else {
dmu_buf_t *db;
ztest_block_tag_t *bbt;
dmu_object_info_t doi;
ASSERT(od->od_object != 0);
ASSERT(missing == 0); /* there should be no gaps */
ztest_object_lock(zd, od->od_object, RL_READER);
VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
od->od_object, FTAG, &db));
dmu_object_info_from_db(db, &doi);
bbt = ztest_bt_bonus(db);
ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
od->od_type = doi.doi_type;
od->od_blocksize = doi.doi_data_block_size;
od->od_gen = bbt->bt_gen;
dmu_buf_rele(db, FTAG);
ztest_object_unlock(zd, od->od_object);
}
}
return (missing);
}
static int
ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
{
int missing = 0;
ASSERT(_mutex_held(&zd->zd_dirobj_lock));
for (int i = 0; i < count; i++, od++) {
if (missing) {
od->od_object = 0;
missing++;
continue;
}
lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
lr->lr_doid = od->od_dir;
lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */
lr->lrz_type = od->od_crtype;
lr->lrz_blocksize = od->od_crblocksize;
lr->lrz_ibshift = ztest_random_ibshift();
lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
lr->lrz_bonuslen = dmu_bonus_max();
lr->lr_gen = od->od_crgen;
lr->lr_crtime[0] = time(NULL);
if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
ASSERT(missing == 0);
od->od_object = 0;
missing++;
} else {
od->od_object = lr->lr_foid;
od->od_type = od->od_crtype;
od->od_blocksize = od->od_crblocksize;
od->od_gen = od->od_crgen;
ASSERT(od->od_object != 0);
}
ztest_lr_free(lr, sizeof (*lr), od->od_name);
}
return (missing);
}
static int
ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
{
int missing = 0;
int error;
ASSERT(_mutex_held(&zd->zd_dirobj_lock));
od += count - 1;
for (int i = count - 1; i >= 0; i--, od--) {
if (missing) {
missing++;
continue;
}
/*
* No object was found.
*/
if (od->od_object == 0)
continue;
lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
lr->lr_doid = od->od_dir;
if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
ASSERT3U(error, ==, ENOSPC);
missing++;
} else {
od->od_object = 0;
}
ztest_lr_free(lr, sizeof (*lr), od->od_name);
}
return (missing);
}
static int
ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
void *data)
{
lr_write_t *lr;
int error;
lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
lr->lr_foid = object;
lr->lr_offset = offset;
lr->lr_length = size;
lr->lr_blkoff = 0;
BP_ZERO(&lr->lr_blkptr);
bcopy(data, lr + 1, size);
error = ztest_replay_write(zd, lr, B_FALSE);
ztest_lr_free(lr, sizeof (*lr) + size, NULL);
return (error);
}
static int
ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
{
lr_truncate_t *lr;
int error;
lr = ztest_lr_alloc(sizeof (*lr), NULL);
lr->lr_foid = object;
lr->lr_offset = offset;
lr->lr_length = size;
error = ztest_replay_truncate(zd, lr, B_FALSE);
ztest_lr_free(lr, sizeof (*lr), NULL);
return (error);
}
static int
ztest_setattr(ztest_ds_t *zd, uint64_t object)
{
lr_setattr_t *lr;
int error;
lr = ztest_lr_alloc(sizeof (*lr), NULL);
lr->lr_foid = object;
lr->lr_size = 0;
lr->lr_mode = 0;
error = ztest_replay_setattr(zd, lr, B_FALSE);
ztest_lr_free(lr, sizeof (*lr), NULL);
return (error);
}
static void
ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
{
objset_t *os = zd->zd_os;
dmu_tx_t *tx;
uint64_t txg;
rl_t *rl;
txg_wait_synced(dmu_objset_pool(os), 0);
ztest_object_lock(zd, object, RL_READER);
rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, object, offset, size);
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg != 0) {
dmu_prealloc(os, object, offset, size, tx);
dmu_tx_commit(tx);
txg_wait_synced(dmu_objset_pool(os), txg);
} else {
(void) dmu_free_long_range(os, object, offset, size);
}
ztest_range_unlock(rl);
ztest_object_unlock(zd, object);
}
static void
ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
{
int err;
ztest_block_tag_t wbt;
dmu_object_info_t doi;
enum ztest_io_type io_type;
uint64_t blocksize;
void *data;
VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
blocksize = doi.doi_data_block_size;
data = umem_alloc(blocksize, UMEM_NOFAIL);
/*
* Pick an i/o type at random, biased toward writing block tags.
*/
io_type = ztest_random(ZTEST_IO_TYPES);
if (ztest_random(2) == 0)
io_type = ZTEST_IO_WRITE_TAG;
(void) rw_rdlock(&zd->zd_zilog_lock);
switch (io_type) {
case ZTEST_IO_WRITE_TAG:
ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
break;
case ZTEST_IO_WRITE_PATTERN:
(void) memset(data, 'a' + (object + offset) % 5, blocksize);
if (ztest_random(2) == 0) {
/*
* Induce fletcher2 collisions to ensure that
* zio_ddt_collision() detects and resolves them
* when using fletcher2-verify for deduplication.
*/
((uint64_t *)data)[0] ^= 1ULL << 63;
((uint64_t *)data)[4] ^= 1ULL << 63;
}
(void) ztest_write(zd, object, offset, blocksize, data);
break;
case ZTEST_IO_WRITE_ZEROES:
bzero(data, blocksize);
(void) ztest_write(zd, object, offset, blocksize, data);
break;
case ZTEST_IO_TRUNCATE:
(void) ztest_truncate(zd, object, offset, blocksize);
break;
case ZTEST_IO_SETATTR:
(void) ztest_setattr(zd, object);
break;
case ZTEST_IO_REWRITE:
(void) rw_rdlock(&ztest_name_lock);
err = ztest_dsl_prop_set_uint64(zd->zd_name,
ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
B_FALSE);
VERIFY(err == 0 || err == ENOSPC);
err = ztest_dsl_prop_set_uint64(zd->zd_name,
ZFS_PROP_COMPRESSION,
ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
B_FALSE);
VERIFY(err == 0 || err == ENOSPC);
(void) rw_unlock(&ztest_name_lock);
VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
DMU_READ_NO_PREFETCH));
(void) ztest_write(zd, object, offset, blocksize, data);
break;
}
(void) rw_unlock(&zd->zd_zilog_lock);
umem_free(data, blocksize);
}
/*
* Initialize an object description template.
*/
static void
ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
{
od->od_dir = ZTEST_DIROBJ;
od->od_object = 0;
od->od_crtype = type;
od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
od->od_crgen = gen;
od->od_type = DMU_OT_NONE;
od->od_blocksize = 0;
od->od_gen = 0;
(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
tag, (int64_t)id, index);
}
/*
* Lookup or create the objects for a test using the od template.
* If the objects do not all exist, or if 'remove' is specified,
* remove any existing objects and create new ones. Otherwise,
* use the existing objects.
*/
static int
ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
{
int count = size / sizeof (*od);
int rv = 0;
VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
if ((ztest_lookup(zd, od, count) != 0 || remove) &&
(ztest_remove(zd, od, count) != 0 ||
ztest_create(zd, od, count) != 0))
rv = -1;
zd->zd_od = od;
VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
return (rv);
}
/* ARGSUSED */
void
ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
{
zilog_t *zilog = zd->zd_zilog;
(void) rw_rdlock(&zd->zd_zilog_lock);
zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
/*
* Remember the committed values in zd, which is in parent/child
* shared memory. If we die, the next iteration of ztest_run()
* will verify that the log really does contain this record.
*/
mutex_enter(&zilog->zl_lock);
ASSERT(zd->zd_shared != NULL);
ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
mutex_exit(&zilog->zl_lock);
(void) rw_unlock(&zd->zd_zilog_lock);
}
/*
* This function is designed to simulate the operations that occur during a
* mount/unmount operation. We hold the dataset across these operations in an
* attempt to expose any implicit assumptions about ZIL management.
*/
/* ARGSUSED */
void
ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
{
objset_t *os = zd->zd_os;
/*
* We grab the zd_dirobj_lock to ensure that no other thread is
* updating the zil (i.e. adding in-memory log records) and the
* zd_zilog_lock to block any I/O.
*/
VERIFY0(mutex_lock(&zd->zd_dirobj_lock));
(void) rw_wrlock(&zd->zd_zilog_lock);
/* zfsvfs_teardown() */
zil_close(zd->zd_zilog);
/* zfsvfs_setup() */
VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
zil_replay(os, zd, ztest_replay_vector);
(void) rw_unlock(&zd->zd_zilog_lock);
VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
}
/*
* Verify that we can't destroy an active pool, create an existing pool,
* or create a pool with a bad vdev spec.
*/
/* ARGSUSED */
void
ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
{
ztest_shared_opts_t *zo = &ztest_opts;
spa_t *spa;
nvlist_t *nvroot;
/*
* Attempt to create using a bad file.
*/
nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
VERIFY3U(ENOENT, ==,
spa_create("ztest_bad_file", nvroot, NULL, NULL));
nvlist_free(nvroot);
/*
* Attempt to create using a bad mirror.
*/
nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
VERIFY3U(ENOENT, ==,
spa_create("ztest_bad_mirror", nvroot, NULL, NULL));
nvlist_free(nvroot);
/*
* Attempt to create an existing pool. It shouldn't matter
* what's in the nvroot; we should fail with EEXIST.
*/
(void) rw_rdlock(&ztest_name_lock);
nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL));
nvlist_free(nvroot);
VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool));
spa_close(spa, FTAG);
(void) rw_unlock(&ztest_name_lock);
}
/* ARGSUSED */
void
ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa;
uint64_t initial_version = SPA_VERSION_INITIAL;
uint64_t version, newversion;
nvlist_t *nvroot, *props;
char *name;
VERIFY0(mutex_lock(&ztest_vdev_lock));
name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
/*
* Clean up from previous runs.
*/
(void) spa_destroy(name);
nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
/*
* If we're configuring a RAIDZ device then make sure that the
* the initial version is capable of supporting that feature.
*/
switch (ztest_opts.zo_raidz_parity) {
case 0:
case 1:
initial_version = SPA_VERSION_INITIAL;
break;
case 2:
initial_version = SPA_VERSION_RAIDZ2;
break;
case 3:
initial_version = SPA_VERSION_RAIDZ3;
break;
}
/*
* Create a pool with a spa version that can be upgraded. Pick
* a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
*/
do {
version = ztest_random_spa_version(initial_version);
} while (version > SPA_VERSION_BEFORE_FEATURES);
props = fnvlist_alloc();
fnvlist_add_uint64(props,
zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
VERIFY0(spa_create(name, nvroot, props, NULL));
fnvlist_free(nvroot);
fnvlist_free(props);
VERIFY0(spa_open(name, &spa, FTAG));
VERIFY3U(spa_version(spa), ==, version);
newversion = ztest_random_spa_version(version + 1);
if (ztest_opts.zo_verbose >= 4) {
(void) printf("upgrading spa version from %llu to %llu\n",
(u_longlong_t)version, (u_longlong_t)newversion);
}
spa_upgrade(spa, newversion);
VERIFY3U(spa_version(spa), >, version);
VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
zpool_prop_to_name(ZPOOL_PROP_VERSION)));
spa_close(spa, FTAG);
strfree(name);
VERIFY0(mutex_unlock(&ztest_vdev_lock));
}
static vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
vdev_t *mvd;
if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
return (vd);
for (int c = 0; c < vd->vdev_children; c++)
if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
NULL)
return (mvd);
return (NULL);
}
/*
* Find the first available hole which can be used as a top-level.
*/
int
find_vdev_hole(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
int c;
ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *cvd = rvd->vdev_child[c];
if (cvd->vdev_ishole)
break;
}
return (c);
}
/*
* Verify that vdev_add() works as expected.
*/
/* ARGSUSED */
void
ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
{
ztest_shared_t *zs = ztest_shared;
spa_t *spa = ztest_spa;
uint64_t leaves;
uint64_t guid;
nvlist_t *nvroot;
int error;
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
/*
* If we have slogs then remove them 1/4 of the time.
*/
if (spa_has_slogs(spa) && ztest_random(4) == 0) {
/*
* Grab the guid from the head of the log class rotor.
*/
guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
spa_config_exit(spa, SCL_VDEV, FTAG);
/*
* We have to grab the zs_name_lock as writer to
* prevent a race between removing a slog (dmu_objset_find)
* and destroying a dataset. Removing the slog will
* grab a reference on the dataset which may cause
* dmu_objset_destroy() to fail with EBUSY thus
* leaving the dataset in an inconsistent state.
*/
VERIFY(rw_wrlock(&ztest_name_lock) == 0);
error = spa_vdev_remove(spa, guid, B_FALSE);
VERIFY(rw_unlock(&ztest_name_lock) == 0);
if (error && error != EEXIST)
fatal(0, "spa_vdev_remove() = %d", error);
} else {
spa_config_exit(spa, SCL_VDEV, FTAG);
/*
* Make 1/4 of the devices be log devices.
*/
nvroot = make_vdev_root(NULL, NULL, NULL,
ztest_opts.zo_vdev_size, 0,
ztest_random(4) == 0, ztest_opts.zo_raidz,
zs->zs_mirrors, 1);
error = spa_vdev_add(spa, nvroot);
nvlist_free(nvroot);
if (error == ENOSPC)
ztest_record_enospc("spa_vdev_add");
else if (error != 0)
fatal(0, "spa_vdev_add() = %d", error);
}
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
}
/*
* Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
*/
/* ARGSUSED */
void
ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
{
ztest_shared_t *zs = ztest_shared;
spa_t *spa = ztest_spa;
vdev_t *rvd = spa->spa_root_vdev;
spa_aux_vdev_t *sav;
char *aux;
uint64_t guid = 0;
int error;
if (ztest_random(2) == 0) {
sav = &spa->spa_spares;
aux = ZPOOL_CONFIG_SPARES;
} else {
sav = &spa->spa_l2cache;
aux = ZPOOL_CONFIG_L2CACHE;
}
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
if (sav->sav_count != 0 && ztest_random(4) == 0) {
/*
* Pick a random device to remove.
*/
guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
} else {
/*
* Find an unused device we can add.
*/
zs->zs_vdev_aux = 0;
for (;;) {
char path[MAXPATHLEN];
int c;
(void) snprintf(path, sizeof (path), ztest_aux_template,
ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
zs->zs_vdev_aux);
for (c = 0; c < sav->sav_count; c++)
if (strcmp(sav->sav_vdevs[c]->vdev_path,
path) == 0)
break;
if (c == sav->sav_count &&
vdev_lookup_by_path(rvd, path) == NULL)
break;
zs->zs_vdev_aux++;
}
}
spa_config_exit(spa, SCL_VDEV, FTAG);
if (guid == 0) {
/*
* Add a new device.
*/
nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
(ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
error = spa_vdev_add(spa, nvroot);
if (error != 0)
fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
nvlist_free(nvroot);
} else {
/*
* Remove an existing device. Sometimes, dirty its
* vdev state first to make sure we handle removal
* of devices that have pending state changes.
*/
if (ztest_random(2) == 0)
(void) vdev_online(spa, guid, 0, NULL);
error = spa_vdev_remove(spa, guid, B_FALSE);
if (error != 0 && error != EBUSY)
fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
}
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
}
/*
* split a pool if it has mirror tlvdevs
*/
/* ARGSUSED */
void
ztest_split_pool(ztest_ds_t *zd, uint64_t id)
{
ztest_shared_t *zs = ztest_shared;
spa_t *spa = ztest_spa;
vdev_t *rvd = spa->spa_root_vdev;
nvlist_t *tree, **child, *config, *split, **schild;
uint_t c, children, schildren = 0, lastlogid = 0;
int error = 0;
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
/* ensure we have a useable config; mirrors of raidz aren't supported */
if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
return;
}
/* clean up the old pool, if any */
(void) spa_destroy("splitp");
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
/* generate a config from the existing config */
mutex_enter(&spa->spa_props_lock);
VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
&tree) == 0);
mutex_exit(&spa->spa_props_lock);
VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
&children) == 0);
schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
for (c = 0; c < children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
nvlist_t **mchild;
uint_t mchildren;
if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
0) == 0);
VERIFY(nvlist_add_string(schild[schildren],
ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
VERIFY(nvlist_add_uint64(schild[schildren],
ZPOOL_CONFIG_IS_HOLE, 1) == 0);
if (lastlogid == 0)
lastlogid = schildren;
++schildren;
continue;
}
lastlogid = 0;
VERIFY(nvlist_lookup_nvlist_array(child[c],
ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
}
/* OK, create a config that can be used to split */
VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_ROOT) == 0);
VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
lastlogid != 0 ? lastlogid : schildren) == 0);
VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
for (c = 0; c < schildren; c++)
nvlist_free(schild[c]);
free(schild);
nvlist_free(split);
spa_config_exit(spa, SCL_VDEV, FTAG);
(void) rw_wrlock(&ztest_name_lock);
error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
(void) rw_unlock(&ztest_name_lock);
nvlist_free(config);
if (error == 0) {
(void) printf("successful split - results:\n");
mutex_enter(&spa_namespace_lock);
show_pool_stats(spa);
show_pool_stats(spa_lookup("splitp"));
mutex_exit(&spa_namespace_lock);
++zs->zs_splits;
--zs->zs_mirrors;
}
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
}
/*
* Verify that we can attach and detach devices.
*/
/* ARGSUSED */
void
ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
{
ztest_shared_t *zs = ztest_shared;
spa_t *spa = ztest_spa;
spa_aux_vdev_t *sav = &spa->spa_spares;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *pvd;
nvlist_t *root;
uint64_t leaves;
uint64_t leaf, top;
uint64_t ashift = ztest_get_ashift();
uint64_t oldguid, pguid;
uint64_t oldsize, newsize;
char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
int replacing;
int oldvd_has_siblings = B_FALSE;
int newvd_is_spare = B_FALSE;
int oldvd_is_log;
int error, expected_error;
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
+ * If a vdev is in the process of being removed, its removal may
+ * finish while we are in progress, leading to an unexpected error
+ * value. Don't bother trying to attach while we are in the middle
+ * of removal.
+ */
+ if (spa->spa_vdev_removal != NULL) {
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
+ return;
+ }
+
+ /*
* Decide whether to do an attach or a replace.
*/
replacing = ztest_random(2);
/*
* Pick a random top-level vdev.
*/
top = ztest_random_vdev_top(spa, B_TRUE);
/*
* Pick a random leaf within it.
*/
leaf = ztest_random(leaves);
/*
* Locate this vdev.
*/
oldvd = rvd->vdev_child[top];
if (zs->zs_mirrors >= 1) {
ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
}
if (ztest_opts.zo_raidz > 1) {
ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
}
/*
* If we're already doing an attach or replace, oldvd may be a
* mirror vdev -- in which case, pick a random child.
*/
while (oldvd->vdev_children != 0) {
oldvd_has_siblings = B_TRUE;
ASSERT(oldvd->vdev_children >= 2);
oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
}
oldguid = oldvd->vdev_guid;
oldsize = vdev_get_min_asize(oldvd);
oldvd_is_log = oldvd->vdev_top->vdev_islog;
(void) strcpy(oldpath, oldvd->vdev_path);
pvd = oldvd->vdev_parent;
pguid = pvd->vdev_guid;
/*
* If oldvd has siblings, then half of the time, detach it.
*/
if (oldvd_has_siblings && ztest_random(2) == 0) {
- spa_config_exit(spa, SCL_VDEV, FTAG);
+ spa_config_exit(spa, SCL_ALL, FTAG);
error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
if (error != 0 && error != ENODEV && error != EBUSY &&
error != ENOTSUP)
fatal(0, "detach (%s) returned %d", oldpath, error);
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
return;
}
/*
* For the new vdev, choose with equal probability between the two
* standard paths (ending in either 'a' or 'b') or a random hot spare.
*/
if (sav->sav_count != 0 && ztest_random(3) == 0) {
newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
newvd_is_spare = B_TRUE;
(void) strcpy(newpath, newvd->vdev_path);
} else {
(void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
ztest_opts.zo_dir, ztest_opts.zo_pool,
top * leaves + leaf);
if (ztest_random(2) == 0)
newpath[strlen(newpath) - 1] = 'b';
newvd = vdev_lookup_by_path(rvd, newpath);
}
if (newvd) {
+ /*
+ * Reopen to ensure the vdev's asize field isn't stale.
+ */
+ vdev_reopen(newvd);
newsize = vdev_get_min_asize(newvd);
} else {
/*
* Make newsize a little bigger or smaller than oldsize.
* If it's smaller, the attach should fail.
* If it's larger, and we're doing a replace,
* we should get dynamic LUN growth when we're done.
*/
newsize = 10 * oldsize / (9 + ztest_random(3));
}
/*
* If pvd is not a mirror or root, the attach should fail with ENOTSUP,
* unless it's a replace; in that case any non-replacing parent is OK.
*
* If newvd is already part of the pool, it should fail with EBUSY.
*
* If newvd is too small, it should fail with EOVERFLOW.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_root_ops && (!replacing ||
pvd->vdev_ops == &vdev_replacing_ops ||
pvd->vdev_ops == &vdev_spare_ops))
expected_error = ENOTSUP;
else if (newvd_is_spare && (!replacing || oldvd_is_log))
expected_error = ENOTSUP;
else if (newvd == oldvd)
expected_error = replacing ? 0 : EBUSY;
else if (vdev_lookup_by_path(rvd, newpath) != NULL)
expected_error = EBUSY;
else if (newsize < oldsize)
expected_error = EOVERFLOW;
else if (ashift > oldvd->vdev_top->vdev_ashift)
expected_error = EDOM;
else
expected_error = 0;
- spa_config_exit(spa, SCL_VDEV, FTAG);
+ spa_config_exit(spa, SCL_ALL, FTAG);
/*
* Build the nvlist describing newpath.
*/
root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
ashift, 0, 0, 0, 1);
error = spa_vdev_attach(spa, oldguid, root, replacing);
nvlist_free(root);
/*
* If our parent was the replacing vdev, but the replace completed,
* then instead of failing with ENOTSUP we may either succeed,
* fail with ENODEV, or fail with EOVERFLOW.
*/
if (expected_error == ENOTSUP &&
(error == 0 || error == ENODEV || error == EOVERFLOW))
expected_error = error;
/*
* If someone grew the LUN, the replacement may be too small.
*/
if (error == EOVERFLOW || error == EBUSY)
expected_error = error;
/* XXX workaround 6690467 */
if (error != expected_error && expected_error != EBUSY) {
fatal(0, "attach (%s %llu, %s %llu, %d) "
"returned %d, expected %d",
oldpath, oldsize, newpath,
newsize, replacing, error, expected_error);
}
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
}
+/* ARGSUSED */
+void
+ztest_device_removal(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ vdev_t *vd;
+ uint64_t guid;
+
+ VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
+ guid = vd->vdev_guid;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ (void) spa_vdev_remove(spa, guid, B_FALSE);
+
+ VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
+}
+
/*
* Callback function which expands the physical size of the vdev.
*/
vdev_t *
grow_vdev(vdev_t *vd, void *arg)
{
spa_t *spa = vd->vdev_spa;
size_t *newsize = arg;
size_t fsize;
int fd;
ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
ASSERT(vd->vdev_ops->vdev_op_leaf);
if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
return (vd);
fsize = lseek(fd, 0, SEEK_END);
(void) ftruncate(fd, *newsize);
if (ztest_opts.zo_verbose >= 6) {
(void) printf("%s grew from %lu to %lu bytes\n",
vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
}
(void) close(fd);
return (NULL);
}
/*
* Callback function which expands a given vdev by calling vdev_online().
*/
/* ARGSUSED */
vdev_t *
online_vdev(vdev_t *vd, void *arg)
{
spa_t *spa = vd->vdev_spa;
vdev_t *tvd = vd->vdev_top;
uint64_t guid = vd->vdev_guid;
uint64_t generation = spa->spa_config_generation + 1;
vdev_state_t newstate = VDEV_STATE_UNKNOWN;
int error;
ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
ASSERT(vd->vdev_ops->vdev_op_leaf);
/* Calling vdev_online will initialize the new metaslabs */
spa_config_exit(spa, SCL_STATE, spa);
error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
/*
* If vdev_online returned an error or the underlying vdev_open
* failed then we abort the expand. The only way to know that
* vdev_open fails is by checking the returned newstate.
*/
if (error || newstate != VDEV_STATE_HEALTHY) {
if (ztest_opts.zo_verbose >= 5) {
(void) printf("Unable to expand vdev, state %llu, "
"error %d\n", (u_longlong_t)newstate, error);
}
return (vd);
}
ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
/*
* Since we dropped the lock we need to ensure that we're
* still talking to the original vdev. It's possible this
* vdev may have been detached/replaced while we were
* trying to online it.
*/
if (generation != spa->spa_config_generation) {
if (ztest_opts.zo_verbose >= 5) {
(void) printf("vdev configuration has changed, "
"guid %llu, state %llu, expected gen %llu, "
"got gen %llu\n",
(u_longlong_t)guid,
(u_longlong_t)tvd->vdev_state,
(u_longlong_t)generation,
(u_longlong_t)spa->spa_config_generation);
}
return (vd);
}
return (NULL);
}
/*
* Traverse the vdev tree calling the supplied function.
* We continue to walk the tree until we either have walked all
* children or we receive a non-NULL return from the callback.
* If a NULL callback is passed, then we just return back the first
* leaf vdev we encounter.
*/
vdev_t *
vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
{
if (vd->vdev_ops->vdev_op_leaf) {
if (func == NULL)
return (vd);
else
return (func(vd, arg));
}
for (uint_t c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
return (cvd);
}
return (NULL);
}
/*
* Verify that dynamic LUN growth works as expected.
*/
/* ARGSUSED */
void
ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;
vdev_t *vd, *tvd;
metaslab_class_t *mc;
metaslab_group_t *mg;
size_t psize, newsize;
uint64_t top;
uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ /*
+ * If there is a vdev removal in progress, it could complete while
+ * we are running, in which case we would not be able to verify
+ * that the metaslab_class space increased (because it decreases
+ * when the device removal completes).
+ */
+ if (spa->spa_vdev_removal != NULL) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
+ return;
+ }
+
top = ztest_random_vdev_top(spa, B_TRUE);
tvd = spa->spa_root_vdev->vdev_child[top];
mg = tvd->vdev_mg;
mc = mg->mg_class;
old_ms_count = tvd->vdev_ms_count;
old_class_space = metaslab_class_get_space(mc);
/*
* Determine the size of the first leaf vdev associated with
* our top-level device.
*/
vd = vdev_walk_tree(tvd, NULL, NULL);
ASSERT3P(vd, !=, NULL);
ASSERT(vd->vdev_ops->vdev_op_leaf);
psize = vd->vdev_psize;
/*
* We only try to expand the vdev if it's healthy, less than 4x its
* original size, and it has a valid psize.
*/
if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
spa_config_exit(spa, SCL_STATE, spa);
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
return;
}
ASSERT(psize > 0);
newsize = psize + psize / 8;
ASSERT3U(newsize, >, psize);
if (ztest_opts.zo_verbose >= 6) {
(void) printf("Expanding LUN %s from %lu to %lu\n",
vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
}
/*
* Growing the vdev is a two step process:
* 1). expand the physical size (i.e. relabel)
* 2). online the vdev to create the new metaslabs
*/
if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
tvd->vdev_state != VDEV_STATE_HEALTHY) {
if (ztest_opts.zo_verbose >= 5) {
(void) printf("Could not expand LUN because "
"the vdev configuration changed.\n");
}
spa_config_exit(spa, SCL_STATE, spa);
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
return;
}
spa_config_exit(spa, SCL_STATE, spa);
/*
* Expanding the LUN will update the config asynchronously,
* thus we must wait for the async thread to complete any
* pending tasks before proceeding.
*/
for (;;) {
boolean_t done;
mutex_enter(&spa->spa_async_lock);
done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
mutex_exit(&spa->spa_async_lock);
if (done)
break;
txg_wait_synced(spa_get_dsl(spa), 0);
(void) poll(NULL, 0, 100);
}
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
tvd = spa->spa_root_vdev->vdev_child[top];
new_ms_count = tvd->vdev_ms_count;
new_class_space = metaslab_class_get_space(mc);
if (tvd->vdev_mg != mg || mg->mg_class != mc) {
if (ztest_opts.zo_verbose >= 5) {
(void) printf("Could not verify LUN expansion due to "
"intervening vdev offline or remove.\n");
}
spa_config_exit(spa, SCL_STATE, spa);
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
return;
}
/*
* Make sure we were able to grow the vdev.
*/
- if (new_ms_count <= old_ms_count)
- fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
+ if (new_ms_count <= old_ms_count) {
+ fatal(0, "LUN expansion failed: ms_count %llu < %llu\n",
old_ms_count, new_ms_count);
+ }
/*
* Make sure we were able to grow the pool.
*/
- if (new_class_space <= old_class_space)
- fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
+ if (new_class_space <= old_class_space) {
+ fatal(0, "LUN expansion failed: class_space %llu < %llu\n",
old_class_space, new_class_space);
+ }
if (ztest_opts.zo_verbose >= 5) {
char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ];
nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf));
nicenum(new_class_space, newnumbuf, sizeof (newnumbuf));
(void) printf("%s grew from %s to %s\n",
spa->spa_name, oldnumbuf, newnumbuf);
}
spa_config_exit(spa, SCL_STATE, spa);
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
}
/*
* Verify that dmu_objset_{create,destroy,open,close} work as expected.
*/
/* ARGSUSED */
static void
ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
/*
* Create the objects common to all ztest datasets.
*/
VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
}
static int
ztest_dataset_create(char *dsname)
{
uint64_t zilset = ztest_random(100);
int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
ztest_objset_create_cb, NULL);
if (err || zilset < 80)
return (err);
if (ztest_opts.zo_verbose >= 6)
(void) printf("Setting dataset %s to sync always\n", dsname);
return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
ZFS_SYNC_ALWAYS, B_FALSE));
}
/* ARGSUSED */
static int
ztest_objset_destroy_cb(const char *name, void *arg)
{
objset_t *os;
dmu_object_info_t doi;
int error;
/*
* Verify that the dataset contains a directory object.
*/
VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os));
error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
if (error != ENOENT) {
/* We could have crashed in the middle of destroying it */
ASSERT0(error);
ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
}
dmu_objset_disown(os, FTAG);
/*
* Destroy the dataset.
*/
if (strchr(name, '@') != NULL) {
VERIFY0(dsl_destroy_snapshot(name, B_FALSE));
} else {
VERIFY0(dsl_destroy_head(name));
}
return (0);
}
static boolean_t
ztest_snapshot_create(char *osname, uint64_t id)
{
char snapname[ZFS_MAX_DATASET_NAME_LEN];
int error;
(void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
error = dmu_objset_snapshot_one(osname, snapname);
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
return (B_FALSE);
}
if (error != 0 && error != EEXIST) {
fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
snapname, error);
}
return (B_TRUE);
}
static boolean_t
ztest_snapshot_destroy(char *osname, uint64_t id)
{
char snapname[ZFS_MAX_DATASET_NAME_LEN];
int error;
(void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
(u_longlong_t)id);
error = dsl_destroy_snapshot(snapname, B_FALSE);
if (error != 0 && error != ENOENT)
fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
return (B_TRUE);
}
/* ARGSUSED */
void
ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
{
ztest_ds_t zdtmp;
int iters;
int error;
objset_t *os, *os2;
char name[ZFS_MAX_DATASET_NAME_LEN];
zilog_t *zilog;
(void) rw_rdlock(&ztest_name_lock);
(void) snprintf(name, sizeof (name), "%s/temp_%llu",
ztest_opts.zo_pool, (u_longlong_t)id);
/*
* If this dataset exists from a previous run, process its replay log
* half of the time. If we don't replay it, then dmu_objset_destroy()
* (invoked from ztest_objset_destroy_cb()) should just throw it away.
*/
if (ztest_random(2) == 0 &&
dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
ztest_zd_init(&zdtmp, NULL, os);
zil_replay(os, &zdtmp, ztest_replay_vector);
ztest_zd_fini(&zdtmp);
dmu_objset_disown(os, FTAG);
}
/*
* There may be an old instance of the dataset we're about to
* create lying around from a previous run. If so, destroy it
* and all of its snapshots.
*/
(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
/*
* Verify that the destroyed dataset is no longer in the namespace.
*/
VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
FTAG, &os));
/*
* Verify that we can create a new dataset.
*/
error = ztest_dataset_create(name);
if (error) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
(void) rw_unlock(&ztest_name_lock);
return;
}
fatal(0, "dmu_objset_create(%s) = %d", name, error);
}
VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
ztest_zd_init(&zdtmp, NULL, os);
/*
* Open the intent log for it.
*/
zilog = zil_open(os, ztest_get_data);
/*
* Put some objects in there, do a little I/O to them,
* and randomly take a couple of snapshots along the way.
*/
iters = ztest_random(5);
for (int i = 0; i < iters; i++) {
ztest_dmu_object_alloc_free(&zdtmp, id);
if (ztest_random(iters) == 0)
(void) ztest_snapshot_create(name, i);
}
/*
* Verify that we cannot create an existing dataset.
*/
VERIFY3U(EEXIST, ==,
dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
/*
* Verify that we can hold an objset that is also owned.
*/
VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
dmu_objset_rele(os2, FTAG);
/*
* Verify that we cannot own an objset that is already owned.
*/
VERIFY3U(EBUSY, ==,
dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
zil_close(zilog);
dmu_objset_disown(os, FTAG);
ztest_zd_fini(&zdtmp);
(void) rw_unlock(&ztest_name_lock);
}
/*
* Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
*/
void
ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
{
(void) rw_rdlock(&ztest_name_lock);
(void) ztest_snapshot_destroy(zd->zd_name, id);
(void) ztest_snapshot_create(zd->zd_name, id);
(void) rw_unlock(&ztest_name_lock);
}
/*
* Cleanup non-standard snapshots and clones.
*/
void
ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
{
char snap1name[ZFS_MAX_DATASET_NAME_LEN];
char clone1name[ZFS_MAX_DATASET_NAME_LEN];
char snap2name[ZFS_MAX_DATASET_NAME_LEN];
char clone2name[ZFS_MAX_DATASET_NAME_LEN];
char snap3name[ZFS_MAX_DATASET_NAME_LEN];
int error;
(void) snprintf(snap1name, sizeof (snap1name),
"%s@s1_%llu", osname, id);
(void) snprintf(clone1name, sizeof (clone1name),
"%s/c1_%llu", osname, id);
(void) snprintf(snap2name, sizeof (snap2name),
"%s@s2_%llu", clone1name, id);
(void) snprintf(clone2name, sizeof (clone2name),
"%s/c2_%llu", osname, id);
(void) snprintf(snap3name, sizeof (snap3name),
"%s@s3_%llu", clone1name, id);
error = dsl_destroy_head(clone2name);
if (error && error != ENOENT)
fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
error = dsl_destroy_snapshot(snap3name, B_FALSE);
if (error && error != ENOENT)
fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
error = dsl_destroy_snapshot(snap2name, B_FALSE);
if (error && error != ENOENT)
fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
error = dsl_destroy_head(clone1name);
if (error && error != ENOENT)
fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
error = dsl_destroy_snapshot(snap1name, B_FALSE);
if (error && error != ENOENT)
fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
}
/*
* Verify dsl_dataset_promote handles EBUSY
*/
void
ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
{
objset_t *os;
char snap1name[ZFS_MAX_DATASET_NAME_LEN];
char clone1name[ZFS_MAX_DATASET_NAME_LEN];
char snap2name[ZFS_MAX_DATASET_NAME_LEN];
char clone2name[ZFS_MAX_DATASET_NAME_LEN];
char snap3name[ZFS_MAX_DATASET_NAME_LEN];
char *osname = zd->zd_name;
int error;
(void) rw_rdlock(&ztest_name_lock);
ztest_dsl_dataset_cleanup(osname, id);
(void) snprintf(snap1name, sizeof (snap1name),
"%s@s1_%llu", osname, id);
(void) snprintf(clone1name, sizeof (clone1name),
"%s/c1_%llu", osname, id);
(void) snprintf(snap2name, sizeof (snap2name),
"%s@s2_%llu", clone1name, id);
(void) snprintf(clone2name, sizeof (clone2name),
"%s/c2_%llu", osname, id);
(void) snprintf(snap3name, sizeof (snap3name),
"%s@s3_%llu", clone1name, id);
error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
if (error && error != EEXIST) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
}
error = dmu_objset_clone(clone1name, snap1name);
if (error) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
}
error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
if (error && error != EEXIST) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
}
error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
if (error && error != EEXIST) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
}
error = dmu_objset_clone(clone2name, snap3name);
if (error) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
}
error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
if (error)
fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
error = dsl_dataset_promote(clone2name, NULL);
if (error == ENOSPC) {
dmu_objset_disown(os, FTAG);
ztest_record_enospc(FTAG);
goto out;
}
if (error != EBUSY)
fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
error);
dmu_objset_disown(os, FTAG);
out:
ztest_dsl_dataset_cleanup(osname, id);
(void) rw_unlock(&ztest_name_lock);
}
/*
* Verify that dmu_object_{alloc,free} work as expected.
*/
void
ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
{
ztest_od_t od[4];
int batchsize = sizeof (od) / sizeof (od[0]);
for (int b = 0; b < batchsize; b++)
ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
/*
* Destroy the previous batch of objects, create a new batch,
* and do some I/O on the new objects.
*/
if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
return;
while (ztest_random(4 * batchsize) != 0)
ztest_io(zd, od[ztest_random(batchsize)].od_object,
ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
}
/*
* Verify that dmu_{read,write} work as expected.
*/
void
ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
{
objset_t *os = zd->zd_os;
ztest_od_t od[2];
dmu_tx_t *tx;
int i, freeit, error;
uint64_t n, s, txg;
bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
uint64_t regions = 997;
uint64_t stride = 123456789ULL;
uint64_t width = 40;
int free_percent = 5;
/*
* This test uses two objects, packobj and bigobj, that are always
* updated together (i.e. in the same tx) so that their contents are
* in sync and can be compared. Their contents relate to each other
* in a simple way: packobj is a dense array of 'bufwad' structures,
* while bigobj is a sparse array of the same bufwads. Specifically,
* for any index n, there are three bufwads that should be identical:
*
* packobj, at offset n * sizeof (bufwad_t)
* bigobj, at the head of the nth chunk
* bigobj, at the tail of the nth chunk
*
* The chunk size is arbitrary. It doesn't have to be a power of two,
* and it doesn't have any relation to the object blocksize.
* The only requirement is that it can hold at least two bufwads.
*
* Normally, we write the bufwad to each of these locations.
* However, free_percent of the time we instead write zeroes to
* packobj and perform a dmu_free_range() on bigobj. By comparing
* bigobj to packobj, we can verify that the DMU is correctly
* tracking which parts of an object are allocated and free,
* and that the contents of the allocated blocks are correct.
*/
/*
* Read the directory info. If it's the first time, set things up.
*/
ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
bigobj = od[0].od_object;
packobj = od[1].od_object;
chunksize = od[0].od_gen;
ASSERT(chunksize == od[1].od_gen);
/*
* Prefetch a random chunk of the big object.
* Our aim here is to get some async reads in flight
* for blocks that we may free below; the DMU should
* handle this race correctly.
*/
n = ztest_random(regions) * stride + ztest_random(width);
s = 1 + ztest_random(2 * width - 1);
dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
ZIO_PRIORITY_SYNC_READ);
/*
* Pick a random index and compute the offsets into packobj and bigobj.
*/
n = ztest_random(regions) * stride + ztest_random(width);
s = 1 + ztest_random(width - 1);
packoff = n * sizeof (bufwad_t);
packsize = s * sizeof (bufwad_t);
bigoff = n * chunksize;
bigsize = s * chunksize;
packbuf = umem_alloc(packsize, UMEM_NOFAIL);
bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
/*
* free_percent of the time, free a range of bigobj rather than
* overwriting it.
*/
freeit = (ztest_random(100) < free_percent);
/*
* Read the current contents of our objects.
*/
error = dmu_read(os, packobj, packoff, packsize, packbuf,
DMU_READ_PREFETCH);
ASSERT0(error);
error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
DMU_READ_PREFETCH);
ASSERT0(error);
/*
* Get a tx for the mods to both packobj and bigobj.
*/
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, packobj, packoff, packsize);
if (freeit)
dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
else
dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
/* This accounts for setting the checksum/compression. */
dmu_tx_hold_bonus(tx, bigobj);
txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
if (txg == 0) {
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
return;
}
enum zio_checksum cksum;
do {
cksum = (enum zio_checksum)
ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
} while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
dmu_object_set_checksum(os, bigobj, cksum, tx);
enum zio_compress comp;
do {
comp = (enum zio_compress)
ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
} while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
dmu_object_set_compress(os, bigobj, comp, tx);
/*
* For each index from n to n + s, verify that the existing bufwad
* in packobj matches the bufwads at the head and tail of the
* corresponding chunk in bigobj. Then update all three bufwads
* with the new values we want to write out.
*/
for (i = 0; i < s; i++) {
/* LINTED */
pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
/* LINTED */
bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
/* LINTED */
bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
if (pack->bw_txg > txg)
fatal(0, "future leak: got %llx, open txg is %llx",
pack->bw_txg, txg);
if (pack->bw_data != 0 && pack->bw_index != n + i)
fatal(0, "wrong index: got %llx, wanted %llx+%llx",
pack->bw_index, n, i);
if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
if (freeit) {
bzero(pack, sizeof (bufwad_t));
} else {
pack->bw_index = n + i;
pack->bw_txg = txg;
pack->bw_data = 1 + ztest_random(-2ULL);
}
*bigH = *pack;
*bigT = *pack;
}
/*
* We've verified all the old bufwads, and made new ones.
* Now write them out.
*/
dmu_write(os, packobj, packoff, packsize, packbuf, tx);
if (freeit) {
if (ztest_opts.zo_verbose >= 7) {
(void) printf("freeing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
} else {
if (ztest_opts.zo_verbose >= 7) {
(void) printf("writing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
}
dmu_tx_commit(tx);
/*
* Sanity check the stuff we just wrote.
*/
{
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
VERIFY(0 == dmu_read(os, packobj, packoff,
packsize, packcheck, DMU_READ_PREFETCH));
VERIFY(0 == dmu_read(os, bigobj, bigoff,
bigsize, bigcheck, DMU_READ_PREFETCH));
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
umem_free(packcheck, packsize);
umem_free(bigcheck, bigsize);
}
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
}
void
compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
{
uint64_t i;
bufwad_t *pack;
bufwad_t *bigH;
bufwad_t *bigT;
/*
* For each index from n to n + s, verify that the existing bufwad
* in packobj matches the bufwads at the head and tail of the
* corresponding chunk in bigobj. Then update all three bufwads
* with the new values we want to write out.
*/
for (i = 0; i < s; i++) {
/* LINTED */
pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
/* LINTED */
bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
/* LINTED */
bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
if (pack->bw_txg > txg)
fatal(0, "future leak: got %llx, open txg is %llx",
pack->bw_txg, txg);
if (pack->bw_data != 0 && pack->bw_index != n + i)
fatal(0, "wrong index: got %llx, wanted %llx+%llx",
pack->bw_index, n, i);
if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
pack->bw_index = n + i;
pack->bw_txg = txg;
pack->bw_data = 1 + ztest_random(-2ULL);
*bigH = *pack;
*bigT = *pack;
}
}
void
ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
{
objset_t *os = zd->zd_os;
ztest_od_t od[2];
dmu_tx_t *tx;
uint64_t i;
int error;
uint64_t n, s, txg;
bufwad_t *packbuf, *bigbuf;
uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
uint64_t blocksize = ztest_random_blocksize();
uint64_t chunksize = blocksize;
uint64_t regions = 997;
uint64_t stride = 123456789ULL;
uint64_t width = 9;
dmu_buf_t *bonus_db;
arc_buf_t **bigbuf_arcbufs;
dmu_object_info_t doi;
/*
* This test uses two objects, packobj and bigobj, that are always
* updated together (i.e. in the same tx) so that their contents are
* in sync and can be compared. Their contents relate to each other
* in a simple way: packobj is a dense array of 'bufwad' structures,
* while bigobj is a sparse array of the same bufwads. Specifically,
* for any index n, there are three bufwads that should be identical:
*
* packobj, at offset n * sizeof (bufwad_t)
* bigobj, at the head of the nth chunk
* bigobj, at the tail of the nth chunk
*
* The chunk size is set equal to bigobj block size so that
* dmu_assign_arcbuf() can be tested for object updates.
*/
/*
* Read the directory info. If it's the first time, set things up.
*/
ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
bigobj = od[0].od_object;
packobj = od[1].od_object;
blocksize = od[0].od_blocksize;
chunksize = blocksize;
ASSERT(chunksize == od[1].od_gen);
VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
VERIFY(ISP2(doi.doi_data_block_size));
VERIFY(chunksize == doi.doi_data_block_size);
VERIFY(chunksize >= 2 * sizeof (bufwad_t));
/*
* Pick a random index and compute the offsets into packobj and bigobj.
*/
n = ztest_random(regions) * stride + ztest_random(width);
s = 1 + ztest_random(width - 1);
packoff = n * sizeof (bufwad_t);
packsize = s * sizeof (bufwad_t);
bigoff = n * chunksize;
bigsize = s * chunksize;
packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
/*
* Iteration 0 test zcopy for DB_UNCACHED dbufs.
* Iteration 1 test zcopy to already referenced dbufs.
* Iteration 2 test zcopy to dirty dbuf in the same txg.
* Iteration 3 test zcopy to dbuf dirty in previous txg.
* Iteration 4 test zcopy when dbuf is no longer dirty.
* Iteration 5 test zcopy when it can't be done.
* Iteration 6 one more zcopy write.
*/
for (i = 0; i < 7; i++) {
uint64_t j;
uint64_t off;
/*
* In iteration 5 (i == 5) use arcbufs
* that don't match bigobj blksz to test
* dmu_assign_arcbuf() when it can't directly
* assign an arcbuf to a dbuf.
*/
for (j = 0; j < s; j++) {
if (i != 5) {
bigbuf_arcbufs[j] =
dmu_request_arcbuf(bonus_db, chunksize);
} else {
bigbuf_arcbufs[2 * j] =
dmu_request_arcbuf(bonus_db, chunksize / 2);
bigbuf_arcbufs[2 * j + 1] =
dmu_request_arcbuf(bonus_db, chunksize / 2);
}
}
/*
* Get a tx for the mods to both packobj and bigobj.
*/
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, packobj, packoff, packsize);
dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
if (txg == 0) {
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
if (i != 5) {
dmu_return_arcbuf(bigbuf_arcbufs[j]);
} else {
dmu_return_arcbuf(
bigbuf_arcbufs[2 * j]);
dmu_return_arcbuf(
bigbuf_arcbufs[2 * j + 1]);
}
}
umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
dmu_buf_rele(bonus_db, FTAG);
return;
}
/*
* 50% of the time don't read objects in the 1st iteration to
* test dmu_assign_arcbuf() for the case when there're no
* existing dbufs for the specified offsets.
*/
if (i != 0 || ztest_random(2) != 0) {
error = dmu_read(os, packobj, packoff,
packsize, packbuf, DMU_READ_PREFETCH);
ASSERT0(error);
error = dmu_read(os, bigobj, bigoff, bigsize,
bigbuf, DMU_READ_PREFETCH);
ASSERT0(error);
}
compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
n, chunksize, txg);
/*
* We've verified all the old bufwads, and made new ones.
* Now write them out.
*/
dmu_write(os, packobj, packoff, packsize, packbuf, tx);
if (ztest_opts.zo_verbose >= 7) {
(void) printf("writing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
dmu_buf_t *dbt;
if (i != 5) {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[j]->b_data, chunksize);
} else {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[2 * j]->b_data,
chunksize / 2);
bcopy((caddr_t)bigbuf + (off - bigoff) +
chunksize / 2,
bigbuf_arcbufs[2 * j + 1]->b_data,
chunksize / 2);
}
if (i == 1) {
VERIFY(dmu_buf_hold(os, bigobj, off,
FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
}
if (i != 5) {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[j], tx);
} else {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[2 * j], tx);
dmu_assign_arcbuf(bonus_db,
off + chunksize / 2,
bigbuf_arcbufs[2 * j + 1], tx);
}
if (i == 1) {
dmu_buf_rele(dbt, FTAG);
}
}
dmu_tx_commit(tx);
/*
* Sanity check the stuff we just wrote.
*/
{
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
VERIFY(0 == dmu_read(os, packobj, packoff,
packsize, packcheck, DMU_READ_PREFETCH));
VERIFY(0 == dmu_read(os, bigobj, bigoff,
bigsize, bigcheck, DMU_READ_PREFETCH));
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
umem_free(packcheck, packsize);
umem_free(bigcheck, bigsize);
}
if (i == 2) {
txg_wait_open(dmu_objset_pool(os), 0);
} else if (i == 3) {
txg_wait_synced(dmu_objset_pool(os), 0);
}
}
dmu_buf_rele(bonus_db, FTAG);
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
}
/* ARGSUSED */
void
ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
{
ztest_od_t od[1];
uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
(ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
/*
* Have multiple threads write to large offsets in an object
* to verify that parallel writes to an object -- even to the
* same blocks within the object -- doesn't cause any trouble.
*/
ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
while (ztest_random(10) != 0)
ztest_io(zd, od[0].od_object, offset);
}
void
ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
{
ztest_od_t od[1];
uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
(ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
uint64_t count = ztest_random(20) + 1;
uint64_t blocksize = ztest_random_blocksize();
void *data;
ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
return;
if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0)
return;
ztest_prealloc(zd, od[0].od_object, offset, count * blocksize);
data = umem_zalloc(blocksize, UMEM_NOFAIL);
while (ztest_random(count) != 0) {
uint64_t randoff = offset + (ztest_random(count) * blocksize);
if (ztest_write(zd, od[0].od_object, randoff, blocksize,
data) != 0)
break;
while (ztest_random(4) != 0)
ztest_io(zd, od[0].od_object, randoff);
}
umem_free(data, blocksize);
}
/*
* Verify that zap_{create,destroy,add,remove,update} work as expected.
*/
#define ZTEST_ZAP_MIN_INTS 1
#define ZTEST_ZAP_MAX_INTS 4
#define ZTEST_ZAP_MAX_PROPS 1000
void
ztest_zap(ztest_ds_t *zd, uint64_t id)
{
objset_t *os = zd->zd_os;
ztest_od_t od[1];
uint64_t object;
uint64_t txg, last_txg;
uint64_t value[ZTEST_ZAP_MAX_INTS];
uint64_t zl_ints, zl_intsize, prop;
int i, ints;
dmu_tx_t *tx;
char propname[100], txgname[100];
int error;
char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
return;
object = od[0].od_object;
/*
* Generate a known hash collision, and verify that
* we can lookup and remove both entries.
*/
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
if (txg == 0)
return;
for (i = 0; i < 2; i++) {
value[i] = i;
VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
1, &value[i], tx));
}
for (i = 0; i < 2; i++) {
VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
sizeof (uint64_t), 1, &value[i], tx));
VERIFY3U(0, ==,
zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
ASSERT3U(zl_ints, ==, 1);
}
for (i = 0; i < 2; i++) {
VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
}
dmu_tx_commit(tx);
/*
* Generate a buch of random entries.
*/
ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
bzero(value, sizeof (value));
last_txg = 0;
/*
* If these zap entries already exist, validate their contents.
*/
error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
if (error == 0) {
ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
ASSERT3U(zl_ints, ==, 1);
VERIFY(zap_lookup(os, object, txgname, zl_intsize,
zl_ints, &last_txg) == 0);
VERIFY(zap_length(os, object, propname, &zl_intsize,
&zl_ints) == 0);
ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
ASSERT3U(zl_ints, ==, ints);
VERIFY(zap_lookup(os, object, propname, zl_intsize,
zl_ints, value) == 0);
for (i = 0; i < ints; i++) {
ASSERT3U(value[i], ==, last_txg + object + i);
}
} else {
ASSERT3U(error, ==, ENOENT);
}
/*
* Atomically update two entries in our zap object.
* The first is named txg_%llu, and contains the txg
* in which the property was last updated. The second
* is named prop_%llu, and the nth element of its value
* should be txg + object + n.
*/
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
if (txg == 0)
return;
if (last_txg > txg)
fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
for (i = 0; i < ints; i++)
value[i] = txg + object + i;
VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
1, &txg, tx));
VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
ints, value, tx));
dmu_tx_commit(tx);
/*
* Remove a random pair of entries.
*/
prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
if (error == ENOENT)
return;
ASSERT0(error);
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
if (txg == 0)
return;
VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
dmu_tx_commit(tx);
}
/*
* Testcase to test the upgrading of a microzap to fatzap.
*/
void
ztest_fzap(ztest_ds_t *zd, uint64_t id)
{
objset_t *os = zd->zd_os;
ztest_od_t od[1];
uint64_t object, txg;
ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
return;
object = od[0].od_object;
/*
* Add entries to this ZAP and make sure it spills over
* and gets upgraded to a fatzap. Also, since we are adding
* 2050 entries we should see ptrtbl growth and leaf-block split.
*/
for (int i = 0; i < 2050; i++) {
char name[ZFS_MAX_DATASET_NAME_LEN];
uint64_t value = i;
dmu_tx_t *tx;
int error;
(void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
id, value);
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, object, B_TRUE, name);
txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
if (txg == 0)
return;
error = zap_add(os, object, name, sizeof (uint64_t), 1,
&value, tx);
ASSERT(error == 0 || error == EEXIST);
dmu_tx_commit(tx);
}
}
/* ARGSUSED */
void
ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
{
objset_t *os = zd->zd_os;
ztest_od_t od[1];
uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
dmu_tx_t *tx;
int i, namelen, error;
int micro = ztest_random(2);
char name[20], string_value[20];
void *data;
ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
object = od[0].od_object;
/*
* Generate a random name of the form 'xxx.....' where each
* x is a random printable character and the dots are dots.
* There are 94 such characters, and the name length goes from
* 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
*/
namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
for (i = 0; i < 3; i++)
name[i] = '!' + ztest_random('~' - '!' + 1);
for (; i < namelen - 1; i++)
name[i] = '.';
name[i] = '\0';
if ((namelen & 1) || micro) {
wsize = sizeof (txg);
wc = 1;
data = &txg;
} else {
wsize = 1;
wc = namelen;
data = string_value;
}
count = -1ULL;
VERIFY0(zap_count(os, object, &count));
ASSERT(count != -1ULL);
/*
* Select an operation: length, lookup, add, update, remove.
*/
i = ztest_random(5);
if (i >= 2) {
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
if (txg == 0)
return;
bcopy(name, string_value, namelen);
} else {
tx = NULL;
txg = 0;
bzero(string_value, namelen);
}
switch (i) {
case 0:
error = zap_length(os, object, name, &zl_wsize, &zl_wc);
if (error == 0) {
ASSERT3U(wsize, ==, zl_wsize);
ASSERT3U(wc, ==, zl_wc);
} else {
ASSERT3U(error, ==, ENOENT);
}
break;
case 1:
error = zap_lookup(os, object, name, wsize, wc, data);
if (error == 0) {
if (data == string_value &&
bcmp(name, data, namelen) != 0)
fatal(0, "name '%s' != val '%s' len %d",
name, data, namelen);
} else {
ASSERT3U(error, ==, ENOENT);
}
break;
case 2:
error = zap_add(os, object, name, wsize, wc, data, tx);
ASSERT(error == 0 || error == EEXIST);
break;
case 3:
VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
break;
case 4:
error = zap_remove(os, object, name, tx);
ASSERT(error == 0 || error == ENOENT);
break;
}
if (tx != NULL)
dmu_tx_commit(tx);
}
/*
* Commit callback data.
*/
typedef struct ztest_cb_data {
list_node_t zcd_node;
uint64_t zcd_txg;
int zcd_expected_err;
boolean_t zcd_added;
boolean_t zcd_called;
spa_t *zcd_spa;
} ztest_cb_data_t;
/* This is the actual commit callback function */
static void
ztest_commit_callback(void *arg, int error)
{
ztest_cb_data_t *data = arg;
uint64_t synced_txg;
VERIFY(data != NULL);
VERIFY3S(data->zcd_expected_err, ==, error);
VERIFY(!data->zcd_called);
synced_txg = spa_last_synced_txg(data->zcd_spa);
if (data->zcd_txg > synced_txg)
fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
", last synced txg = %" PRIu64 "\n", data->zcd_txg,
synced_txg);
data->zcd_called = B_TRUE;
if (error == ECANCELED) {
ASSERT0(data->zcd_txg);
ASSERT(!data->zcd_added);
/*
* The private callback data should be destroyed here, but
* since we are going to check the zcd_called field after
* dmu_tx_abort(), we will destroy it there.
*/
return;
}
/* Was this callback added to the global callback list? */
if (!data->zcd_added)
goto out;
ASSERT3U(data->zcd_txg, !=, 0);
/* Remove our callback from the list */
(void) mutex_lock(&zcl.zcl_callbacks_lock);
list_remove(&zcl.zcl_callbacks, data);
(void) mutex_unlock(&zcl.zcl_callbacks_lock);
out:
umem_free(data, sizeof (ztest_cb_data_t));
}
/* Allocate and initialize callback data structure */
static ztest_cb_data_t *
ztest_create_cb_data(objset_t *os, uint64_t txg)
{
ztest_cb_data_t *cb_data;
cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
cb_data->zcd_txg = txg;
cb_data->zcd_spa = dmu_objset_spa(os);
return (cb_data);
}
/*
* If a number of txgs equal to this threshold have been created after a commit
* callback has been registered but not called, then we assume there is an
* implementation bug.
*/
#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2)
/*
* Commit callback test.
*/
void
ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
{
objset_t *os = zd->zd_os;
ztest_od_t od[1];
dmu_tx_t *tx;
ztest_cb_data_t *cb_data[3], *tmp_cb;
uint64_t old_txg, txg;
int i, error;
ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
tx = dmu_tx_create(os);
cb_data[0] = ztest_create_cb_data(os, 0);
dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
/* Every once in a while, abort the transaction on purpose */
if (ztest_random(100) == 0)
error = -1;
if (!error)
error = dmu_tx_assign(tx, TXG_NOWAIT);
txg = error ? 0 : dmu_tx_get_txg(tx);
cb_data[0]->zcd_txg = txg;
cb_data[1] = ztest_create_cb_data(os, txg);
dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
if (error) {
/*
* It's not a strict requirement to call the registered
* callbacks from inside dmu_tx_abort(), but that's what
* it's supposed to happen in the current implementation
* so we will check for that.
*/
for (i = 0; i < 2; i++) {
cb_data[i]->zcd_expected_err = ECANCELED;
VERIFY(!cb_data[i]->zcd_called);
}
dmu_tx_abort(tx);
for (i = 0; i < 2; i++) {
VERIFY(cb_data[i]->zcd_called);
umem_free(cb_data[i], sizeof (ztest_cb_data_t));
}
return;
}
cb_data[2] = ztest_create_cb_data(os, txg);
dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
/*
* Read existing data to make sure there isn't a future leak.
*/
VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
&old_txg, DMU_READ_PREFETCH));
if (old_txg > txg)
fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
old_txg, txg);
dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
(void) mutex_lock(&zcl.zcl_callbacks_lock);
/*
* Since commit callbacks don't have any ordering requirement and since
* it is theoretically possible for a commit callback to be called
* after an arbitrary amount of time has elapsed since its txg has been
* synced, it is difficult to reliably determine whether a commit
* callback hasn't been called due to high load or due to a flawed
* implementation.
*
* In practice, we will assume that if after a certain number of txgs a
* commit callback hasn't been called, then most likely there's an
* implementation bug..
*/
tmp_cb = list_head(&zcl.zcl_callbacks);
if (tmp_cb != NULL &&
(txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) {
fatal(0, "Commit callback threshold exceeded, oldest txg: %"
PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
}
/*
* Let's find the place to insert our callbacks.
*
* Even though the list is ordered by txg, it is possible for the
* insertion point to not be the end because our txg may already be
* quiescing at this point and other callbacks in the open txg
* (from other objsets) may have sneaked in.
*/
tmp_cb = list_tail(&zcl.zcl_callbacks);
while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
/* Add the 3 callbacks to the list */
for (i = 0; i < 3; i++) {
if (tmp_cb == NULL)
list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
else
list_insert_after(&zcl.zcl_callbacks, tmp_cb,
cb_data[i]);
cb_data[i]->zcd_added = B_TRUE;
VERIFY(!cb_data[i]->zcd_called);
tmp_cb = cb_data[i];
}
(void) mutex_unlock(&zcl.zcl_callbacks_lock);
dmu_tx_commit(tx);
}
/* ARGSUSED */
void
ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
{
zfs_prop_t proplist[] = {
ZFS_PROP_CHECKSUM,
ZFS_PROP_COMPRESSION,
ZFS_PROP_COPIES,
ZFS_PROP_DEDUP
};
(void) rw_rdlock(&ztest_name_lock);
for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
(void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
(void) rw_unlock(&ztest_name_lock);
}
/* ARGSUSED */
void
+ztest_remap_blocks(ztest_ds_t *zd, uint64_t id)
+{
+ (void) rw_rdlock(&ztest_name_lock);
+
+ int error = dmu_objset_remap_indirects(zd->zd_name);
+ if (error == ENOSPC)
+ error = 0;
+ ASSERT0(error);
+
+ (void) rw_unlock(&ztest_name_lock);
+}
+
+/* ARGSUSED */
+void
ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
{
nvlist_t *props = NULL;
(void) rw_rdlock(&ztest_name_lock);
(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
VERIFY0(spa_prop_get(ztest_spa, &props));
if (ztest_opts.zo_verbose >= 6)
dump_nvlist(props, 4);
nvlist_free(props);
(void) rw_unlock(&ztest_name_lock);
}
static int
user_release_one(const char *snapname, const char *holdname)
{
nvlist_t *snaps, *holds;
int error;
snaps = fnvlist_alloc();
holds = fnvlist_alloc();
fnvlist_add_boolean(holds, holdname);
fnvlist_add_nvlist(snaps, snapname, holds);
fnvlist_free(holds);
error = dsl_dataset_user_release(snaps, NULL);
fnvlist_free(snaps);
return (error);
}
/*
* Test snapshot hold/release and deferred destroy.
*/
void
ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
{
int error;
objset_t *os = zd->zd_os;
objset_t *origin;
char snapname[100];
char fullname[100];
char clonename[100];
char tag[100];
char osname[ZFS_MAX_DATASET_NAME_LEN];
nvlist_t *holds;
(void) rw_rdlock(&ztest_name_lock);
dmu_objset_name(os, osname);
(void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id);
(void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
(void) snprintf(clonename, sizeof (clonename),
"%s/ch1_%llu", osname, id);
(void) snprintf(tag, sizeof (tag), "tag_%llu", id);
/*
* Clean up from any previous run.
*/
error = dsl_destroy_head(clonename);
if (error != ENOENT)
ASSERT0(error);
error = user_release_one(fullname, tag);
if (error != ESRCH && error != ENOENT)
ASSERT0(error);
error = dsl_destroy_snapshot(fullname, B_FALSE);
if (error != ENOENT)
ASSERT0(error);
/*
* Create snapshot, clone it, mark snap for deferred destroy,
* destroy clone, verify snap was also destroyed.
*/
error = dmu_objset_snapshot_one(osname, snapname);
if (error) {
if (error == ENOSPC) {
ztest_record_enospc("dmu_objset_snapshot");
goto out;
}
fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
}
error = dmu_objset_clone(clonename, fullname);
if (error) {
if (error == ENOSPC) {
ztest_record_enospc("dmu_objset_clone");
goto out;
}
fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
}
error = dsl_destroy_snapshot(fullname, B_TRUE);
if (error) {
fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
fullname, error);
}
error = dsl_destroy_head(clonename);
if (error)
fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
error = dmu_objset_hold(fullname, FTAG, &origin);
if (error != ENOENT)
fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
/*
* Create snapshot, add temporary hold, verify that we can't
* destroy a held snapshot, mark for deferred destroy,
* release hold, verify snapshot was destroyed.
*/
error = dmu_objset_snapshot_one(osname, snapname);
if (error) {
if (error == ENOSPC) {
ztest_record_enospc("dmu_objset_snapshot");
goto out;
}
fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
}
holds = fnvlist_alloc();
fnvlist_add_string(holds, fullname, tag);
error = dsl_dataset_user_hold(holds, 0, NULL);
fnvlist_free(holds);
if (error == ENOSPC) {
ztest_record_enospc("dsl_dataset_user_hold");
goto out;
} else if (error) {
fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
fullname, tag, error);
}
error = dsl_destroy_snapshot(fullname, B_FALSE);
if (error != EBUSY) {
fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
fullname, error);
}
error = dsl_destroy_snapshot(fullname, B_TRUE);
if (error) {
fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
fullname, error);
}
error = user_release_one(fullname, tag);
if (error)
fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
out:
(void) rw_unlock(&ztest_name_lock);
}
/*
* Inject random faults into the on-disk data.
*/
/* ARGSUSED */
void
ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
{
ztest_shared_t *zs = ztest_shared;
spa_t *spa = ztest_spa;
int fd;
uint64_t offset;
uint64_t leaves;
uint64_t bad = 0x1990c0ffeedecadeULL;
uint64_t top, leaf;
char path0[MAXPATHLEN];
char pathrand[MAXPATHLEN];
size_t fsize;
int bshift = SPA_MAXBLOCKSHIFT + 2;
int iters = 1000;
int maxfaults;
int mirror_save;
vdev_t *vd0 = NULL;
uint64_t guid0 = 0;
boolean_t islog = B_FALSE;
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
maxfaults = MAXFAULTS();
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
mirror_save = zs->zs_mirrors;
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
ASSERT(leaves >= 1);
/*
* Grab the name lock as reader. There are some operations
* which don't like to have their vdevs changed while
* they are in progress (i.e. spa_change_guid). Those
* operations will have grabbed the name lock as writer.
*/
(void) rw_rdlock(&ztest_name_lock);
/*
* We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
*/
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
if (ztest_random(2) == 0) {
/*
* Inject errors on a normal data device or slog device.
*/
top = ztest_random_vdev_top(spa, B_TRUE);
leaf = ztest_random(leaves) + zs->zs_splits;
/*
* Generate paths to the first leaf in this top-level vdev,
* and to the random leaf we selected. We'll induce transient
* write failures and random online/offline activity on leaf 0,
* and we'll write random garbage to the randomly chosen leaf.
*/
(void) snprintf(path0, sizeof (path0), ztest_dev_template,
ztest_opts.zo_dir, ztest_opts.zo_pool,
top * leaves + zs->zs_splits);
(void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
ztest_opts.zo_dir, ztest_opts.zo_pool,
top * leaves + leaf);
vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
if (vd0 != NULL && vd0->vdev_top->vdev_islog)
islog = B_TRUE;
/*
* If the top-level vdev needs to be resilvered
* then we only allow faults on the device that is
* resilvering.
*/
if (vd0 != NULL && maxfaults != 1 &&
(!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
vd0->vdev_resilver_txg != 0)) {
/*
* Make vd0 explicitly claim to be unreadable,
* or unwriteable, or reach behind its back
* and close the underlying fd. We can do this if
* maxfaults == 0 because we'll fail and reexecute,
* and we can do it if maxfaults >= 2 because we'll
* have enough redundancy. If maxfaults == 1, the
* combination of this with injection of random data
* corruption below exceeds the pool's fault tolerance.
*/
vdev_file_t *vf = vd0->vdev_tsd;
+
+ zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d",
+ (long long)vd0->vdev_id, (int)maxfaults);
if (vf != NULL && ztest_random(3) == 0) {
(void) close(vf->vf_vnode->v_fd);
vf->vf_vnode->v_fd = -1;
} else if (ztest_random(2) == 0) {
vd0->vdev_cant_read = B_TRUE;
} else {
vd0->vdev_cant_write = B_TRUE;
}
guid0 = vd0->vdev_guid;
}
} else {
/*
* Inject errors on an l2cache device.
*/
spa_aux_vdev_t *sav = &spa->spa_l2cache;
if (sav->sav_count == 0) {
spa_config_exit(spa, SCL_STATE, FTAG);
(void) rw_unlock(&ztest_name_lock);
return;
}
vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
guid0 = vd0->vdev_guid;
(void) strcpy(path0, vd0->vdev_path);
(void) strcpy(pathrand, vd0->vdev_path);
leaf = 0;
leaves = 1;
maxfaults = INT_MAX; /* no limit on cache devices */
}
spa_config_exit(spa, SCL_STATE, FTAG);
(void) rw_unlock(&ztest_name_lock);
/*
* If we can tolerate two or more faults, or we're dealing
* with a slog, randomly online/offline vd0.
*/
if ((maxfaults >= 2 || islog) && guid0 != 0) {
if (ztest_random(10) < 6) {
int flags = (ztest_random(2) == 0 ?
ZFS_OFFLINE_TEMPORARY : 0);
/*
* We have to grab the zs_name_lock as writer to
* prevent a race between offlining a slog and
* destroying a dataset. Offlining the slog will
* grab a reference on the dataset which may cause
* dmu_objset_destroy() to fail with EBUSY thus
* leaving the dataset in an inconsistent state.
*/
if (islog)
(void) rw_wrlock(&ztest_name_lock);
VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
if (islog)
(void) rw_unlock(&ztest_name_lock);
} else {
/*
* Ideally we would like to be able to randomly
* call vdev_[on|off]line without holding locks
* to force unpredictable failures but the side
* effects of vdev_[on|off]line prevent us from
* doing so. We grab the ztest_vdev_lock here to
* prevent a race between injection testing and
* aux_vdev removal.
*/
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
(void) vdev_online(spa, guid0, 0, NULL);
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
}
}
if (maxfaults == 0)
return;
/*
* We have at least single-fault tolerance, so inject data corruption.
*/
fd = open(pathrand, O_RDWR);
if (fd == -1) /* we hit a gap in the device namespace */
return;
fsize = lseek(fd, 0, SEEK_END);
while (--iters != 0) {
/*
* The offset must be chosen carefully to ensure that
* we do not inject a given logical block with errors
* on two different leaf devices, because ZFS can not
* tolerate that (if maxfaults==1).
*
* We divide each leaf into chunks of size
* (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk
* there is a series of ranges to which we can inject errors.
* Each range can accept errors on only a single leaf vdev.
* The error injection ranges are separated by ranges
* which we will not inject errors on any device (DMZs).
* Each DMZ must be large enough such that a single block
* can not straddle it, so that a single block can not be
* a target in two different injection ranges (on different
* leaf vdevs).
*
* For example, with 3 leaves, each chunk looks like:
* 0 to 32M: injection range for leaf 0
* 32M to 64M: DMZ - no injection allowed
* 64M to 96M: injection range for leaf 1
* 96M to 128M: DMZ - no injection allowed
* 128M to 160M: injection range for leaf 2
* 160M to 192M: DMZ - no injection allowed
*/
offset = ztest_random(fsize / (leaves << bshift)) *
(leaves << bshift) + (leaf << bshift) +
(ztest_random(1ULL << (bshift - 1)) & -8ULL);
/*
* Only allow damage to the labels at one end of the vdev.
*
* If all labels are damaged, the device will be totally
* inaccessible, which will result in loss of data,
* because we also damage (parts of) the other side of
* the mirror/raidz.
*
* Additionally, we will always have both an even and an
* odd label, so that we can handle crashes in the
* middle of vdev_config_sync().
*/
if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
continue;
/*
* The two end labels are stored at the "end" of the disk, but
* the end of the disk (vdev_psize) is aligned to
* sizeof (vdev_label_t).
*/
uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
if ((leaf & 1) == 1 &&
offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
continue;
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
if (mirror_save != zs->zs_mirrors) {
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
(void) close(fd);
return;
}
if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
fatal(1, "can't inject bad word at 0x%llx in %s",
offset, pathrand);
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
if (ztest_opts.zo_verbose >= 7)
(void) printf("injected bad word into %s,"
" offset 0x%llx\n", pathrand, (u_longlong_t)offset);
}
(void) close(fd);
}
/*
* Verify that DDT repair works as expected.
*/
void
ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
{
ztest_shared_t *zs = ztest_shared;
spa_t *spa = ztest_spa;
objset_t *os = zd->zd_os;
ztest_od_t od[1];
uint64_t object, blocksize, txg, pattern, psize;
enum zio_checksum checksum = spa_dedup_checksum(spa);
dmu_buf_t *db;
dmu_tx_t *tx;
abd_t *abd;
blkptr_t blk;
int copies = 2 * ZIO_DEDUPDITTO_MIN;
blocksize = ztest_random_blocksize();
blocksize = MIN(blocksize, 2048); /* because we write so many */
ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
/*
* Take the name lock as writer to prevent anyone else from changing
* the pool and dataset properies we need to maintain during this test.
*/
(void) rw_wrlock(&ztest_name_lock);
if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
B_FALSE) != 0 ||
ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
B_FALSE) != 0) {
(void) rw_unlock(&ztest_name_lock);
return;
}
dmu_objset_stats_t dds;
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
dmu_objset_fast_stat(os, &dds);
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
object = od[0].od_object;
blocksize = od[0].od_blocksize;
pattern = zs->zs_guid ^ dds.dds_guid;
ASSERT(object != 0);
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, object, 0, copies * blocksize);
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0) {
(void) rw_unlock(&ztest_name_lock);
return;
}
/*
* Write all the copies of our block.
*/
for (int i = 0; i < copies; i++) {
uint64_t offset = i * blocksize;
int error = dmu_buf_hold(os, object, offset, FTAG, &db,
DMU_READ_NO_PREFETCH);
if (error != 0) {
fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u",
os, (long long)object, (long long) offset, error);
}
ASSERT(db->db_offset == offset);
ASSERT(db->db_size == blocksize);
ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
ztest_pattern_match(db->db_data, db->db_size, 0ULL));
dmu_buf_will_fill(db, tx);
ztest_pattern_set(db->db_data, db->db_size, pattern);
dmu_buf_rele(db, FTAG);
}
dmu_tx_commit(tx);
txg_wait_synced(spa_get_dsl(spa), txg);
/*
* Find out what block we got.
*/
VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
DMU_READ_NO_PREFETCH));
blk = *((dmu_buf_impl_t *)db)->db_blkptr;
dmu_buf_rele(db, FTAG);
/*
* Damage the block. Dedup-ditto will save us when we read it later.
*/
psize = BP_GET_PSIZE(&blk);
abd = abd_alloc_linear(psize, B_TRUE);
ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
abd_free(abd);
(void) rw_unlock(&ztest_name_lock);
}
/*
* Scrub the pool.
*/
/* ARGSUSED */
void
ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;
(void) spa_scan(spa, POOL_SCAN_SCRUB);
(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
(void) spa_scan(spa, POOL_SCAN_SCRUB);
}
/*
* Change the guid for the pool.
*/
/* ARGSUSED */
void
ztest_reguid(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;
uint64_t orig, load;
int error;
orig = spa_guid(spa);
load = spa_load_guid(spa);
(void) rw_wrlock(&ztest_name_lock);
error = spa_change_guid(spa);
(void) rw_unlock(&ztest_name_lock);
if (error != 0)
return;
if (ztest_opts.zo_verbose >= 4) {
(void) printf("Changed guid old %llu -> %llu\n",
(u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
}
VERIFY3U(orig, !=, spa_guid(spa));
VERIFY3U(load, ==, spa_load_guid(spa));
}
/*
* Rename the pool to a different name and then rename it back.
*/
/* ARGSUSED */
void
ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
{
char *oldname, *newname;
spa_t *spa;
(void) rw_wrlock(&ztest_name_lock);
oldname = ztest_opts.zo_pool;
newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
(void) strcpy(newname, oldname);
(void) strcat(newname, "_tmp");
/*
* Do the rename
*/
VERIFY3U(0, ==, spa_rename(oldname, newname));
/*
* Try to open it under the old name, which shouldn't exist
*/
VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
/*
* Open it under the new name and make sure it's still the same spa_t.
*/
VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
ASSERT(spa == ztest_spa);
spa_close(spa, FTAG);
/*
* Rename it back to the original
*/
VERIFY3U(0, ==, spa_rename(newname, oldname));
/*
* Make sure it can still be opened
*/
VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
ASSERT(spa == ztest_spa);
spa_close(spa, FTAG);
umem_free(newname, strlen(newname) + 1);
(void) rw_unlock(&ztest_name_lock);
}
/*
* Verify pool integrity by running zdb.
*/
static void
ztest_run_zdb(char *pool)
{
int status;
char zdb[MAXPATHLEN + MAXNAMELEN + 20];
char zbuf[1024];
char *bin;
char *ztest;
char *isa;
int isalen;
FILE *fp;
strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb));
/* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
bin = strstr(zdb, "/usr/bin/");
ztest = strstr(bin, "/ztest");
isa = bin + 8;
isalen = ztest - isa;
isa = strdup(isa);
/* LINTED */
(void) sprintf(bin,
"/usr/sbin%.*s/zdb -bcc%s%s -G -d -U %s %s",
isalen,
isa,
ztest_opts.zo_verbose >= 3 ? "s" : "",
ztest_opts.zo_verbose >= 4 ? "v" : "",
spa_config_path,
pool);
free(isa);
if (ztest_opts.zo_verbose >= 5)
(void) printf("Executing %s\n", strstr(zdb, "zdb "));
fp = popen(zdb, "r");
assert(fp != NULL);
while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
if (ztest_opts.zo_verbose >= 3)
(void) printf("%s", zbuf);
status = pclose(fp);
if (status == 0)
return;
ztest_dump_core = 0;
if (WIFEXITED(status))
fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
else
fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
}
static void
ztest_walk_pool_directory(char *header)
{
spa_t *spa = NULL;
if (ztest_opts.zo_verbose >= 6)
(void) printf("%s\n", header);
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL)
if (ztest_opts.zo_verbose >= 6)
(void) printf("\t%s\n", spa_name(spa));
mutex_exit(&spa_namespace_lock);
}
static void
ztest_spa_import_export(char *oldname, char *newname)
{
nvlist_t *config, *newconfig;
uint64_t pool_guid;
spa_t *spa;
int error;
if (ztest_opts.zo_verbose >= 4) {
(void) printf("import/export: old = %s, new = %s\n",
oldname, newname);
}
/*
* Clean up from previous runs.
*/
(void) spa_destroy(newname);
/*
* Get the pool's configuration and guid.
*/
VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
/*
* Kick off a scrub to tickle scrub/export races.
*/
if (ztest_random(2) == 0)
(void) spa_scan(spa, POOL_SCAN_SCRUB);
pool_guid = spa_guid(spa);
spa_close(spa, FTAG);
ztest_walk_pool_directory("pools before export");
/*
* Export it.
*/
VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
ztest_walk_pool_directory("pools after export");
/*
* Try to import it.
*/
newconfig = spa_tryimport(config);
ASSERT(newconfig != NULL);
nvlist_free(newconfig);
/*
* Import it under the new name.
*/
error = spa_import(newname, config, NULL, 0);
if (error != 0) {
dump_nvlist(config, 0);
fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
oldname, newname, error);
}
ztest_walk_pool_directory("pools after import");
/*
* Try to import it again -- should fail with EEXIST.
*/
VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
/*
* Try to import it under a different name -- should fail with EEXIST.
*/
VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
/*
* Verify that the pool is no longer visible under the old name.
*/
VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
/*
* Verify that we can open and close the pool using the new name.
*/
VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
ASSERT(pool_guid == spa_guid(spa));
spa_close(spa, FTAG);
nvlist_free(config);
}
static void
ztest_resume(spa_t *spa)
{
if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6)
(void) printf("resuming from suspended state\n");
spa_vdev_state_enter(spa, SCL_NONE);
vdev_clear(spa, NULL);
(void) spa_vdev_state_exit(spa, NULL, 0);
(void) zio_resume(spa);
}
static void *
ztest_resume_thread(void *arg)
{
spa_t *spa = arg;
while (!ztest_exiting) {
if (spa_suspended(spa))
ztest_resume(spa);
(void) poll(NULL, 0, 100);
/*
* Periodically change the zfs_compressed_arc_enabled setting.
*/
if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2);
/*
* Periodically change the zfs_abd_scatter_enabled setting.
*/
if (ztest_random(10) == 0)
zfs_abd_scatter_enabled = ztest_random(2);
}
return (NULL);
}
static void *
ztest_deadman_thread(void *arg)
{
ztest_shared_t *zs = arg;
spa_t *spa = ztest_spa;
hrtime_t delta, total = 0;
for (;;) {
delta = zs->zs_thread_stop - zs->zs_thread_start +
MSEC2NSEC(zfs_deadman_synctime_ms);
(void) poll(NULL, 0, (int)NSEC2MSEC(delta));
/*
* If the pool is suspended then fail immediately. Otherwise,
* check to see if the pool is making any progress. If
* vdev_deadman() discovers that there hasn't been any recent
* I/Os then it will end up aborting the tests.
*/
if (spa_suspended(spa) || spa->spa_root_vdev == NULL) {
fatal(0, "aborting test after %llu seconds because "
"pool has transitioned to a suspended state.",
zfs_deadman_synctime_ms / 1000);
return (NULL);
}
vdev_deadman(spa->spa_root_vdev);
total += zfs_deadman_synctime_ms/1000;
(void) printf("ztest has been running for %lld seconds\n",
total);
}
}
static void
ztest_execute(int test, ztest_info_t *zi, uint64_t id)
{
ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
hrtime_t functime = gethrtime();
for (int i = 0; i < zi->zi_iters; i++)
zi->zi_func(zd, id);
functime = gethrtime() - functime;
atomic_add_64(&zc->zc_count, 1);
atomic_add_64(&zc->zc_time, functime);
if (ztest_opts.zo_verbose >= 4) {
Dl_info dli;
(void) dladdr((void *)zi->zi_func, &dli);
(void) printf("%6.2f sec in %s\n",
(double)functime / NANOSEC, dli.dli_sname);
}
}
static void *
ztest_thread(void *arg)
{
int rand;
uint64_t id = (uintptr_t)arg;
ztest_shared_t *zs = ztest_shared;
uint64_t call_next;
hrtime_t now;
ztest_info_t *zi;
ztest_shared_callstate_t *zc;
while ((now = gethrtime()) < zs->zs_thread_stop) {
/*
* See if it's time to force a crash.
*/
if (now > zs->zs_thread_kill)
ztest_kill(zs);
/*
* If we're getting ENOSPC with some regularity, stop.
*/
if (zs->zs_enospc_count > 10)
break;
/*
* Pick a random function to execute.
*/
rand = ztest_random(ZTEST_FUNCS);
zi = &ztest_info[rand];
zc = ZTEST_GET_SHARED_CALLSTATE(rand);
call_next = zc->zc_next;
if (now >= call_next &&
atomic_cas_64(&zc->zc_next, call_next, call_next +
ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) {
ztest_execute(rand, zi, id);
}
}
return (NULL);
}
static void
ztest_dataset_name(char *dsname, char *pool, int d)
{
(void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d);
}
static void
ztest_dataset_destroy(int d)
{
char name[ZFS_MAX_DATASET_NAME_LEN];
ztest_dataset_name(name, ztest_opts.zo_pool, d);
if (ztest_opts.zo_verbose >= 3)
(void) printf("Destroying %s to free up space\n", name);
/*
* Cleanup any non-standard clones and snapshots. In general,
* ztest thread t operates on dataset (t % zopt_datasets),
* so there may be more than one thing to clean up.
*/
for (int t = d; t < ztest_opts.zo_threads;
t += ztest_opts.zo_datasets) {
ztest_dsl_dataset_cleanup(name, t);
}
(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
}
static void
ztest_dataset_dirobj_verify(ztest_ds_t *zd)
{
uint64_t usedobjs, dirobjs, scratch;
/*
* ZTEST_DIROBJ is the object directory for the entire dataset.
* Therefore, the number of objects in use should equal the
* number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
* If not, we have an object leak.
*
* Note that we can only check this in ztest_dataset_open(),
* when the open-context and syncing-context values agree.
* That's because zap_count() returns the open-context value,
* while dmu_objset_space() returns the rootbp fill count.
*/
VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
ASSERT3U(dirobjs + 1, ==, usedobjs);
}
static int
ztest_dataset_open(int d)
{
ztest_ds_t *zd = &ztest_ds[d];
uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
objset_t *os;
zilog_t *zilog;
char name[ZFS_MAX_DATASET_NAME_LEN];
int error;
ztest_dataset_name(name, ztest_opts.zo_pool, d);
(void) rw_rdlock(&ztest_name_lock);
error = ztest_dataset_create(name);
if (error == ENOSPC) {
(void) rw_unlock(&ztest_name_lock);
ztest_record_enospc(FTAG);
return (error);
}
ASSERT(error == 0 || error == EEXIST);
VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
(void) rw_unlock(&ztest_name_lock);
ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
zilog = zd->zd_zilog;
if (zilog->zl_header->zh_claim_lr_seq != 0 &&
zilog->zl_header->zh_claim_lr_seq < committed_seq)
fatal(0, "missing log records: claimed %llu < committed %llu",
zilog->zl_header->zh_claim_lr_seq, committed_seq);
ztest_dataset_dirobj_verify(zd);
zil_replay(os, zd, ztest_replay_vector);
ztest_dataset_dirobj_verify(zd);
if (ztest_opts.zo_verbose >= 6)
(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
zd->zd_name,
(u_longlong_t)zilog->zl_parse_blk_count,
(u_longlong_t)zilog->zl_parse_lr_count,
(u_longlong_t)zilog->zl_replaying_seq);
zilog = zil_open(os, ztest_get_data);
if (zilog->zl_replaying_seq != 0 &&
zilog->zl_replaying_seq < committed_seq)
fatal(0, "missing log records: replayed %llu < committed %llu",
zilog->zl_replaying_seq, committed_seq);
return (0);
}
static void
ztest_dataset_close(int d)
{
ztest_ds_t *zd = &ztest_ds[d];
zil_close(zd->zd_zilog);
dmu_objset_disown(zd->zd_os, zd);
ztest_zd_fini(zd);
}
/*
* Kick off threads to run tests on all datasets in parallel.
*/
static void
ztest_run(ztest_shared_t *zs)
{
thread_t *tid;
spa_t *spa;
objset_t *os;
thread_t resume_tid;
int error;
ztest_exiting = B_FALSE;
/*
* Initialize parent/child shared state.
*/
VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
zs->zs_thread_start = gethrtime();
zs->zs_thread_stop =
zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
zs->zs_thread_kill = zs->zs_thread_stop;
if (ztest_random(100) < ztest_opts.zo_killrate) {
zs->zs_thread_kill -=
ztest_random(ztest_opts.zo_passtime * NANOSEC);
}
(void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);
list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
offsetof(ztest_cb_data_t, zcd_node));
/*
* Open our pool.
*/
kernel_init(FREAD | FWRITE);
VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
spa->spa_debug = B_TRUE;
metaslab_preload_limit = ztest_random(20) + 1;
ztest_spa = spa;
dmu_objset_stats_t dds;
VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
DMU_OST_ANY, B_TRUE, FTAG, &os));
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
dmu_objset_fast_stat(os, &dds);
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
zs->zs_guid = dds.dds_guid;
dmu_objset_disown(os, FTAG);
spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
/*
* We don't expect the pool to suspend unless maxfaults == 0,
* in which case ztest_fault_inject() temporarily takes away
* the only valid replica.
*/
if (MAXFAULTS() == 0)
spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
else
spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
/*
* Create a thread to periodically resume suspended I/O.
*/
VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
&resume_tid) == 0);
/*
* Create a deadman thread to abort() if we hang.
*/
VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
NULL) == 0);
/*
* Verify that we can safely inquire about about any object,
* whether it's allocated or not. To make it interesting,
* we probe a 5-wide window around each power of two.
* This hits all edge cases, including zero and the max.
*/
for (int t = 0; t < 64; t++) {
for (int d = -5; d <= 5; d++) {
error = dmu_object_info(spa->spa_meta_objset,
(1ULL << t) + d, NULL);
ASSERT(error == 0 || error == ENOENT ||
error == EINVAL);
}
}
/*
* If we got any ENOSPC errors on the previous run, destroy something.
*/
if (zs->zs_enospc_count != 0) {
int d = ztest_random(ztest_opts.zo_datasets);
ztest_dataset_destroy(d);
}
zs->zs_enospc_count = 0;
tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t),
UMEM_NOFAIL);
if (ztest_opts.zo_verbose >= 4)
(void) printf("starting main threads...\n");
/*
* Kick off all the tests that run in parallel.
*/
for (int t = 0; t < ztest_opts.zo_threads; t++) {
if (t < ztest_opts.zo_datasets &&
ztest_dataset_open(t) != 0)
return;
VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
THR_BOUND, &tid[t]) == 0);
}
/*
* Wait for all of the tests to complete. We go in reverse order
* so we don't close datasets while threads are still using them.
*/
for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
VERIFY(thr_join(tid[t], NULL, NULL) == 0);
if (t < ztest_opts.zo_datasets)
ztest_dataset_close(t);
}
txg_wait_synced(spa_get_dsl(spa), 0);
zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
zfs_dbgmsg_print(FTAG);
umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));
/* Kill the resume thread */
ztest_exiting = B_TRUE;
VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
ztest_resume(spa);
/*
* Right before closing the pool, kick off a bunch of async I/O;
* spa_close() should wait for it to complete.
*/
for (uint64_t object = 1; object < 50; object++) {
dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
ZIO_PRIORITY_SYNC_READ);
}
spa_close(spa, FTAG);
/*
* Verify that we can loop over all pools.
*/
mutex_enter(&spa_namespace_lock);
for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
if (ztest_opts.zo_verbose > 3)
(void) printf("spa_next: found %s\n", spa_name(spa));
mutex_exit(&spa_namespace_lock);
/*
* Verify that we can export the pool and reimport it under a
* different name.
*/
if (ztest_random(2) == 0) {
char name[ZFS_MAX_DATASET_NAME_LEN];
(void) snprintf(name, sizeof (name), "%s_import",
ztest_opts.zo_pool);
ztest_spa_import_export(ztest_opts.zo_pool, name);
ztest_spa_import_export(name, ztest_opts.zo_pool);
}
kernel_fini();
list_destroy(&zcl.zcl_callbacks);
(void) _mutex_destroy(&zcl.zcl_callbacks_lock);
(void) rwlock_destroy(&ztest_name_lock);
(void) _mutex_destroy(&ztest_vdev_lock);
}
static void
ztest_freeze(void)
{
ztest_ds_t *zd = &ztest_ds[0];
spa_t *spa;
int numloops = 0;
if (ztest_opts.zo_verbose >= 3)
(void) printf("testing spa_freeze()...\n");
kernel_init(FREAD | FWRITE);
VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
VERIFY3U(0, ==, ztest_dataset_open(0));
spa->spa_debug = B_TRUE;
ztest_spa = spa;
/*
* Force the first log block to be transactionally allocated.
* We have to do this before we freeze the pool -- otherwise
* the log chain won't be anchored.
*/
while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
ztest_dmu_object_alloc_free(zd, 0);
zil_commit(zd->zd_zilog, 0);
}
txg_wait_synced(spa_get_dsl(spa), 0);
/*
* Freeze the pool. This stops spa_sync() from doing anything,
* so that the only way to record changes from now on is the ZIL.
*/
spa_freeze(spa);
/*
* Because it is hard to predict how much space a write will actually
* require beforehand, we leave ourselves some fudge space to write over
* capacity.
*/
uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
/*
* Run tests that generate log records but don't alter the pool config
* or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
* We do a txg_wait_synced() after each iteration to force the txg
* to increase well beyond the last synced value in the uberblock.
* The ZIL should be OK with that.
*
* Run a random number of times less than zo_maxloops and ensure we do
* not run out of space on the pool.
*/
while (ztest_random(10) != 0 &&
numloops++ < ztest_opts.zo_maxloops &&
metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
ztest_od_t od;
ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
ztest_io(zd, od.od_object,
ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
txg_wait_synced(spa_get_dsl(spa), 0);
}
/*
* Commit all of the changes we just generated.
*/
zil_commit(zd->zd_zilog, 0);
txg_wait_synced(spa_get_dsl(spa), 0);
/*
* Close our dataset and close the pool.
*/
ztest_dataset_close(0);
spa_close(spa, FTAG);
kernel_fini();
/*
* Open and close the pool and dataset to induce log replay.
*/
kernel_init(FREAD | FWRITE);
VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
VERIFY3U(0, ==, ztest_dataset_open(0));
ztest_dataset_close(0);
spa->spa_debug = B_TRUE;
ztest_spa = spa;
txg_wait_synced(spa_get_dsl(spa), 0);
ztest_reguid(NULL, 0);
spa_close(spa, FTAG);
kernel_fini();
}
void
print_time(hrtime_t t, char *timebuf)
{
hrtime_t s = t / NANOSEC;
hrtime_t m = s / 60;
hrtime_t h = m / 60;
hrtime_t d = h / 24;
s -= m * 60;
m -= h * 60;
h -= d * 24;
timebuf[0] = '\0';
if (d)
(void) sprintf(timebuf,
"%llud%02lluh%02llum%02llus", d, h, m, s);
else if (h)
(void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
else if (m)
(void) sprintf(timebuf, "%llum%02llus", m, s);
else
(void) sprintf(timebuf, "%llus", s);
}
static nvlist_t *
make_random_props()
{
nvlist_t *props;
VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
if (ztest_random(2) == 0)
return (props);
VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
return (props);
}
/*
* Create a storage pool with the given name and initial vdev size.
* Then test spa_freeze() functionality.
*/
static void
ztest_init(ztest_shared_t *zs)
{
spa_t *spa;
nvlist_t *nvroot, *props;
VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
kernel_init(FREAD | FWRITE);
/*
* Create the storage pool.
*/
(void) spa_destroy(ztest_opts.zo_pool);
ztest_shared->zs_vdev_next_leaf = 0;
zs->zs_splits = 0;
zs->zs_mirrors = ztest_opts.zo_mirrors;
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
props = make_random_props();
for (int i = 0; i < SPA_FEATURES; i++) {
char buf[1024];
(void) snprintf(buf, sizeof (buf), "feature@%s",
spa_feature_table[i].fi_uname);
VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
}
VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL));
nvlist_free(nvroot);
nvlist_free(props);
VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
zs->zs_metaslab_sz =
1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
spa_close(spa, FTAG);
kernel_fini();
ztest_run_zdb(ztest_opts.zo_pool);
ztest_freeze();
ztest_run_zdb(ztest_opts.zo_pool);
(void) rwlock_destroy(&ztest_name_lock);
(void) _mutex_destroy(&ztest_vdev_lock);
}
static void
setup_data_fd(void)
{
static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
ztest_fd_data = mkstemp(ztest_name_data);
ASSERT3S(ztest_fd_data, >=, 0);
(void) unlink(ztest_name_data);
}
static int
shared_data_size(ztest_shared_hdr_t *hdr)
{
int size;
size = hdr->zh_hdr_size;
size += hdr->zh_opts_size;
size += hdr->zh_size;
size += hdr->zh_stats_size * hdr->zh_stats_count;
size += hdr->zh_ds_size * hdr->zh_ds_count;
return (size);
}
static void
setup_hdr(void)
{
int size;
ztest_shared_hdr_t *hdr;
hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
ASSERT(hdr != MAP_FAILED);
VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
hdr->zh_size = sizeof (ztest_shared_t);
hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
hdr->zh_stats_count = ZTEST_FUNCS;
hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
hdr->zh_ds_count = ztest_opts.zo_datasets;
size = shared_data_size(hdr);
VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
}
static void
setup_data(void)
{
int size, offset;
ztest_shared_hdr_t *hdr;
uint8_t *buf;
hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
PROT_READ, MAP_SHARED, ztest_fd_data, 0);
ASSERT(hdr != MAP_FAILED);
size = shared_data_size(hdr);
(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
ASSERT(hdr != MAP_FAILED);
buf = (uint8_t *)hdr;
offset = hdr->zh_hdr_size;
ztest_shared_opts = (void *)&buf[offset];
offset += hdr->zh_opts_size;
ztest_shared = (void *)&buf[offset];
offset += hdr->zh_size;
ztest_shared_callstate = (void *)&buf[offset];
offset += hdr->zh_stats_size * hdr->zh_stats_count;
ztest_shared_ds = (void *)&buf[offset];
}
static boolean_t
exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
{
pid_t pid;
int status;
char *cmdbuf = NULL;
pid = fork();
if (cmd == NULL) {
cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
cmd = cmdbuf;
}
if (pid == -1)
fatal(1, "fork failed");
if (pid == 0) { /* child */
char *emptyargv[2] = { cmd, NULL };
char fd_data_str[12];
struct rlimit rl = { 1024, 1024 };
(void) setrlimit(RLIMIT_NOFILE, &rl);
(void) close(ztest_fd_rand);
VERIFY3U(11, >=,
snprintf(fd_data_str, 12, "%d", ztest_fd_data));
VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));
(void) enable_extended_FILE_stdio(-1, -1);
if (libpath != NULL)
VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
#ifdef illumos
(void) execv(cmd, emptyargv);
#else
(void) execvp(cmd, emptyargv);
#endif
ztest_dump_core = B_FALSE;
fatal(B_TRUE, "exec failed: %s", cmd);
}
if (cmdbuf != NULL) {
umem_free(cmdbuf, MAXPATHLEN);
cmd = NULL;
}
while (waitpid(pid, &status, 0) != pid)
continue;
if (statusp != NULL)
*statusp = status;
if (WIFEXITED(status)) {
if (WEXITSTATUS(status) != 0) {
(void) fprintf(stderr, "child exited with code %d\n",
WEXITSTATUS(status));
exit(2);
}
return (B_FALSE);
} else if (WIFSIGNALED(status)) {
if (!ignorekill || WTERMSIG(status) != SIGKILL) {
(void) fprintf(stderr, "child died with signal %d\n",
WTERMSIG(status));
exit(3);
}
return (B_TRUE);
} else {
(void) fprintf(stderr, "something strange happened to child\n");
exit(4);
/* NOTREACHED */
}
}
static void
ztest_run_init(void)
{
ztest_shared_t *zs = ztest_shared;
ASSERT(ztest_opts.zo_init != 0);
/*
* Blow away any existing copy of zpool.cache
*/
(void) remove(spa_config_path);
/*
* Create and initialize our storage pool.
*/
for (int i = 1; i <= ztest_opts.zo_init; i++) {
bzero(zs, sizeof (ztest_shared_t));
if (ztest_opts.zo_verbose >= 3 &&
ztest_opts.zo_init != 1) {
(void) printf("ztest_init(), pass %d\n", i);
}
ztest_init(zs);
}
}
int
main(int argc, char **argv)
{
int kills = 0;
int iters = 0;
int older = 0;
int newer = 0;
ztest_shared_t *zs;
ztest_info_t *zi;
ztest_shared_callstate_t *zc;
char timebuf[100];
char numbuf[NN_NUMBUF_SZ];
spa_t *spa;
char *cmd;
boolean_t hasalt;
char *fd_data_str = getenv("ZTEST_FD_DATA");
(void) setvbuf(stdout, NULL, _IOLBF, 0);
dprintf_setup(&argc, argv);
zfs_deadman_synctime_ms = 300000;
ztest_fd_rand = open("/dev/urandom", O_RDONLY);
ASSERT3S(ztest_fd_rand, >=, 0);
if (!fd_data_str) {
process_options(argc, argv);
setup_data_fd();
setup_hdr();
setup_data();
bcopy(&ztest_opts, ztest_shared_opts,
sizeof (*ztest_shared_opts));
} else {
ztest_fd_data = atoi(fd_data_str);
setup_data();
bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
}
ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
/* Override location of zpool.cache */
VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
ztest_opts.zo_dir), !=, -1);
ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
UMEM_NOFAIL);
zs = ztest_shared;
if (fd_data_str) {
metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
metaslab_df_alloc_threshold =
zs->zs_metaslab_df_alloc_threshold;
if (zs->zs_do_init)
ztest_run_init();
else
ztest_run(zs);
exit(0);
}
hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);
if (ztest_opts.zo_verbose >= 1) {
(void) printf("%llu vdevs, %d datasets, %d threads,"
" %llu seconds...\n",
(u_longlong_t)ztest_opts.zo_vdevs,
ztest_opts.zo_datasets,
ztest_opts.zo_threads,
(u_longlong_t)ztest_opts.zo_time);
}
cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
(void) strlcpy(cmd, getexecname(), MAXNAMELEN);
zs->zs_do_init = B_TRUE;
if (strlen(ztest_opts.zo_alt_ztest) != 0) {
if (ztest_opts.zo_verbose >= 1) {
(void) printf("Executing older ztest for "
"initialization: %s\n", ztest_opts.zo_alt_ztest);
}
VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
ztest_opts.zo_alt_libpath, B_FALSE, NULL));
} else {
VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
}
zs->zs_do_init = B_FALSE;
zs->zs_proc_start = gethrtime();
zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;
for (int f = 0; f < ZTEST_FUNCS; f++) {
zi = &ztest_info[f];
zc = ZTEST_GET_SHARED_CALLSTATE(f);
if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
zc->zc_next = UINT64_MAX;
else
zc->zc_next = zs->zs_proc_start +
ztest_random(2 * zi->zi_interval[0] + 1);
}
/*
* Run the tests in a loop. These tests include fault injection
* to verify that self-healing data works, and forced crashes
* to verify that we never lose on-disk consistency.
*/
while (gethrtime() < zs->zs_proc_stop) {
int status;
boolean_t killed;
/*
* Initialize the workload counters for each function.
*/
for (int f = 0; f < ZTEST_FUNCS; f++) {
zc = ZTEST_GET_SHARED_CALLSTATE(f);
zc->zc_count = 0;
zc->zc_time = 0;
}
/* Set the allocation switch size */
zs->zs_metaslab_df_alloc_threshold =
ztest_random(zs->zs_metaslab_sz / 4) + 1;
if (!hasalt || ztest_random(2) == 0) {
if (hasalt && ztest_opts.zo_verbose >= 1) {
(void) printf("Executing newer ztest: %s\n",
cmd);
}
newer++;
killed = exec_child(cmd, NULL, B_TRUE, &status);
} else {
if (hasalt && ztest_opts.zo_verbose >= 1) {
(void) printf("Executing older ztest: %s\n",
ztest_opts.zo_alt_ztest);
}
older++;
killed = exec_child(ztest_opts.zo_alt_ztest,
ztest_opts.zo_alt_libpath, B_TRUE, &status);
}
if (killed)
kills++;
iters++;
if (ztest_opts.zo_verbose >= 1) {
hrtime_t now = gethrtime();
now = MIN(now, zs->zs_proc_stop);
print_time(zs->zs_proc_stop - now, timebuf);
nicenum(zs->zs_space, numbuf, sizeof (numbuf));
(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
"%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
iters,
WIFEXITED(status) ? "Complete" : "SIGKILL",
(u_longlong_t)zs->zs_enospc_count,
100.0 * zs->zs_alloc / zs->zs_space,
numbuf,
100.0 * (now - zs->zs_proc_start) /
(ztest_opts.zo_time * NANOSEC), timebuf);
}
if (ztest_opts.zo_verbose >= 2) {
(void) printf("\nWorkload summary:\n\n");
(void) printf("%7s %9s %s\n",
"Calls", "Time", "Function");
(void) printf("%7s %9s %s\n",
"-----", "----", "--------");
for (int f = 0; f < ZTEST_FUNCS; f++) {
Dl_info dli;
zi = &ztest_info[f];
zc = ZTEST_GET_SHARED_CALLSTATE(f);
print_time(zc->zc_time, timebuf);
(void) dladdr((void *)zi->zi_func, &dli);
(void) printf("%7llu %9s %s\n",
(u_longlong_t)zc->zc_count, timebuf,
dli.dli_sname);
}
(void) printf("\n");
}
/*
* It's possible that we killed a child during a rename test,
* in which case we'll have a 'ztest_tmp' pool lying around
* instead of 'ztest'. Do a blind rename in case this happened.
*/
kernel_init(FREAD);
if (spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0) {
spa_close(spa, FTAG);
} else {
char tmpname[ZFS_MAX_DATASET_NAME_LEN];
kernel_fini();
kernel_init(FREAD | FWRITE);
(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
ztest_opts.zo_pool);
(void) spa_rename(tmpname, ztest_opts.zo_pool);
}
kernel_fini();
ztest_run_zdb(ztest_opts.zo_pool);
}
if (ztest_opts.zo_verbose >= 1) {
if (hasalt) {
(void) printf("%d runs of older ztest: %s\n", older,
ztest_opts.zo_alt_ztest);
(void) printf("%d runs of newer ztest: %s\n", newer,
cmd);
}
(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
kills, iters - kills, (100.0 * kills) / MAX(1, iters));
}
umem_free(cmd, MAXNAMELEN);
return (0);
}
Index: stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
===================================================================
--- stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h (revision 332525)
@@ -1,833 +1,837 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc.
* Copyright (c) 2017 Datto Inc.
*/
#ifndef _LIBZFS_H
#define _LIBZFS_H
#include <assert.h>
#include <libnvpair.h>
#include <sys/mnttab.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/varargs.h>
#include <sys/fs/zfs.h>
#include <sys/avl.h>
#include <sys/zfs_ioctl.h>
#include <libzfs_core.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Miscellaneous ZFS constants
*/
#define ZFS_MAXPROPLEN MAXPATHLEN
#define ZPOOL_MAXPROPLEN MAXPATHLEN
/*
* libzfs errors
*/
typedef enum zfs_error {
EZFS_SUCCESS = 0, /* no error -- success */
EZFS_NOMEM = 2000, /* out of memory */
EZFS_BADPROP, /* invalid property value */
EZFS_PROPREADONLY, /* cannot set readonly property */
EZFS_PROPTYPE, /* property does not apply to dataset type */
EZFS_PROPNONINHERIT, /* property is not inheritable */
EZFS_PROPSPACE, /* bad quota or reservation */
EZFS_BADTYPE, /* dataset is not of appropriate type */
EZFS_BUSY, /* pool or dataset is busy */
EZFS_EXISTS, /* pool or dataset already exists */
EZFS_NOENT, /* no such pool or dataset */
EZFS_BADSTREAM, /* bad backup stream */
EZFS_DSREADONLY, /* dataset is readonly */
EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */
EZFS_INVALIDNAME, /* invalid dataset name */
EZFS_BADRESTORE, /* unable to restore to destination */
EZFS_BADBACKUP, /* backup failed */
EZFS_BADTARGET, /* bad attach/detach/replace target */
EZFS_NODEVICE, /* no such device in pool */
EZFS_BADDEV, /* invalid device to add */
EZFS_NOREPLICAS, /* no valid replicas */
EZFS_RESILVERING, /* currently resilvering */
EZFS_BADVERSION, /* unsupported version */
EZFS_POOLUNAVAIL, /* pool is currently unavailable */
EZFS_DEVOVERFLOW, /* too many devices in one vdev */
EZFS_BADPATH, /* must be an absolute path */
EZFS_CROSSTARGET, /* rename or clone across pool or dataset */
EZFS_ZONED, /* used improperly in local zone */
EZFS_MOUNTFAILED, /* failed to mount dataset */
EZFS_UMOUNTFAILED, /* failed to unmount dataset */
EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */
EZFS_SHARENFSFAILED, /* share(1M) failed */
EZFS_PERM, /* permission denied */
EZFS_NOSPC, /* out of space */
EZFS_FAULT, /* bad address */
EZFS_IO, /* I/O error */
EZFS_INTR, /* signal received */
EZFS_ISSPARE, /* device is a hot spare */
EZFS_INVALCONFIG, /* invalid vdev configuration */
EZFS_RECURSIVE, /* recursive dependency */
EZFS_NOHISTORY, /* no history object */
EZFS_POOLPROPS, /* couldn't retrieve pool props */
EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */
EZFS_POOL_INVALARG, /* invalid argument for this pool operation */
EZFS_NAMETOOLONG, /* dataset name is too long */
EZFS_OPENFAILED, /* open of device failed */
EZFS_NOCAP, /* couldn't get capacity */
EZFS_LABELFAILED, /* write of label failed */
EZFS_BADWHO, /* invalid permission who */
EZFS_BADPERM, /* invalid permission */
EZFS_BADPERMSET, /* invalid permission set name */
EZFS_NODELEGATION, /* delegated administration is disabled */
EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */
EZFS_SHARESMBFAILED, /* failed to share over smb */
EZFS_BADCACHE, /* bad cache file */
EZFS_ISL2CACHE, /* device is for the level 2 ARC */
EZFS_VDEVNOTSUP, /* unsupported vdev type */
EZFS_NOTSUP, /* ops not supported on this dataset */
EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */
EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */
EZFS_REFTAG_RELE, /* snapshot release: tag not found */
EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */
EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */
EZFS_PIPEFAILED, /* pipe create failed */
EZFS_THREADCREATEFAILED, /* thread create failed */
EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */
EZFS_SCRUBBING, /* currently scrubbing */
EZFS_NO_SCRUB, /* no active scrub */
EZFS_DIFF, /* general failure of zfs diff */
EZFS_DIFFDATA, /* bad zfs diff data */
EZFS_POOLREADONLY, /* pool is in read-only mode */
EZFS_SCRUB_PAUSED, /* scrub currently paused */
+ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */
EZFS_UNKNOWN
} zfs_error_t;
/*
* UEFI boot support parameters. When creating whole disk boot pool,
* zpool create should allow to create EFI System partition for UEFI boot
* program. In case of BIOS, the EFI System partition is not used
* even if it does exist.
*/
typedef enum zpool_boot_label {
ZPOOL_NO_BOOT_LABEL = 0,
ZPOOL_CREATE_BOOT_LABEL,
ZPOOL_COPY_BOOT_LABEL
} zpool_boot_label_t;
/*
* The following data structures are all part
* of the zfs_allow_t data structure which is
* used for printing 'allow' permissions.
* It is a linked list of zfs_allow_t's which
* then contain avl tree's for user/group/sets/...
* and each one of the entries in those trees have
* avl tree's for the permissions they belong to and
* whether they are local,descendent or local+descendent
* permissions. The AVL trees are used primarily for
* sorting purposes, but also so that we can quickly find
* a given user and or permission.
*/
typedef struct zfs_perm_node {
avl_node_t z_node;
char z_pname[MAXPATHLEN];
} zfs_perm_node_t;
typedef struct zfs_allow_node {
avl_node_t z_node;
char z_key[MAXPATHLEN]; /* name, such as joe */
avl_tree_t z_localdescend; /* local+descendent perms */
avl_tree_t z_local; /* local permissions */
avl_tree_t z_descend; /* descendent permissions */
} zfs_allow_node_t;
typedef struct zfs_allow {
struct zfs_allow *z_next;
char z_setpoint[MAXPATHLEN];
avl_tree_t z_sets;
avl_tree_t z_crperms;
avl_tree_t z_user;
avl_tree_t z_group;
avl_tree_t z_everyone;
} zfs_allow_t;
/*
* Basic handle types
*/
typedef struct zfs_handle zfs_handle_t;
typedef struct zpool_handle zpool_handle_t;
typedef struct libzfs_handle libzfs_handle_t;
/*
* Library initialization
*/
extern libzfs_handle_t *libzfs_init(void);
extern void libzfs_fini(libzfs_handle_t *);
extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *);
extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *);
extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);
extern void zfs_save_arguments(int argc, char **, char *, int);
extern int zpool_log_history(libzfs_handle_t *, const char *);
extern int libzfs_errno(libzfs_handle_t *);
extern const char *libzfs_error_action(libzfs_handle_t *);
extern const char *libzfs_error_description(libzfs_handle_t *);
extern int zfs_standard_error(libzfs_handle_t *, int, const char *);
extern void libzfs_mnttab_init(libzfs_handle_t *);
extern void libzfs_mnttab_fini(libzfs_handle_t *);
extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t);
extern int libzfs_mnttab_find(libzfs_handle_t *, const char *,
struct mnttab *);
extern void libzfs_mnttab_add(libzfs_handle_t *, const char *,
const char *, const char *);
extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *);
/*
* Basic handle functions
*/
extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *);
extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *);
extern void zpool_close(zpool_handle_t *);
extern const char *zpool_get_name(zpool_handle_t *);
extern int zpool_get_state(zpool_handle_t *);
extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t);
extern const char *zpool_pool_state_to_name(pool_state_t);
extern void zpool_free_handles(libzfs_handle_t *);
extern int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *);
/*
* Iterate over all active pools in the system.
*/
typedef int (*zpool_iter_f)(zpool_handle_t *, void *);
extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *);
extern boolean_t zpool_skip_pool(const char *);
/*
* Functions to create and destroy pools
*/
extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
nvlist_t *, nvlist_t *);
extern int zpool_destroy(zpool_handle_t *, const char *);
extern int zpool_add(zpool_handle_t *, nvlist_t *);
typedef struct splitflags {
/* do not split, but return the config that would be split off */
int dryrun : 1;
/* after splitting, import the pool */
int import : 1;
} splitflags_t;
/*
* Functions to manipulate pool and vdev state
*/
extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
extern int zpool_reguid(zpool_handle_t *);
extern int zpool_reopen(zpool_handle_t *);
extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
vdev_state_t *);
extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
extern int zpool_vdev_attach(zpool_handle_t *, const char *,
const char *, nvlist_t *, int);
extern int zpool_vdev_detach(zpool_handle_t *, const char *);
extern int zpool_vdev_remove(zpool_handle_t *, const char *);
+extern int zpool_vdev_remove_cancel(zpool_handle_t *);
+extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *);
extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *,
splitflags_t);
extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t);
extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t);
extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);
extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
boolean_t *, boolean_t *);
extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
boolean_t *, boolean_t *, boolean_t *);
extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *,
zpool_boot_label_t, uint64_t, int *);
/*
* Functions to manage pool properties
*/
extern int zpool_set_prop(zpool_handle_t *, const char *, const char *);
extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *,
size_t proplen, zprop_source_t *, boolean_t);
extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t,
zprop_source_t *);
extern const char *zpool_prop_to_name(zpool_prop_t);
extern const char *zpool_prop_values(zpool_prop_t);
/*
* Pool health statistics.
*/
typedef enum {
/*
* The following correspond to faults as defined in the (fault.fs.zfs.*)
* event namespace. Each is associated with a corresponding message ID.
*/
ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */
ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */
ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */
ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */
ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */
ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */
ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */
ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */
ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */
ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */
ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */
ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */
ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */
ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */
/*
* If the pool has unsupported features but can still be opened in
* read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the
* pool has unsupported features but cannot be opened at all, its
* status is ZPOOL_STATUS_UNSUP_FEAT_READ.
*/
ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */
ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */
/*
* These faults have no corresponding message ID. At the time we are
* checking the status, the original reason for the FMA fault (I/O or
* checksum errors) has been lost.
*/
ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */
ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */
/*
* The following are not faults per se, but still an error possibly
* requiring administrative attention. There is no corresponding
* message ID.
*/
ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */
ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */
ZPOOL_STATUS_RESILVERING, /* device being resilvered */
ZPOOL_STATUS_OFFLINE_DEV, /* device offline */
ZPOOL_STATUS_REMOVED_DEV, /* removed device */
ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */
/*
* Finally, the following indicates a healthy pool.
*/
ZPOOL_STATUS_OK
} zpool_status_t;
extern zpool_status_t zpool_get_status(zpool_handle_t *, char **);
extern zpool_status_t zpool_import_status(nvlist_t *, char **);
extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh);
/*
* Statistics and configuration functions.
*/
extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
extern nvlist_t *zpool_get_features(zpool_handle_t *);
extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
extern boolean_t zpool_is_bootable(zpool_handle_t *);
/*
* Import and export functions
*/
extern int zpool_export(zpool_handle_t *, boolean_t, const char *);
extern int zpool_export_force(zpool_handle_t *, const char *);
extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
char *altroot);
extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
nvlist_t *, int);
extern void zpool_print_unsup_feat(nvlist_t *config);
/*
* Search for pools to import
*/
typedef struct importargs {
char **path; /* a list of paths to search */
int paths; /* number of paths to search */
char *poolname; /* name of a pool to find */
uint64_t guid; /* guid of a pool to find */
char *cachefile; /* cachefile to use for import */
int can_be_active : 1; /* can the pool be active? */
int unique : 1; /* does 'poolname' already exist? */
int exists : 1; /* set on return if pool already exists */
} importargs_t;
extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *);
/* legacy pool search routines */
extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **);
extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *,
char *, uint64_t);
/*
* Miscellaneous pool functions
*/
struct zfs_cmd;
extern const char *zfs_history_event_names[];
extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
boolean_t verbose);
extern int zpool_upgrade(zpool_handle_t *, uint64_t);
extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
nvlist_t ***, uint_t *);
extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
size_t len);
extern int zfs_ioctl(libzfs_handle_t *, int request, struct zfs_cmd *);
extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
nvlist_t *);
/*
* Basic handle manipulations. These functions do not create or destroy the
* underlying datasets, only the references to them.
*/
extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int);
extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *);
extern void zfs_close(zfs_handle_t *);
extern zfs_type_t zfs_get_type(const zfs_handle_t *);
extern const char *zfs_get_name(const zfs_handle_t *);
extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *);
extern const char *zfs_get_pool_name(const zfs_handle_t *);
/*
* Property management functions. Some functions are shared with the kernel,
* and are found in sys/fs/zfs.h.
*/
/*
* zfs dataset property management
*/
extern const char *zfs_prop_default_string(zfs_prop_t);
extern uint64_t zfs_prop_default_numeric(zfs_prop_t);
extern const char *zfs_prop_column_name(zfs_prop_t);
extern boolean_t zfs_prop_align_right(zfs_prop_t);
extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t,
nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, const char *);
extern const char *zfs_prop_to_name(zfs_prop_t);
extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *);
extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
zprop_source_t *, char *, size_t, boolean_t);
extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t,
boolean_t);
extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
zprop_source_t *, char *, size_t);
extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
uint64_t *propvalue);
extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
char *propbuf, int proplen, boolean_t literal);
extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
uint64_t *propvalue);
extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
char *propbuf, int proplen, boolean_t literal);
extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname,
char *buf, size_t len);
extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t);
extern const char *zfs_prop_values(zfs_prop_t);
extern int zfs_prop_is_string(zfs_prop_t prop);
extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *);
extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *);
typedef struct zprop_list {
int pl_prop;
char *pl_user_prop;
struct zprop_list *pl_next;
boolean_t pl_all;
size_t pl_width;
size_t pl_recvd_width;
boolean_t pl_fixed;
} zprop_list_t;
extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t,
boolean_t);
extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);
#define ZFS_MOUNTPOINT_NONE "none"
#define ZFS_MOUNTPOINT_LEGACY "legacy"
#define ZFS_FEATURE_DISABLED "disabled"
#define ZFS_FEATURE_ENABLED "enabled"
#define ZFS_FEATURE_ACTIVE "active"
#define ZFS_UNSUPPORTED_INACTIVE "inactive"
#define ZFS_UNSUPPORTED_READONLY "readonly"
/*
* zpool property management
*/
extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **);
extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *,
size_t);
extern const char *zpool_prop_default_string(zpool_prop_t);
extern uint64_t zpool_prop_default_numeric(zpool_prop_t);
extern const char *zpool_prop_column_name(zpool_prop_t);
extern boolean_t zpool_prop_align_right(zpool_prop_t);
/*
* Functions shared by zfs and zpool property management.
*/
extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all,
boolean_t ordered, zfs_type_t type);
extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **,
zfs_type_t);
extern void zprop_free_list(zprop_list_t *);
#define ZFS_GET_NCOLS 5
typedef enum {
GET_COL_NONE,
GET_COL_NAME,
GET_COL_PROPERTY,
GET_COL_VALUE,
GET_COL_RECVD,
GET_COL_SOURCE
} zfs_get_column_t;
/*
* Functions for printing zfs or zpool properties
*/
typedef struct zprop_get_cbdata {
int cb_sources;
zfs_get_column_t cb_columns[ZFS_GET_NCOLS];
int cb_colwidths[ZFS_GET_NCOLS + 1];
boolean_t cb_scripted;
boolean_t cb_literal;
boolean_t cb_first;
zprop_list_t *cb_proplist;
zfs_type_t cb_type;
} zprop_get_cbdata_t;
void zprop_print_one_property(const char *, zprop_get_cbdata_t *,
const char *, const char *, zprop_source_t, const char *,
const char *);
/*
* Iterator functions.
*/
typedef int (*zfs_iter_f)(zfs_handle_t *, void *);
extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *);
extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *);
extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *);
extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *);
typedef struct get_all_cb {
zfs_handle_t **cb_handles;
size_t cb_alloc;
size_t cb_used;
boolean_t cb_verbose;
int (*cb_getone)(zfs_handle_t *, void *);
} get_all_cb_t;
void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
int libzfs_dataset_cmp(const void *, const void *);
/*
* Functions to create and destroy datasets.
*/
extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
nvlist_t *);
extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
extern int zfs_destroy(zfs_handle_t *, boolean_t);
extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t);
extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps,
nvlist_t *props);
extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
typedef struct renameflags {
/* recursive rename */
int recurse : 1;
/* don't unmount file systems */
int nounmount : 1;
/* force unmount file systems */
int forceunmount : 1;
} renameflags_t;
extern int zfs_rename(zfs_handle_t *, const char *, const char *,
renameflags_t flags);
typedef struct sendflags {
/* print informational messages (ie, -v was specified) */
boolean_t verbose;
/* recursive send (ie, -R) */
boolean_t replicate;
/* for incrementals, do all intermediate snapshots */
boolean_t doall;
/* if dataset is a clone, do incremental from its origin */
boolean_t fromorigin;
/* do deduplication */
boolean_t dedup;
/* send properties (ie, -p) */
boolean_t props;
/* do not send (no-op, ie. -n) */
boolean_t dryrun;
/* parsable verbose output (ie. -P) */
boolean_t parsable;
/* show progress (ie. -v) */
boolean_t progress;
/* large blocks (>128K) are permitted */
boolean_t largeblock;
/* WRITE_EMBEDDED records of type DATA are permitted */
boolean_t embed_data;
/* compressed WRITE records are permitted */
boolean_t compress;
} sendflags_t;
typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
extern int zfs_send(zfs_handle_t *, const char *, const char *,
sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd,
const char *);
extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl,
const char *token);
extern int zfs_promote(zfs_handle_t *);
extern int zfs_hold(zfs_handle_t *, const char *, const char *,
boolean_t, int);
extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
uid_t rid, uint64_t space);
extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t,
zfs_userspace_cb_t, void *);
extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **);
extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *);
typedef struct recvflags {
/* print informational messages (ie, -v was specified) */
boolean_t verbose;
/* the destination is a prefix, not the exact fs (ie, -d) */
boolean_t isprefix;
/*
* Only the tail of the sent snapshot path is appended to the
* destination to determine the received snapshot name (ie, -e).
*/
boolean_t istail;
/* do not actually do the recv, just check if it would work (ie, -n) */
boolean_t dryrun;
/* rollback/destroy filesystems as necessary (eg, -F) */
boolean_t force;
/* set "canmount=off" on all modified filesystems */
boolean_t canmountoff;
/*
* Mark the file systems as "resumable" and do not destroy them if the
* receive is interrupted
*/
boolean_t resumable;
/* byteswap flag is used internally; callers need not specify */
boolean_t byteswap;
/* do not mount file systems as they are extracted (private) */
boolean_t nomount;
} recvflags_t;
extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *,
recvflags_t *, int, avl_tree_t *);
typedef enum diff_flags {
ZFS_DIFF_PARSEABLE = 0x1,
ZFS_DIFF_TIMESTAMP = 0x2,
ZFS_DIFF_CLASSIFY = 0x4
} diff_flags_t;
extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *,
int);
/*
* Miscellaneous functions.
*/
extern const char *zfs_type_to_name(zfs_type_t);
extern void zfs_refresh_properties(zfs_handle_t *);
extern int zfs_name_valid(const char *, zfs_type_t);
extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t);
extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *,
zfs_type_t);
extern int zfs_spa_version(zfs_handle_t *, int *);
extern boolean_t zfs_bookmark_exists(const char *path);
/*
* Mount support functions.
*/
extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **);
extern boolean_t zfs_is_mounted(zfs_handle_t *, char **);
extern int zfs_mount(zfs_handle_t *, const char *, int);
extern int zfs_unmount(zfs_handle_t *, const char *, int);
extern int zfs_unmountall(zfs_handle_t *, int);
/*
* Share support functions.
*/
extern boolean_t zfs_is_shared(zfs_handle_t *);
extern int zfs_share(zfs_handle_t *);
extern int zfs_unshare(zfs_handle_t *);
/*
* Protocol-specific share support functions.
*/
extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **);
extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **);
extern int zfs_share_nfs(zfs_handle_t *);
extern int zfs_share_smb(zfs_handle_t *);
extern int zfs_shareall(zfs_handle_t *);
extern int zfs_unshare_nfs(zfs_handle_t *, const char *);
extern int zfs_unshare_smb(zfs_handle_t *, const char *);
extern int zfs_unshareall_nfs(zfs_handle_t *);
extern int zfs_unshareall_smb(zfs_handle_t *);
extern int zfs_unshareall_bypath(zfs_handle_t *, const char *);
extern int zfs_unshareall(zfs_handle_t *);
extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
void *, void *, int, zfs_share_op_t);
/*
* FreeBSD-specific jail support function.
*/
extern int zfs_jail(zfs_handle_t *, int, int);
/*
* When dealing with nvlists, verify() is extremely useful
*/
#ifndef verify
#ifdef NDEBUG
#define verify(EX) ((void)(EX))
#else
#define verify(EX) assert(EX)
#endif
#endif
/*
* Utility function to convert a number to a human-readable form.
*/
extern void zfs_nicenum(uint64_t, char *, size_t);
extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *);
/*
* Given a device or file, determine if it is part of a pool.
*/
extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
boolean_t *);
/*
* Label manipulation.
*/
extern int zpool_read_label(int, nvlist_t **);
extern int zpool_read_all_labels(int, nvlist_t **);
extern int zpool_clear_label(int);
/* is this zvol valid for use as a dump device? */
extern int zvol_check_dump_config(char *);
/*
* Management interfaces for SMB ACL files
*/
int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *);
int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *);
int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *);
int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *);
/*
* Enable and disable datasets within a pool by mounting/unmounting and
* sharing/unsharing them.
*/
extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);
/*
* Mappings between vdev and FRU.
*/
extern void libzfs_fru_refresh(libzfs_handle_t *);
extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *);
extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *);
extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *,
const char *);
extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *);
extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *);
#ifndef illumos
extern int zmount(const char *, const char *, int, char *, char *, int, char *,
int);
#endif
+extern int zfs_remap_indirects(libzfs_handle_t *hdl, const char *);
#ifdef __cplusplus
}
#endif
#endif /* _LIBZFS_H */
Index: stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
===================================================================
--- stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c (revision 332525)
@@ -1,5095 +1,5113 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
* Copyright (c) 2013 Martin Matuska. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright 2017 Nexenta Systems, Inc.
* Copyright 2017 RackTop Systems.
*/
#include <ctype.h>
#include <errno.h>
#include <libintl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <stddef.h>
#include <zone.h>
#include <fcntl.h>
#include <sys/mntent.h>
#include <sys/mount.h>
#include <priv.h>
#include <pwd.h>
#include <grp.h>
#include <stddef.h>
#include <idmap.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zap.h>
#include <sys/misc.h>
#include <libzfs.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "libzfs_impl.h"
#include "zfs_deleg.h"
static int userquota_propname_decode(const char *propname, boolean_t zoned,
zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp);
/*
* Given a single type (not a mask of types), return the type in a human
* readable form.
*/
const char *
zfs_type_to_name(zfs_type_t type)
{
switch (type) {
case ZFS_TYPE_FILESYSTEM:
return (dgettext(TEXT_DOMAIN, "filesystem"));
case ZFS_TYPE_SNAPSHOT:
return (dgettext(TEXT_DOMAIN, "snapshot"));
case ZFS_TYPE_VOLUME:
return (dgettext(TEXT_DOMAIN, "volume"));
case ZFS_TYPE_POOL:
return (dgettext(TEXT_DOMAIN, "pool"));
case ZFS_TYPE_BOOKMARK:
return (dgettext(TEXT_DOMAIN, "bookmark"));
default:
assert(!"unhandled zfs_type_t");
}
return (NULL);
}
/*
* Validate a ZFS path. This is used even before trying to open the dataset, to
* provide a more meaningful error message. We call zfs_error_aux() to
* explain exactly why the name was not valid.
*/
int
zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
boolean_t modifying)
{
namecheck_err_t why;
char what;
if (entity_namecheck(path, &why, &what) != 0) {
if (hdl != NULL) {
switch (why) {
case NAME_ERR_TOOLONG:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"name is too long"));
break;
case NAME_ERR_LEADING_SLASH:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"leading slash in name"));
break;
case NAME_ERR_EMPTY_COMPONENT:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"empty component in name"));
break;
case NAME_ERR_TRAILING_SLASH:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"trailing slash in name"));
break;
case NAME_ERR_INVALCHAR:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "invalid character "
"'%c' in name"), what);
break;
case NAME_ERR_MULTIPLE_DELIMITERS:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"multiple '@' and/or '#' delimiters in "
"name"));
break;
case NAME_ERR_NOLETTER:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool doesn't begin with a letter"));
break;
case NAME_ERR_RESERVED:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"name is reserved"));
break;
case NAME_ERR_DISKLIKE:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"reserved disk name"));
break;
default:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"(%d) not defined"), why);
break;
}
}
return (0);
}
if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) {
if (hdl != NULL)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"snapshot delimiter '@' is not expected here"));
return (0);
}
if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) {
if (hdl != NULL)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"missing '@' delimiter in snapshot name"));
return (0);
}
if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) {
if (hdl != NULL)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"bookmark delimiter '#' is not expected here"));
return (0);
}
if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) {
if (hdl != NULL)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"missing '#' delimiter in bookmark name"));
return (0);
}
if (modifying && strchr(path, '%') != NULL) {
if (hdl != NULL)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid character %c in name"), '%');
return (0);
}
return (-1);
}
int
zfs_name_valid(const char *name, zfs_type_t type)
{
if (type == ZFS_TYPE_POOL)
return (zpool_name_valid(NULL, B_FALSE, name));
return (zfs_validate_name(NULL, name, type, B_FALSE));
}
/*
* This function takes the raw DSL properties, and filters out the user-defined
* properties into a separate nvlist.
*/
static nvlist_t *
process_user_props(zfs_handle_t *zhp, nvlist_t *props)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
nvpair_t *elem;
nvlist_t *propval;
nvlist_t *nvl;
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) {
(void) no_memory(hdl);
return (NULL);
}
elem = NULL;
while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
if (!zfs_prop_user(nvpair_name(elem)))
continue;
verify(nvpair_value_nvlist(elem, &propval) == 0);
if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) {
nvlist_free(nvl);
(void) no_memory(hdl);
return (NULL);
}
}
return (nvl);
}
static zpool_handle_t *
zpool_add_handle(zfs_handle_t *zhp, const char *pool_name)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
zpool_handle_t *zph;
if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) {
if (hdl->libzfs_pool_handles != NULL)
zph->zpool_next = hdl->libzfs_pool_handles;
hdl->libzfs_pool_handles = zph;
}
return (zph);
}
static zpool_handle_t *
zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
zpool_handle_t *zph = hdl->libzfs_pool_handles;
while ((zph != NULL) &&
(strncmp(pool_name, zpool_get_name(zph), len) != 0))
zph = zph->zpool_next;
return (zph);
}
/*
* Returns a handle to the pool that contains the provided dataset.
* If a handle to that pool already exists then that handle is returned.
* Otherwise, a new handle is created and added to the list of handles.
*/
static zpool_handle_t *
zpool_handle(zfs_handle_t *zhp)
{
char *pool_name;
int len;
zpool_handle_t *zph;
len = strcspn(zhp->zfs_name, "/@#") + 1;
pool_name = zfs_alloc(zhp->zfs_hdl, len);
(void) strlcpy(pool_name, zhp->zfs_name, len);
zph = zpool_find_handle(zhp, pool_name, len);
if (zph == NULL)
zph = zpool_add_handle(zhp, pool_name);
free(pool_name);
return (zph);
}
void
zpool_free_handles(libzfs_handle_t *hdl)
{
zpool_handle_t *next, *zph = hdl->libzfs_pool_handles;
while (zph != NULL) {
next = zph->zpool_next;
zpool_close(zph);
zph = next;
}
hdl->libzfs_pool_handles = NULL;
}
/*
* Utility function to gather stats (objset and zpl) for the given object.
*/
static int
get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
(void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) {
if (errno == ENOMEM) {
if (zcmd_expand_dst_nvlist(hdl, zc) != 0) {
return (-1);
}
} else {
return (-1);
}
}
return (0);
}
/*
* Utility function to get the received properties of the given object.
*/
static int
get_recvd_props_ioctl(zfs_handle_t *zhp)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
nvlist_t *recvdprops;
zfs_cmd_t zc = { 0 };
int err;
if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
return (-1);
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) {
if (errno == ENOMEM) {
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
return (-1);
}
} else {
zcmd_free_nvlists(&zc);
return (-1);
}
}
err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops);
zcmd_free_nvlists(&zc);
if (err != 0)
return (-1);
nvlist_free(zhp->zfs_recvd_props);
zhp->zfs_recvd_props = recvdprops;
return (0);
}
static int
put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
{
nvlist_t *allprops, *userprops;
zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */
if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) {
return (-1);
}
/*
* XXX Why do we store the user props separately, in addition to
* storing them in zfs_props?
*/
if ((userprops = process_user_props(zhp, allprops)) == NULL) {
nvlist_free(allprops);
return (-1);
}
nvlist_free(zhp->zfs_props);
nvlist_free(zhp->zfs_user_props);
zhp->zfs_props = allprops;
zhp->zfs_user_props = userprops;
return (0);
}
static int
get_stats(zfs_handle_t *zhp)
{
int rc = 0;
zfs_cmd_t zc = { 0 };
if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
return (-1);
if (get_stats_ioctl(zhp, &zc) != 0)
rc = -1;
else if (put_stats_zhdl(zhp, &zc) != 0)
rc = -1;
zcmd_free_nvlists(&zc);
return (rc);
}
/*
* Refresh the properties currently stored in the handle.
*/
void
zfs_refresh_properties(zfs_handle_t *zhp)
{
(void) get_stats(zhp);
}
/*
* Makes a handle from the given dataset name. Used by zfs_open() and
* zfs_iter_* to create child handles on the fly.
*/
static int
make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
{
if (put_stats_zhdl(zhp, zc) != 0)
return (-1);
/*
* We've managed to open the dataset and gather statistics. Determine
* the high-level type.
*/
if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
zhp->zfs_head_type = ZFS_TYPE_VOLUME;
else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM;
else
abort();
if (zhp->zfs_dmustats.dds_is_snapshot)
zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
zhp->zfs_type = ZFS_TYPE_VOLUME;
else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
zhp->zfs_type = ZFS_TYPE_FILESYSTEM;
else
abort(); /* we should never see any other types */
if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL)
return (-1);
return (0);
}
zfs_handle_t *
make_dataset_handle(libzfs_handle_t *hdl, const char *path)
{
zfs_cmd_t zc = { 0 };
zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
if (zhp == NULL)
return (NULL);
zhp->zfs_hdl = hdl;
(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) {
free(zhp);
return (NULL);
}
if (get_stats_ioctl(zhp, &zc) == -1) {
zcmd_free_nvlists(&zc);
free(zhp);
return (NULL);
}
if (make_dataset_handle_common(zhp, &zc) == -1) {
free(zhp);
zhp = NULL;
}
zcmd_free_nvlists(&zc);
return (zhp);
}
zfs_handle_t *
make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc)
{
zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
if (zhp == NULL)
return (NULL);
zhp->zfs_hdl = hdl;
(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
if (make_dataset_handle_common(zhp, zc) == -1) {
free(zhp);
return (NULL);
}
return (zhp);
}
zfs_handle_t *
make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc)
{
zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
if (zhp == NULL)
return (NULL);
zhp->zfs_hdl = pzhp->zfs_hdl;
(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
zhp->zfs_head_type = pzhp->zfs_type;
zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
zhp->zpool_hdl = zpool_handle(zhp);
return (zhp);
}
zfs_handle_t *
zfs_handle_dup(zfs_handle_t *zhp_orig)
{
zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
if (zhp == NULL)
return (NULL);
zhp->zfs_hdl = zhp_orig->zfs_hdl;
zhp->zpool_hdl = zhp_orig->zpool_hdl;
(void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name,
sizeof (zhp->zfs_name));
zhp->zfs_type = zhp_orig->zfs_type;
zhp->zfs_head_type = zhp_orig->zfs_head_type;
zhp->zfs_dmustats = zhp_orig->zfs_dmustats;
if (zhp_orig->zfs_props != NULL) {
if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) {
(void) no_memory(zhp->zfs_hdl);
zfs_close(zhp);
return (NULL);
}
}
if (zhp_orig->zfs_user_props != NULL) {
if (nvlist_dup(zhp_orig->zfs_user_props,
&zhp->zfs_user_props, 0) != 0) {
(void) no_memory(zhp->zfs_hdl);
zfs_close(zhp);
return (NULL);
}
}
if (zhp_orig->zfs_recvd_props != NULL) {
if (nvlist_dup(zhp_orig->zfs_recvd_props,
&zhp->zfs_recvd_props, 0)) {
(void) no_memory(zhp->zfs_hdl);
zfs_close(zhp);
return (NULL);
}
}
zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck;
if (zhp_orig->zfs_mntopts != NULL) {
zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl,
zhp_orig->zfs_mntopts);
}
zhp->zfs_props_table = zhp_orig->zfs_props_table;
return (zhp);
}
boolean_t
zfs_bookmark_exists(const char *path)
{
nvlist_t *bmarks;
nvlist_t *props;
char fsname[ZFS_MAX_DATASET_NAME_LEN];
char *bmark_name;
char *pound;
int err;
boolean_t rv;
(void) strlcpy(fsname, path, sizeof (fsname));
pound = strchr(fsname, '#');
if (pound == NULL)
return (B_FALSE);
*pound = '\0';
bmark_name = pound + 1;
props = fnvlist_alloc();
err = lzc_get_bookmarks(fsname, props, &bmarks);
nvlist_free(props);
if (err != 0) {
nvlist_free(bmarks);
return (B_FALSE);
}
rv = nvlist_exists(bmarks, bmark_name);
nvlist_free(bmarks);
return (rv);
}
zfs_handle_t *
make_bookmark_handle(zfs_handle_t *parent, const char *path,
nvlist_t *bmark_props)
{
zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
if (zhp == NULL)
return (NULL);
/* Fill in the name. */
zhp->zfs_hdl = parent->zfs_hdl;
(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
/* Set the property lists. */
if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) {
free(zhp);
return (NULL);
}
/* Set the types. */
zhp->zfs_head_type = parent->zfs_head_type;
zhp->zfs_type = ZFS_TYPE_BOOKMARK;
if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) {
nvlist_free(zhp->zfs_props);
free(zhp);
return (NULL);
}
return (zhp);
}
struct zfs_open_bookmarks_cb_data {
const char *path;
zfs_handle_t *zhp;
};
static int
zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data)
{
struct zfs_open_bookmarks_cb_data *dp = data;
/*
* Is it the one we are looking for?
*/
if (strcmp(dp->path, zfs_get_name(zhp)) == 0) {
/*
* We found it. Save it and let the caller know we are done.
*/
dp->zhp = zhp;
return (EEXIST);
}
/*
* Not found. Close the handle and ask for another one.
*/
zfs_close(zhp);
return (0);
}
/*
* Opens the given snapshot, bookmark, filesystem, or volume. The 'types'
* argument is a mask of acceptable types. The function will print an
* appropriate error message and return NULL if it can't be opened.
*/
zfs_handle_t *
zfs_open(libzfs_handle_t *hdl, const char *path, int types)
{
zfs_handle_t *zhp;
char errbuf[1024];
char *bookp;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
/*
* Validate the name before we even try to open it.
*/
if (!zfs_validate_name(hdl, path, types, B_FALSE)) {
(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
return (NULL);
}
/*
* Bookmarks needs to be handled separately.
*/
bookp = strchr(path, '#');
if (bookp == NULL) {
/*
* Try to get stats for the dataset, which will tell us if it
* exists.
*/
errno = 0;
if ((zhp = make_dataset_handle(hdl, path)) == NULL) {
(void) zfs_standard_error(hdl, errno, errbuf);
return (NULL);
}
} else {
char dsname[ZFS_MAX_DATASET_NAME_LEN];
zfs_handle_t *pzhp;
struct zfs_open_bookmarks_cb_data cb_data = {path, NULL};
/*
* We need to cut out '#' and everything after '#'
* to get the parent dataset name only.
*/
assert(bookp - path < sizeof (dsname));
(void) strncpy(dsname, path, bookp - path);
dsname[bookp - path] = '\0';
/*
* Create handle for the parent dataset.
*/
errno = 0;
if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) {
(void) zfs_standard_error(hdl, errno, errbuf);
return (NULL);
}
/*
* Iterate bookmarks to find the right one.
*/
errno = 0;
if ((zfs_iter_bookmarks(pzhp, zfs_open_bookmarks_cb,
&cb_data) == 0) && (cb_data.zhp == NULL)) {
(void) zfs_error(hdl, EZFS_NOENT, errbuf);
zfs_close(pzhp);
return (NULL);
}
if (cb_data.zhp == NULL) {
(void) zfs_standard_error(hdl, errno, errbuf);
zfs_close(pzhp);
return (NULL);
}
zhp = cb_data.zhp;
/*
* Cleanup.
*/
zfs_close(pzhp);
}
if (zhp == NULL) {
char *at = strchr(path, '@');
if (at != NULL)
*at = '\0';
errno = 0;
if ((zhp = make_dataset_handle(hdl, path)) == NULL) {
(void) zfs_standard_error(hdl, errno, errbuf);
return (NULL);
}
if (at != NULL)
*at = '@';
(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
}
if (!(types & zhp->zfs_type)) {
(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
zfs_close(zhp);
return (NULL);
}
return (zhp);
}
/*
* Release a ZFS handle. Nothing to do but free the associated memory.
*/
void
zfs_close(zfs_handle_t *zhp)
{
if (zhp->zfs_mntopts)
free(zhp->zfs_mntopts);
nvlist_free(zhp->zfs_props);
nvlist_free(zhp->zfs_user_props);
nvlist_free(zhp->zfs_recvd_props);
free(zhp);
}
typedef struct mnttab_node {
struct mnttab mtn_mt;
avl_node_t mtn_node;
} mnttab_node_t;
static int
libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
{
const mnttab_node_t *mtn1 = arg1;
const mnttab_node_t *mtn2 = arg2;
int rv;
rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special);
if (rv == 0)
return (0);
return (rv > 0 ? 1 : -1);
}
void
libzfs_mnttab_init(libzfs_handle_t *hdl)
{
assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
}
void
libzfs_mnttab_update(libzfs_handle_t *hdl)
{
struct mnttab entry;
rewind(hdl->libzfs_mnttab);
while (getmntent(hdl->libzfs_mnttab, &entry) == 0) {
mnttab_node_t *mtn;
if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
continue;
mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special);
mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp);
mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype);
mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts);
avl_add(&hdl->libzfs_mnttab_cache, mtn);
}
}
void
libzfs_mnttab_fini(libzfs_handle_t *hdl)
{
void *cookie = NULL;
mnttab_node_t *mtn;
while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie))
!= NULL) {
free(mtn->mtn_mt.mnt_special);
free(mtn->mtn_mt.mnt_mountp);
free(mtn->mtn_mt.mnt_fstype);
free(mtn->mtn_mt.mnt_mntopts);
free(mtn);
}
avl_destroy(&hdl->libzfs_mnttab_cache);
}
void
libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable)
{
hdl->libzfs_mnttab_enable = enable;
}
int
libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
struct mnttab *entry)
{
mnttab_node_t find;
mnttab_node_t *mtn;
if (!hdl->libzfs_mnttab_enable) {
struct mnttab srch = { 0 };
if (avl_numnodes(&hdl->libzfs_mnttab_cache))
libzfs_mnttab_fini(hdl);
rewind(hdl->libzfs_mnttab);
srch.mnt_special = (char *)fsname;
srch.mnt_fstype = MNTTYPE_ZFS;
if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0)
return (0);
else
return (ENOENT);
}
if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
libzfs_mnttab_update(hdl);
find.mtn_mt.mnt_special = (char *)fsname;
mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
if (mtn) {
*entry = mtn->mtn_mt;
return (0);
}
return (ENOENT);
}
void
libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
const char *mountp, const char *mntopts)
{
mnttab_node_t *mtn;
if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
return;
mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
avl_add(&hdl->libzfs_mnttab_cache, mtn);
}
void
libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
{
mnttab_node_t find;
mnttab_node_t *ret;
find.mtn_mt.mnt_special = (char *)fsname;
if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
!= NULL) {
avl_remove(&hdl->libzfs_mnttab_cache, ret);
free(ret->mtn_mt.mnt_special);
free(ret->mtn_mt.mnt_mountp);
free(ret->mtn_mt.mnt_fstype);
free(ret->mtn_mt.mnt_mntopts);
free(ret);
}
}
int
zfs_spa_version(zfs_handle_t *zhp, int *spa_version)
{
zpool_handle_t *zpool_handle = zhp->zpool_hdl;
if (zpool_handle == NULL)
return (-1);
*spa_version = zpool_get_prop_int(zpool_handle,
ZPOOL_PROP_VERSION, NULL);
return (0);
}
/*
* The choice of reservation property depends on the SPA version.
*/
static int
zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop)
{
int spa_version;
if (zfs_spa_version(zhp, &spa_version) < 0)
return (-1);
if (spa_version >= SPA_VERSION_REFRESERVATION)
*resv_prop = ZFS_PROP_REFRESERVATION;
else
*resv_prop = ZFS_PROP_RESERVATION;
return (0);
}
/*
* Given an nvlist of properties to set, validates that they are correct, and
* parses any numeric properties (index, boolean, etc) if they are specified as
* strings.
*/
nvlist_t *
zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl,
const char *errbuf)
{
nvpair_t *elem;
uint64_t intval;
char *strval;
zfs_prop_t prop;
nvlist_t *ret;
int chosen_normal = -1;
int chosen_utf = -1;
if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) {
(void) no_memory(hdl);
return (NULL);
}
/*
* Make sure this property is valid and applies to this type.
*/
elem = NULL;
while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
const char *propname = nvpair_name(elem);
prop = zfs_name_to_prop(propname);
if (prop == ZPROP_INVAL && zfs_prop_user(propname)) {
/*
* This is a user property: make sure it's a
* string, and that it's less than ZAP_MAXNAMELEN.
*/
if (nvpair_type(elem) != DATA_TYPE_STRING) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be a string"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property name '%s' is too long"),
propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
(void) nvpair_value_string(elem, &strval);
if (nvlist_add_string(ret, propname, strval) != 0) {
(void) no_memory(hdl);
goto error;
}
continue;
}
/*
* Currently, only user properties can be modified on
* snapshots.
*/
if (type == ZFS_TYPE_SNAPSHOT) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"this property can not be modified for snapshots"));
(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
goto error;
}
if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) {
zfs_userquota_prop_t uqtype;
char newpropname[128];
char domain[128];
uint64_t rid;
uint64_t valary[3];
if (userquota_propname_decode(propname, zoned,
&uqtype, domain, sizeof (domain), &rid) != 0) {
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN,
"'%s' has an invalid user/group name"),
propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (uqtype != ZFS_PROP_USERQUOTA &&
uqtype != ZFS_PROP_GROUPQUOTA) {
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "'%s' is readonly"),
propname);
(void) zfs_error(hdl, EZFS_PROPREADONLY,
errbuf);
goto error;
}
if (nvpair_type(elem) == DATA_TYPE_STRING) {
(void) nvpair_value_string(elem, &strval);
if (strcmp(strval, "none") == 0) {
intval = 0;
} else if (zfs_nicestrtonum(hdl,
strval, &intval) != 0) {
(void) zfs_error(hdl,
EZFS_BADPROP, errbuf);
goto error;
}
} else if (nvpair_type(elem) ==
DATA_TYPE_UINT64) {
(void) nvpair_value_uint64(elem, &intval);
if (intval == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"use 'none' to disable "
"userquota/groupquota"));
goto error;
}
} else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be a number"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
/*
* Encode the prop name as
* userquota@<hex-rid>-domain, to make it easy
* for the kernel to decode.
*/
(void) snprintf(newpropname, sizeof (newpropname),
"%s%llx-%s", zfs_userquota_prop_prefixes[uqtype],
(longlong_t)rid, domain);
valary[0] = uqtype;
valary[1] = rid;
valary[2] = intval;
if (nvlist_add_uint64_array(ret, newpropname,
valary, 3) != 0) {
(void) no_memory(hdl);
goto error;
}
continue;
} else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' is readonly"),
propname);
(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
goto error;
}
if (prop == ZPROP_INVAL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid property '%s'"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (!zfs_prop_valid_for_type(prop, type)) {
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "'%s' does not "
"apply to datasets of this type"), propname);
(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
goto error;
}
if (zfs_prop_readonly(prop) &&
(!zfs_prop_setonce(prop) || zhp != NULL)) {
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "'%s' is readonly"),
propname);
(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
goto error;
}
if (zprop_parse_value(hdl, elem, prop, type, ret,
&strval, &intval, errbuf) != 0)
goto error;
/*
* Perform some additional checks for specific properties.
*/
switch (prop) {
case ZFS_PROP_VERSION:
{
int version;
if (zhp == NULL)
break;
version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
if (intval < version) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Can not downgrade; already at version %u"),
version);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
break;
}
case ZFS_PROP_VOLBLOCKSIZE:
case ZFS_PROP_RECORDSIZE:
{
int maxbs = SPA_MAXBLOCKSIZE;
if (zpool_hdl != NULL) {
maxbs = zpool_get_prop_int(zpool_hdl,
ZPOOL_PROP_MAXBLOCKSIZE, NULL);
}
/*
* Volumes are limited to a volblocksize of 128KB,
* because they typically service workloads with
* small random writes, which incur a large performance
* penalty with large blocks.
*/
if (prop == ZFS_PROP_VOLBLOCKSIZE)
maxbs = SPA_OLD_MAXBLOCKSIZE;
/*
* The value must be a power of two between
* SPA_MINBLOCKSIZE and maxbs.
*/
if (intval < SPA_MINBLOCKSIZE ||
intval > maxbs || !ISP2(intval)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be power of 2 from 512B "
"to %uKB"), propname, maxbs >> 10);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
break;
}
case ZFS_PROP_MLSLABEL:
{
#ifdef illumos
/*
* Verify the mlslabel string and convert to
* internal hex label string.
*/
m_label_t *new_sl;
char *hex = NULL; /* internal label string */
/* Default value is already OK. */
if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
break;
/* Verify the label can be converted to binary form */
if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) ||
(str_to_label(strval, &new_sl, MAC_LABEL,
L_NO_CORRECTION, NULL) == -1)) {
goto badlabel;
}
/* Now translate to hex internal label string */
if (label_to_str(new_sl, &hex, M_INTERNAL,
DEF_NAMES) != 0) {
if (hex)
free(hex);
goto badlabel;
}
m_label_free(new_sl);
/* If string is already in internal form, we're done. */
if (strcmp(strval, hex) == 0) {
free(hex);
break;
}
/* Replace the label string with the internal form. */
(void) nvlist_remove(ret, zfs_prop_to_name(prop),
DATA_TYPE_STRING);
verify(nvlist_add_string(ret, zfs_prop_to_name(prop),
hex) == 0);
free(hex);
break;
badlabel:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid mlslabel '%s'"), strval);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
m_label_free(new_sl); /* OK if null */
#else /* !illumos */
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"mlslabel is not supported on FreeBSD"));
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
#endif /* illumos */
goto error;
}
case ZFS_PROP_MOUNTPOINT:
{
namecheck_err_t why;
if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 ||
strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0)
break;
if (mountpoint_namecheck(strval, &why)) {
switch (why) {
case NAME_ERR_LEADING_SLASH:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN,
"'%s' must be an absolute path, "
"'none', or 'legacy'"), propname);
break;
case NAME_ERR_TOOLONG:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN,
"component of '%s' is too long"),
propname);
break;
default:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN,
"(%d) not defined"),
why);
break;
}
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
}
/*FALLTHRU*/
case ZFS_PROP_SHARESMB:
case ZFS_PROP_SHARENFS:
/*
* For the mountpoint and sharenfs or sharesmb
* properties, check if it can be set in a
* global/non-global zone based on
* the zoned property value:
*
* global zone non-global zone
* --------------------------------------------------
* zoned=on mountpoint (no) mountpoint (yes)
* sharenfs (no) sharenfs (no)
* sharesmb (no) sharesmb (no)
*
* zoned=off mountpoint (yes) N/A
* sharenfs (yes)
* sharesmb (yes)
*/
if (zoned) {
if (getzoneid() == GLOBAL_ZONEID) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' cannot be set on "
"dataset in a non-global zone"),
propname);
(void) zfs_error(hdl, EZFS_ZONED,
errbuf);
goto error;
} else if (prop == ZFS_PROP_SHARENFS ||
prop == ZFS_PROP_SHARESMB) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' cannot be set in "
"a non-global zone"), propname);
(void) zfs_error(hdl, EZFS_ZONED,
errbuf);
goto error;
}
} else if (getzoneid() != GLOBAL_ZONEID) {
/*
* If zoned property is 'off', this must be in
* a global zone. If not, something is wrong.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' cannot be set while dataset "
"'zoned' property is set"), propname);
(void) zfs_error(hdl, EZFS_ZONED, errbuf);
goto error;
}
/*
* At this point, it is legitimate to set the
* property. Now we want to make sure that the
* property value is valid if it is sharenfs.
*/
if ((prop == ZFS_PROP_SHARENFS ||
prop == ZFS_PROP_SHARESMB) &&
strcmp(strval, "on") != 0 &&
strcmp(strval, "off") != 0) {
zfs_share_proto_t proto;
if (prop == ZFS_PROP_SHARESMB)
proto = PROTO_SMB;
else
proto = PROTO_NFS;
/*
* Must be an valid sharing protocol
* option string so init the libshare
* in order to enable the parser and
* then parse the options. We use the
* control API since we don't care about
* the current configuration and don't
* want the overhead of loading it
* until we actually do something.
*/
if (zfs_init_libshare(hdl,
SA_INIT_CONTROL_API) != SA_OK) {
/*
* An error occurred so we can't do
* anything
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' cannot be set: problem "
"in share initialization"),
propname);
(void) zfs_error(hdl, EZFS_BADPROP,
errbuf);
goto error;
}
if (zfs_parse_options(strval, proto) != SA_OK) {
/*
* There was an error in parsing so
* deal with it by issuing an error
* message and leaving after
* uninitializing the the libshare
* interface.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' cannot be set to invalid "
"options"), propname);
(void) zfs_error(hdl, EZFS_BADPROP,
errbuf);
zfs_uninit_libshare(hdl);
goto error;
}
zfs_uninit_libshare(hdl);
}
break;
case ZFS_PROP_UTF8ONLY:
chosen_utf = (int)intval;
break;
case ZFS_PROP_NORMALIZE:
chosen_normal = (int)intval;
break;
default:
break;
}
/*
* For changes to existing volumes, we have some additional
* checks to enforce.
*/
if (type == ZFS_TYPE_VOLUME && zhp != NULL) {
uint64_t volsize = zfs_prop_get_int(zhp,
ZFS_PROP_VOLSIZE);
uint64_t blocksize = zfs_prop_get_int(zhp,
ZFS_PROP_VOLBLOCKSIZE);
char buf[64];
switch (prop) {
case ZFS_PROP_RESERVATION:
case ZFS_PROP_REFRESERVATION:
if (intval > volsize) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' is greater than current "
"volume size"), propname);
(void) zfs_error(hdl, EZFS_BADPROP,
errbuf);
goto error;
}
break;
case ZFS_PROP_VOLSIZE:
if (intval % blocksize != 0) {
zfs_nicenum(blocksize, buf,
sizeof (buf));
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be a multiple of "
"volume block size (%s)"),
propname, buf);
(void) zfs_error(hdl, EZFS_BADPROP,
errbuf);
goto error;
}
if (intval == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' cannot be zero"),
propname);
(void) zfs_error(hdl, EZFS_BADPROP,
errbuf);
goto error;
}
break;
default:
break;
}
}
}
/*
* If normalization was chosen, but no UTF8 choice was made,
* enforce rejection of non-UTF8 names.
*
* If normalization was chosen, but rejecting non-UTF8 names
* was explicitly not chosen, it is an error.
*/
if (chosen_normal > 0 && chosen_utf < 0) {
if (nvlist_add_uint64(ret,
zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) {
(void) no_memory(hdl);
goto error;
}
} else if (chosen_normal > 0 && chosen_utf == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be set 'on' if normalization chosen"),
zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
return (ret);
error:
nvlist_free(ret);
return (NULL);
}
int
zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
{
uint64_t old_volsize;
uint64_t new_volsize;
uint64_t old_reservation;
uint64_t new_reservation;
zfs_prop_t resv_prop;
nvlist_t *props;
/*
* If this is an existing volume, and someone is setting the volsize,
* make sure that it matches the reservation, or add it if necessary.
*/
old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
return (-1);
old_reservation = zfs_prop_get_int(zhp, resv_prop);
props = fnvlist_alloc();
fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
if ((zvol_volsize_to_reservation(old_volsize, props) !=
old_reservation) || nvlist_exists(nvl,
zfs_prop_to_name(resv_prop))) {
fnvlist_free(props);
return (0);
}
if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
&new_volsize) != 0) {
fnvlist_free(props);
return (-1);
}
new_reservation = zvol_volsize_to_reservation(new_volsize, props);
fnvlist_free(props);
if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
new_reservation) != 0) {
(void) no_memory(zhp->zfs_hdl);
return (-1);
}
return (1);
}
void
zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
char *errbuf)
{
switch (err) {
case ENOSPC:
/*
* For quotas and reservations, ENOSPC indicates
* something different; setting a quota or reservation
* doesn't use any disk space.
*/
switch (prop) {
case ZFS_PROP_QUOTA:
case ZFS_PROP_REFQUOTA:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"size is less than current used or "
"reserved space"));
(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
break;
case ZFS_PROP_RESERVATION:
case ZFS_PROP_REFRESERVATION:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"size is greater than available space"));
(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
break;
default:
(void) zfs_standard_error(hdl, err, errbuf);
break;
}
break;
case EBUSY:
(void) zfs_standard_error(hdl, EBUSY, errbuf);
break;
case EROFS:
(void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
break;
case E2BIG:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property value too long"));
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
break;
case ENOTSUP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool and or dataset must be upgraded to set this "
"property or value"));
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
break;
case ERANGE:
case EDOM:
if (prop == ZFS_PROP_COMPRESSION ||
prop == ZFS_PROP_RECORDSIZE) {
(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property setting is not allowed on "
"bootable datasets"));
(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
} else if (prop == ZFS_PROP_CHECKSUM ||
prop == ZFS_PROP_DEDUP) {
(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property setting is not allowed on "
"root pools"));
(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
} else {
(void) zfs_standard_error(hdl, err, errbuf);
}
break;
case EINVAL:
if (prop == ZPROP_INVAL) {
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
} else {
(void) zfs_standard_error(hdl, err, errbuf);
}
break;
case EOVERFLOW:
/*
* This platform can't address a volume this big.
*/
#ifdef _ILP32
if (prop == ZFS_PROP_VOLSIZE) {
(void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
break;
}
#endif
/* FALLTHROUGH */
default:
(void) zfs_standard_error(hdl, err, errbuf);
}
}
/*
* Given a property name and value, set the property for the given dataset.
*/
int
zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
{
int ret = -1;
char errbuf[1024];
libzfs_handle_t *hdl = zhp->zfs_hdl;
nvlist_t *nvl = NULL;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
zhp->zfs_name);
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 ||
nvlist_add_string(nvl, propname, propval) != 0) {
(void) no_memory(hdl);
goto error;
}
ret = zfs_prop_set_list(zhp, nvl);
error:
nvlist_free(nvl);
return (ret);
}
/*
* Given an nvlist of property names and values, set the properties for the
* given dataset.
*/
int
zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
{
zfs_cmd_t zc = { 0 };
int ret = -1;
prop_changelist_t **cls = NULL;
int cl_idx;
char errbuf[1024];
libzfs_handle_t *hdl = zhp->zfs_hdl;
nvlist_t *nvl;
int nvl_len;
int added_resv = 0;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
zhp->zfs_name);
if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props,
zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl,
errbuf)) == NULL)
goto error;
/*
* We have to check for any extra properties which need to be added
* before computing the length of the nvlist.
*/
for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
elem != NULL;
elem = nvlist_next_nvpair(nvl, elem)) {
if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE &&
(added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) {
goto error;
}
}
/*
* Check how many properties we're setting and allocate an array to
* store changelist pointers for postfix().
*/
nvl_len = 0;
for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
elem != NULL;
elem = nvlist_next_nvpair(nvl, elem))
nvl_len++;
if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL)
goto error;
cl_idx = 0;
for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
elem != NULL;
elem = nvlist_next_nvpair(nvl, elem)) {
zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));
assert(cl_idx < nvl_len);
/*
* We don't want to unmount & remount the dataset when changing
* its canmount property to 'on' or 'noauto'. We only use
* the changelist logic to unmount when setting canmount=off.
*/
if (prop != ZFS_PROP_CANMOUNT ||
(fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF &&
zfs_is_mounted(zhp, NULL))) {
cls[cl_idx] = changelist_gather(zhp, prop, 0, 0);
if (cls[cl_idx] == NULL)
goto error;
}
if (prop == ZFS_PROP_MOUNTPOINT &&
changelist_haszonedchild(cls[cl_idx])) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"child dataset with inherited mountpoint is used "
"in a non-global zone"));
ret = zfs_error(hdl, EZFS_ZONED, errbuf);
goto error;
}
/* We don't support those properties on FreeBSD. */
switch (prop) {
case ZFS_PROP_DEVICES:
case ZFS_PROP_ISCSIOPTIONS:
case ZFS_PROP_XATTR:
case ZFS_PROP_VSCAN:
case ZFS_PROP_NBMAND:
case ZFS_PROP_MLSLABEL:
(void) snprintf(errbuf, sizeof (errbuf),
"property '%s' not supported on FreeBSD",
nvpair_name(elem));
ret = zfs_error(hdl, EZFS_PERM, errbuf);
goto error;
}
if (cls[cl_idx] != NULL &&
(ret = changelist_prefix(cls[cl_idx])) != 0)
goto error;
cl_idx++;
}
assert(cl_idx == nvl_len);
/*
* Execute the corresponding ioctl() to set this list of properties.
*/
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 ||
(ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0)
goto error;
ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
if (ret != 0) {
/* Get the list of unset properties back and report them. */
nvlist_t *errorprops = NULL;
if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0)
goto error;
for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
elem != NULL;
elem = nvlist_next_nvpair(nvl, elem)) {
zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));
zfs_setprop_error(hdl, prop, errno, errbuf);
}
nvlist_free(errorprops);
if (added_resv && errno == ENOSPC) {
/* clean up the volsize property we tried to set */
uint64_t old_volsize = zfs_prop_get_int(zhp,
ZFS_PROP_VOLSIZE);
nvlist_free(nvl);
nvl = NULL;
zcmd_free_nvlists(&zc);
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
goto error;
if (nvlist_add_uint64(nvl,
zfs_prop_to_name(ZFS_PROP_VOLSIZE),
old_volsize) != 0)
goto error;
if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0)
goto error;
(void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
}
} else {
for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
if (cls[cl_idx] != NULL) {
int clp_err = changelist_postfix(cls[cl_idx]);
if (clp_err != 0)
ret = clp_err;
}
}
/*
* Refresh the statistics so the new property value
* is reflected.
*/
if (ret == 0)
(void) get_stats(zhp);
}
error:
nvlist_free(nvl);
zcmd_free_nvlists(&zc);
if (cls != NULL) {
for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
if (cls[cl_idx] != NULL)
changelist_free(cls[cl_idx]);
}
free(cls);
}
return (ret);
}
/*
* Given a property, inherit the value from the parent dataset, or if received
* is TRUE, revert to the received value, if any.
*/
int
zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received)
{
zfs_cmd_t zc = { 0 };
int ret;
prop_changelist_t *cl;
libzfs_handle_t *hdl = zhp->zfs_hdl;
char errbuf[1024];
zfs_prop_t prop;
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot inherit %s for '%s'"), propname, zhp->zfs_name);
zc.zc_cookie = received;
if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) {
/*
* For user properties, the amount of work we have to do is very
* small, so just do it here.
*/
if (!zfs_prop_user(propname)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid property"));
return (zfs_error(hdl, EZFS_BADPROP, errbuf));
}
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0)
return (zfs_standard_error(hdl, errno, errbuf));
return (0);
}
/*
* Verify that this property is inheritable.
*/
if (zfs_prop_readonly(prop))
return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf));
if (!zfs_prop_inheritable(prop) && !received)
return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf));
/*
* Check to see if the value applies to this type
*/
if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));
/*
* Normalize the name, to get rid of shorthand abbreviations.
*/
propname = zfs_prop_to_name(prop);
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID &&
zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"dataset is used in a non-global zone"));
return (zfs_error(hdl, EZFS_ZONED, errbuf));
}
/*
* Determine datasets which will be affected by this change, if any.
*/
if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL)
return (-1);
if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"child dataset with inherited mountpoint is used "
"in a non-global zone"));
ret = zfs_error(hdl, EZFS_ZONED, errbuf);
goto error;
}
if ((ret = changelist_prefix(cl)) != 0)
goto error;
if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) {
return (zfs_standard_error(hdl, errno, errbuf));
} else {
if ((ret = changelist_postfix(cl)) != 0)
goto error;
/*
* Refresh the statistics so the new property is reflected.
*/
(void) get_stats(zhp);
}
error:
changelist_free(cl);
return (ret);
}
/*
* True DSL properties are stored in an nvlist. The following two functions
* extract them appropriately.
*/
static uint64_t
getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
{
nvlist_t *nv;
uint64_t value;
*source = NULL;
if (nvlist_lookup_nvlist(zhp->zfs_props,
zfs_prop_to_name(prop), &nv) == 0) {
verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0);
(void) nvlist_lookup_string(nv, ZPROP_SOURCE, source);
} else {
verify(!zhp->zfs_props_table ||
zhp->zfs_props_table[prop] == B_TRUE);
value = zfs_prop_default_numeric(prop);
*source = "";
}
return (value);
}
static const char *
getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
{
nvlist_t *nv;
const char *value;
*source = NULL;
if (nvlist_lookup_nvlist(zhp->zfs_props,
zfs_prop_to_name(prop), &nv) == 0) {
value = fnvlist_lookup_string(nv, ZPROP_VALUE);
(void) nvlist_lookup_string(nv, ZPROP_SOURCE, source);
} else {
verify(!zhp->zfs_props_table ||
zhp->zfs_props_table[prop] == B_TRUE);
value = zfs_prop_default_string(prop);
*source = "";
}
return (value);
}
static boolean_t
zfs_is_recvd_props_mode(zfs_handle_t *zhp)
{
return (zhp->zfs_props == zhp->zfs_recvd_props);
}
static void
zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
{
*cookie = (uint64_t)(uintptr_t)zhp->zfs_props;
zhp->zfs_props = zhp->zfs_recvd_props;
}
static void
zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
{
zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie;
*cookie = 0;
}
/*
* Internal function for getting a numeric property. Both zfs_prop_get() and
* zfs_prop_get_int() are built using this interface.
*
* Certain properties can be overridden using 'mount -o'. In this case, scan
* the contents of the /etc/mnttab entry, searching for the appropriate options.
* If they differ from the on-disk values, report the current values and mark
* the source "temporary".
*/
static int
get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
char **source, uint64_t *val)
{
zfs_cmd_t zc = { 0 };
nvlist_t *zplprops = NULL;
struct mnttab mnt;
char *mntopt_on = NULL;
char *mntopt_off = NULL;
boolean_t received = zfs_is_recvd_props_mode(zhp);
*source = NULL;
switch (prop) {
case ZFS_PROP_ATIME:
mntopt_on = MNTOPT_ATIME;
mntopt_off = MNTOPT_NOATIME;
break;
case ZFS_PROP_DEVICES:
mntopt_on = MNTOPT_DEVICES;
mntopt_off = MNTOPT_NODEVICES;
break;
case ZFS_PROP_EXEC:
mntopt_on = MNTOPT_EXEC;
mntopt_off = MNTOPT_NOEXEC;
break;
case ZFS_PROP_READONLY:
mntopt_on = MNTOPT_RO;
mntopt_off = MNTOPT_RW;
break;
case ZFS_PROP_SETUID:
mntopt_on = MNTOPT_SETUID;
mntopt_off = MNTOPT_NOSETUID;
break;
case ZFS_PROP_XATTR:
mntopt_on = MNTOPT_XATTR;
mntopt_off = MNTOPT_NOXATTR;
break;
case ZFS_PROP_NBMAND:
mntopt_on = MNTOPT_NBMAND;
mntopt_off = MNTOPT_NONBMAND;
break;
default:
break;
}
/*
* Because looking up the mount options is potentially expensive
* (iterating over all of /etc/mnttab), we defer its calculation until
* we're looking up a property which requires its presence.
*/
if (!zhp->zfs_mntcheck &&
(mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) {
libzfs_handle_t *hdl = zhp->zfs_hdl;
struct mnttab entry;
if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) {
zhp->zfs_mntopts = zfs_strdup(hdl,
entry.mnt_mntopts);
if (zhp->zfs_mntopts == NULL)
return (-1);
}
zhp->zfs_mntcheck = B_TRUE;
}
if (zhp->zfs_mntopts == NULL)
mnt.mnt_mntopts = "";
else
mnt.mnt_mntopts = zhp->zfs_mntopts;
switch (prop) {
case ZFS_PROP_ATIME:
case ZFS_PROP_DEVICES:
case ZFS_PROP_EXEC:
case ZFS_PROP_READONLY:
case ZFS_PROP_SETUID:
case ZFS_PROP_XATTR:
case ZFS_PROP_NBMAND:
*val = getprop_uint64(zhp, prop, source);
if (received)
break;
if (hasmntopt(&mnt, mntopt_on) && !*val) {
*val = B_TRUE;
if (src)
*src = ZPROP_SRC_TEMPORARY;
} else if (hasmntopt(&mnt, mntopt_off) && *val) {
*val = B_FALSE;
if (src)
*src = ZPROP_SRC_TEMPORARY;
}
break;
case ZFS_PROP_CANMOUNT:
case ZFS_PROP_VOLSIZE:
case ZFS_PROP_QUOTA:
case ZFS_PROP_REFQUOTA:
case ZFS_PROP_RESERVATION:
case ZFS_PROP_REFRESERVATION:
case ZFS_PROP_FILESYSTEM_LIMIT:
case ZFS_PROP_SNAPSHOT_LIMIT:
case ZFS_PROP_FILESYSTEM_COUNT:
case ZFS_PROP_SNAPSHOT_COUNT:
*val = getprop_uint64(zhp, prop, source);
if (*source == NULL) {
/* not default, must be local */
*source = zhp->zfs_name;
}
break;
case ZFS_PROP_MOUNTED:
*val = (zhp->zfs_mntopts != NULL);
break;
case ZFS_PROP_NUMCLONES:
*val = zhp->zfs_dmustats.dds_num_clones;
break;
case ZFS_PROP_VERSION:
case ZFS_PROP_NORMALIZE:
case ZFS_PROP_UTF8ONLY:
case ZFS_PROP_CASE:
if (!zfs_prop_valid_for_type(prop, zhp->zfs_head_type) ||
zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
return (-1);
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) {
zcmd_free_nvlists(&zc);
return (-1);
}
if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 ||
nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop),
val) != 0) {
zcmd_free_nvlists(&zc);
return (-1);
}
nvlist_free(zplprops);
zcmd_free_nvlists(&zc);
break;
case ZFS_PROP_INCONSISTENT:
*val = zhp->zfs_dmustats.dds_inconsistent;
break;
default:
switch (zfs_prop_get_type(prop)) {
case PROP_TYPE_NUMBER:
case PROP_TYPE_INDEX:
*val = getprop_uint64(zhp, prop, source);
/*
* If we tried to use a default value for a
* readonly property, it means that it was not
* present. Note this only applies to "truly"
* readonly properties, not set-once properties
* like volblocksize.
*/
if (zfs_prop_readonly(prop) &&
!zfs_prop_setonce(prop) &&
*source != NULL && (*source)[0] == '\0') {
*source = NULL;
return (-1);
}
break;
case PROP_TYPE_STRING:
default:
zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
"cannot get non-numeric property"));
return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP,
dgettext(TEXT_DOMAIN, "internal error")));
}
}
return (0);
}
/*
* Calculate the source type, given the raw source string.
*/
static void
get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source,
char *statbuf, size_t statlen)
{
if (statbuf == NULL || *srctype == ZPROP_SRC_TEMPORARY)
return;
if (source == NULL) {
*srctype = ZPROP_SRC_NONE;
} else if (source[0] == '\0') {
*srctype = ZPROP_SRC_DEFAULT;
} else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) {
*srctype = ZPROP_SRC_RECEIVED;
} else {
if (strcmp(source, zhp->zfs_name) == 0) {
*srctype = ZPROP_SRC_LOCAL;
} else {
(void) strlcpy(statbuf, source, statlen);
*srctype = ZPROP_SRC_INHERITED;
}
}
}
int
zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf,
size_t proplen, boolean_t literal)
{
zfs_prop_t prop;
int err = 0;
if (zhp->zfs_recvd_props == NULL)
if (get_recvd_props_ioctl(zhp) != 0)
return (-1);
prop = zfs_name_to_prop(propname);
if (prop != ZPROP_INVAL) {
uint64_t cookie;
if (!nvlist_exists(zhp->zfs_recvd_props, propname))
return (-1);
zfs_set_recvd_props_mode(zhp, &cookie);
err = zfs_prop_get(zhp, prop, propbuf, proplen,
NULL, NULL, 0, literal);
zfs_unset_recvd_props_mode(zhp, &cookie);
} else {
nvlist_t *propval;
char *recvdval;
if (nvlist_lookup_nvlist(zhp->zfs_recvd_props,
propname, &propval) != 0)
return (-1);
verify(nvlist_lookup_string(propval, ZPROP_VALUE,
&recvdval) == 0);
(void) strlcpy(propbuf, recvdval, proplen);
}
return (err == 0 ? 0 : -1);
}
static int
get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen)
{
nvlist_t *value;
nvpair_t *pair;
value = zfs_get_clones_nvl(zhp);
if (value == NULL)
return (-1);
propbuf[0] = '\0';
for (pair = nvlist_next_nvpair(value, NULL); pair != NULL;
pair = nvlist_next_nvpair(value, pair)) {
if (propbuf[0] != '\0')
(void) strlcat(propbuf, ",", proplen);
(void) strlcat(propbuf, nvpair_name(pair), proplen);
}
return (0);
}
struct get_clones_arg {
uint64_t numclones;
nvlist_t *value;
const char *origin;
char buf[ZFS_MAX_DATASET_NAME_LEN];
};
int
get_clones_cb(zfs_handle_t *zhp, void *arg)
{
struct get_clones_arg *gca = arg;
if (gca->numclones == 0) {
zfs_close(zhp);
return (0);
}
if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf),
NULL, NULL, 0, B_TRUE) != 0)
goto out;
if (strcmp(gca->buf, gca->origin) == 0) {
fnvlist_add_boolean(gca->value, zfs_get_name(zhp));
gca->numclones--;
}
out:
(void) zfs_iter_children(zhp, get_clones_cb, gca);
zfs_close(zhp);
return (0);
}
nvlist_t *
zfs_get_clones_nvl(zfs_handle_t *zhp)
{
nvlist_t *nv, *value;
if (nvlist_lookup_nvlist(zhp->zfs_props,
zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) {
struct get_clones_arg gca;
/*
* if this is a snapshot, then the kernel wasn't able
* to get the clones. Do it by slowly iterating.
*/
if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT)
return (NULL);
if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0)
return (NULL);
if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) {
nvlist_free(nv);
return (NULL);
}
gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES);
gca.value = value;
gca.origin = zhp->zfs_name;
if (gca.numclones != 0) {
zfs_handle_t *root;
char pool[ZFS_MAX_DATASET_NAME_LEN];
char *cp = pool;
/* get the pool name */
(void) strlcpy(pool, zhp->zfs_name, sizeof (pool));
(void) strsep(&cp, "/@");
root = zfs_open(zhp->zfs_hdl, pool,
ZFS_TYPE_FILESYSTEM);
(void) get_clones_cb(root, &gca);
}
if (gca.numclones != 0 ||
nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 ||
nvlist_add_nvlist(zhp->zfs_props,
zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) {
nvlist_free(nv);
nvlist_free(value);
return (NULL);
}
nvlist_free(nv);
nvlist_free(value);
verify(0 == nvlist_lookup_nvlist(zhp->zfs_props,
zfs_prop_to_name(ZFS_PROP_CLONES), &nv));
}
verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0);
return (value);
}
/*
* Accepts a property and value and checks that the value
* matches the one found by the channel program. If they are
* not equal, print both of them.
*/
void
zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval,
const char *strval)
{
if (!zhp->zfs_hdl->libzfs_prop_debug)
return;
int error;
char *poolname = zhp->zpool_hdl->zpool_name;
const char *program =
"args = ...\n"
"ds = args['dataset']\n"
"prop = args['property']\n"
"value, setpoint = zfs.get_prop(ds, prop)\n"
"return {value=value, setpoint=setpoint}\n";
nvlist_t *outnvl;
nvlist_t *retnvl;
nvlist_t *argnvl = fnvlist_alloc();
fnvlist_add_string(argnvl, "dataset", zhp->zfs_name);
fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop));
error = lzc_channel_program_nosync(poolname, program,
10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl);
if (error == 0) {
retnvl = fnvlist_lookup_nvlist(outnvl, "return");
if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) {
int64_t ans;
error = nvlist_lookup_int64(retnvl, "value", &ans);
if (error != 0) {
(void) fprintf(stderr, "zcp check error: %u\n",
error);
return;
}
if (ans != intval) {
(void) fprintf(stderr,
"%s: zfs found %lld, but zcp found %lld\n",
zfs_prop_to_name(prop),
(longlong_t)intval, (longlong_t)ans);
}
} else {
char *str_ans;
error = nvlist_lookup_string(retnvl, "value", &str_ans);
if (error != 0) {
(void) fprintf(stderr, "zcp check error: %u\n",
error);
return;
}
if (strcmp(strval, str_ans) != 0) {
(void) fprintf(stderr,
"%s: zfs found %s, but zcp found %s\n",
zfs_prop_to_name(prop),
strval, str_ans);
}
}
} else {
(void) fprintf(stderr,
"zcp check failed, channel program error: %u\n", error);
}
nvlist_free(argnvl);
nvlist_free(outnvl);
}
/*
* Retrieve a property from the given object. If 'literal' is specified, then
* numbers are left as exact values. Otherwise, numbers are converted to a
* human-readable form.
*
* Returns 0 on success, or -1 on error.
*/
int
zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal)
{
char *source = NULL;
uint64_t val;
const char *str;
const char *strval;
boolean_t received = zfs_is_recvd_props_mode(zhp);
/*
* Check to see if this property applies to our object
*/
if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
return (-1);
if (received && zfs_prop_readonly(prop))
return (-1);
if (src)
*src = ZPROP_SRC_NONE;
switch (prop) {
case ZFS_PROP_CREATION:
/*
* 'creation' is a time_t stored in the statistics. We convert
* this into a string unless 'literal' is specified.
*/
{
val = getprop_uint64(zhp, prop, &source);
time_t time = (time_t)val;
struct tm t;
if (literal ||
localtime_r(&time, &t) == NULL ||
strftime(propbuf, proplen, "%a %b %e %k:%M %Y",
&t) == 0)
(void) snprintf(propbuf, proplen, "%llu", val);
}
zcp_check(zhp, prop, val, NULL);
break;
case ZFS_PROP_MOUNTPOINT:
/*
* Getting the precise mountpoint can be tricky.
*
* - for 'none' or 'legacy', return those values.
* - for inherited mountpoints, we want to take everything
* after our ancestor and append it to the inherited value.
*
* If the pool has an alternate root, we want to prepend that
* root to any values we return.
*/
str = getprop_string(zhp, prop, &source);
if (str[0] == '/') {
char buf[MAXPATHLEN];
char *root = buf;
const char *relpath;
/*
* If we inherit the mountpoint, even from a dataset
* with a received value, the source will be the path of
* the dataset we inherit from. If source is
* ZPROP_SOURCE_VAL_RECVD, the received value is not
* inherited.
*/
if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
relpath = "";
} else {
relpath = zhp->zfs_name + strlen(source);
if (relpath[0] == '/')
relpath++;
}
if ((zpool_get_prop(zhp->zpool_hdl,
ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL,
B_FALSE)) || (strcmp(root, "-") == 0))
root[0] = '\0';
/*
* Special case an alternate root of '/'. This will
* avoid having multiple leading slashes in the
* mountpoint path.
*/
if (strcmp(root, "/") == 0)
root++;
/*
* If the mountpoint is '/' then skip over this
* if we are obtaining either an alternate root or
* an inherited mountpoint.
*/
if (str[1] == '\0' && (root[0] != '\0' ||
relpath[0] != '\0'))
str++;
if (relpath[0] == '\0')
(void) snprintf(propbuf, proplen, "%s%s",
root, str);
else
(void) snprintf(propbuf, proplen, "%s%s%s%s",
root, str, relpath[0] == '@' ? "" : "/",
relpath);
} else {
/* 'legacy' or 'none' */
(void) strlcpy(propbuf, str, proplen);
}
zcp_check(zhp, prop, NULL, propbuf);
break;
case ZFS_PROP_ORIGIN:
str = getprop_string(zhp, prop, &source);
if (str == NULL)
return (-1);
(void) strlcpy(propbuf, str, proplen);
zcp_check(zhp, prop, NULL, str);
break;
case ZFS_PROP_CLONES:
if (get_clones_string(zhp, propbuf, proplen) != 0)
return (-1);
break;
case ZFS_PROP_QUOTA:
case ZFS_PROP_REFQUOTA:
case ZFS_PROP_RESERVATION:
case ZFS_PROP_REFRESERVATION:
if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
return (-1);
/*
* If quota or reservation is 0, we translate this into 'none'
* (unless literal is set), and indicate that it's the default
* value. Otherwise, we print the number nicely and indicate
* that its set locally.
*/
if (val == 0) {
if (literal)
(void) strlcpy(propbuf, "0", proplen);
else
(void) strlcpy(propbuf, "none", proplen);
} else {
if (literal)
(void) snprintf(propbuf, proplen, "%llu",
(u_longlong_t)val);
else
zfs_nicenum(val, propbuf, proplen);
}
zcp_check(zhp, prop, val, NULL);
break;
case ZFS_PROP_FILESYSTEM_LIMIT:
case ZFS_PROP_SNAPSHOT_LIMIT:
case ZFS_PROP_FILESYSTEM_COUNT:
case ZFS_PROP_SNAPSHOT_COUNT:
if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
return (-1);
/*
* If limit is UINT64_MAX, we translate this into 'none' (unless
* literal is set), and indicate that it's the default value.
* Otherwise, we print the number nicely and indicate that it's
* set locally.
*/
if (literal) {
(void) snprintf(propbuf, proplen, "%llu",
(u_longlong_t)val);
} else if (val == UINT64_MAX) {
(void) strlcpy(propbuf, "none", proplen);
} else {
zfs_nicenum(val, propbuf, proplen);
}
zcp_check(zhp, prop, val, NULL);
break;
case ZFS_PROP_REFRATIO:
case ZFS_PROP_COMPRESSRATIO:
if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
return (-1);
(void) snprintf(propbuf, proplen, "%llu.%02llux",
(u_longlong_t)(val / 100),
(u_longlong_t)(val % 100));
zcp_check(zhp, prop, val, NULL);
break;
case ZFS_PROP_TYPE:
switch (zhp->zfs_type) {
case ZFS_TYPE_FILESYSTEM:
str = "filesystem";
break;
case ZFS_TYPE_VOLUME:
str = "volume";
break;
case ZFS_TYPE_SNAPSHOT:
str = "snapshot";
break;
case ZFS_TYPE_BOOKMARK:
str = "bookmark";
break;
default:
abort();
}
(void) snprintf(propbuf, proplen, "%s", str);
zcp_check(zhp, prop, NULL, propbuf);
break;
case ZFS_PROP_MOUNTED:
/*
* The 'mounted' property is a pseudo-property that described
* whether the filesystem is currently mounted. Even though
* it's a boolean value, the typical values of "on" and "off"
* don't make sense, so we translate to "yes" and "no".
*/
if (get_numeric_property(zhp, ZFS_PROP_MOUNTED,
src, &source, &val) != 0)
return (-1);
if (val)
(void) strlcpy(propbuf, "yes", proplen);
else
(void) strlcpy(propbuf, "no", proplen);
break;
case ZFS_PROP_NAME:
/*
* The 'name' property is a pseudo-property derived from the
* dataset name. It is presented as a real property to simplify
* consumers.
*/
(void) strlcpy(propbuf, zhp->zfs_name, proplen);
zcp_check(zhp, prop, NULL, propbuf);
break;
case ZFS_PROP_MLSLABEL:
{
#ifdef illumos
m_label_t *new_sl = NULL;
char *ascii = NULL; /* human readable label */
(void) strlcpy(propbuf,
getprop_string(zhp, prop, &source), proplen);
if (literal || (strcasecmp(propbuf,
ZFS_MLSLABEL_DEFAULT) == 0))
break;
/*
* Try to translate the internal hex string to
* human-readable output. If there are any
* problems just use the hex string.
*/
if (str_to_label(propbuf, &new_sl, MAC_LABEL,
L_NO_CORRECTION, NULL) == -1) {
m_label_free(new_sl);
break;
}
if (label_to_str(new_sl, &ascii, M_LABEL,
DEF_NAMES) != 0) {
if (ascii)
free(ascii);
m_label_free(new_sl);
break;
}
m_label_free(new_sl);
(void) strlcpy(propbuf, ascii, proplen);
free(ascii);
#else /* !illumos */
propbuf[0] = '\0';
#endif /* illumos */
}
break;
case ZFS_PROP_GUID:
/*
* GUIDs are stored as numbers, but they are identifiers.
* We don't want them to be pretty printed, because pretty
* printing mangles the ID into a truncated and useless value.
*/
if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
return (-1);
(void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val);
zcp_check(zhp, prop, val, NULL);
break;
default:
switch (zfs_prop_get_type(prop)) {
case PROP_TYPE_NUMBER:
if (get_numeric_property(zhp, prop, src,
&source, &val) != 0) {
return (-1);
}
if (literal) {
(void) snprintf(propbuf, proplen, "%llu",
(u_longlong_t)val);
} else {
zfs_nicenum(val, propbuf, proplen);
}
zcp_check(zhp, prop, val, NULL);
break;
case PROP_TYPE_STRING:
str = getprop_string(zhp, prop, &source);
if (str == NULL)
return (-1);
(void) strlcpy(propbuf, str, proplen);
zcp_check(zhp, prop, NULL, str);
break;
case PROP_TYPE_INDEX:
if (get_numeric_property(zhp, prop, src,
&source, &val) != 0)
return (-1);
if (zfs_prop_index_to_string(prop, val, &strval) != 0)
return (-1);
(void) strlcpy(propbuf, strval, proplen);
zcp_check(zhp, prop, NULL, strval);
break;
default:
abort();
}
}
get_source(zhp, src, source, statbuf, statlen);
return (0);
}
/*
* Utility function to get the given numeric property. Does no validation that
* the given property is the appropriate type; should only be used with
* hard-coded property types.
*/
uint64_t
zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop)
{
char *source;
uint64_t val;
(void) get_numeric_property(zhp, prop, NULL, &source, &val);
return (val);
}
int
zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val)
{
char buf[64];
(void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val);
return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf));
}
/*
* Similar to zfs_prop_get(), but returns the value as an integer.
*/
int
zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value,
zprop_source_t *src, char *statbuf, size_t statlen)
{
char *source;
/*
* Check to see if this property applies to our object
*/
if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) {
return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE,
dgettext(TEXT_DOMAIN, "cannot get property '%s'"),
zfs_prop_to_name(prop)));
}
if (src)
*src = ZPROP_SRC_NONE;
if (get_numeric_property(zhp, prop, src, &source, value) != 0)
return (-1);
get_source(zhp, src, source, statbuf, statlen);
return (0);
}
static int
idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
char **domainp, idmap_rid_t *ridp)
{
#ifdef illumos
idmap_get_handle_t *get_hdl = NULL;
idmap_stat status;
int err = EINVAL;
if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS)
goto out;
if (isuser) {
err = idmap_get_sidbyuid(get_hdl, id,
IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status);
} else {
err = idmap_get_sidbygid(get_hdl, id,
IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status);
}
if (err == IDMAP_SUCCESS &&
idmap_get_mappings(get_hdl) == IDMAP_SUCCESS &&
status == IDMAP_SUCCESS)
err = 0;
else
err = EINVAL;
out:
if (get_hdl)
idmap_get_destroy(get_hdl);
return (err);
#else /* !illumos */
assert(!"invalid code path");
return (EINVAL); // silence compiler warning
#endif /* illumos */
}
/*
* convert the propname into parameters needed by kernel
* Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829
* Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789
*/
static int
userquota_propname_decode(const char *propname, boolean_t zoned,
zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp)
{
zfs_userquota_prop_t type;
char *cp, *end;
char *numericsid = NULL;
boolean_t isuser;
domain[0] = '\0';
*ridp = 0;
/* Figure out the property type ({user|group}{quota|space}) */
for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
if (strncmp(propname, zfs_userquota_prop_prefixes[type],
strlen(zfs_userquota_prop_prefixes[type])) == 0)
break;
}
if (type == ZFS_NUM_USERQUOTA_PROPS)
return (EINVAL);
*typep = type;
isuser = (type == ZFS_PROP_USERQUOTA ||
type == ZFS_PROP_USERUSED);
cp = strchr(propname, '@') + 1;
if (strchr(cp, '@')) {
#ifdef illumos
/*
* It's a SID name (eg "user@domain") that needs to be
* turned into S-1-domainID-RID.
*/
int flag = 0;
idmap_stat stat, map_stat;
uid_t pid;
idmap_rid_t rid;
idmap_get_handle_t *gh = NULL;
stat = idmap_get_create(&gh);
if (stat != IDMAP_SUCCESS) {
idmap_get_destroy(gh);
return (ENOMEM);
}
if (zoned && getzoneid() == GLOBAL_ZONEID)
return (ENOENT);
if (isuser) {
stat = idmap_getuidbywinname(cp, NULL, flag, &pid);
if (stat < 0)
return (ENOENT);
stat = idmap_get_sidbyuid(gh, pid, flag, &numericsid,
&rid, &map_stat);
} else {
stat = idmap_getgidbywinname(cp, NULL, flag, &pid);
if (stat < 0)
return (ENOENT);
stat = idmap_get_sidbygid(gh, pid, flag, &numericsid,
&rid, &map_stat);
}
if (stat < 0) {
idmap_get_destroy(gh);
return (ENOENT);
}
stat = idmap_get_mappings(gh);
idmap_get_destroy(gh);
if (stat < 0) {
return (ENOENT);
}
if (numericsid == NULL)
return (ENOENT);
cp = numericsid;
*ridp = rid;
/* will be further decoded below */
#else /* !illumos */
return (ENOENT);
#endif /* illumos */
}
if (strncmp(cp, "S-1-", 4) == 0) {
/* It's a numeric SID (eg "S-1-234-567-89") */
(void) strlcpy(domain, cp, domainlen);
errno = 0;
if (*ridp == 0) {
cp = strrchr(domain, '-');
*cp = '\0';
cp++;
*ridp = strtoull(cp, &end, 10);
} else {
end = "";
}
if (numericsid) {
free(numericsid);
numericsid = NULL;
}
if (errno != 0 || *end != '\0')
return (EINVAL);
} else if (!isdigit(*cp)) {
/*
* It's a user/group name (eg "user") that needs to be
* turned into a uid/gid
*/
if (zoned && getzoneid() == GLOBAL_ZONEID)
return (ENOENT);
if (isuser) {
struct passwd *pw;
pw = getpwnam(cp);
if (pw == NULL)
return (ENOENT);
*ridp = pw->pw_uid;
} else {
struct group *gr;
gr = getgrnam(cp);
if (gr == NULL)
return (ENOENT);
*ridp = gr->gr_gid;
}
} else {
/* It's a user/group ID (eg "12345"). */
uid_t id = strtoul(cp, &end, 10);
idmap_rid_t rid;
char *mapdomain;
if (*end != '\0')
return (EINVAL);
if (id > MAXUID) {
/* It's an ephemeral ID. */
if (idmap_id_to_numeric_domain_rid(id, isuser,
&mapdomain, &rid) != 0)
return (ENOENT);
(void) strlcpy(domain, mapdomain, domainlen);
*ridp = rid;
} else {
*ridp = id;
}
}
ASSERT3P(numericsid, ==, NULL);
return (0);
}
static int
zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname,
uint64_t *propvalue, zfs_userquota_prop_t *typep)
{
int err;
zfs_cmd_t zc = { 0 };
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
err = userquota_propname_decode(propname,
zfs_prop_get_int(zhp, ZFS_PROP_ZONED),
typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid);
zc.zc_objset_type = *typep;
if (err)
return (err);
err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc);
if (err)
return (err);
*propvalue = zc.zc_cookie;
return (0);
}
int
zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
uint64_t *propvalue)
{
zfs_userquota_prop_t type;
return (zfs_prop_get_userquota_common(zhp, propname, propvalue,
&type));
}
int
zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
char *propbuf, int proplen, boolean_t literal)
{
int err;
uint64_t propvalue;
zfs_userquota_prop_t type;
err = zfs_prop_get_userquota_common(zhp, propname, &propvalue,
&type);
if (err)
return (err);
if (literal) {
(void) snprintf(propbuf, proplen, "%llu", propvalue);
} else if (propvalue == 0 &&
(type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA)) {
(void) strlcpy(propbuf, "none", proplen);
} else {
zfs_nicenum(propvalue, propbuf, proplen);
}
return (0);
}
int
zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
uint64_t *propvalue)
{
int err;
zfs_cmd_t zc = { 0 };
const char *snapname;
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
snapname = strchr(propname, '@') + 1;
if (strchr(snapname, '@')) {
(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
} else {
/* snapname is the short name, append it to zhp's fsname */
char *cp;
(void) strlcpy(zc.zc_value, zhp->zfs_name,
sizeof (zc.zc_value));
cp = strchr(zc.zc_value, '@');
if (cp != NULL)
*cp = '\0';
(void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value));
(void) strlcat(zc.zc_value, snapname, sizeof (zc.zc_value));
}
err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc);
if (err)
return (err);
*propvalue = zc.zc_cookie;
return (0);
}
int
zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
char *propbuf, int proplen, boolean_t literal)
{
int err;
uint64_t propvalue;
err = zfs_prop_get_written_int(zhp, propname, &propvalue);
if (err)
return (err);
if (literal) {
(void) snprintf(propbuf, proplen, "%llu", propvalue);
} else {
zfs_nicenum(propvalue, propbuf, proplen);
}
return (0);
}
/*
* Returns the name of the given zfs handle.
*/
const char *
zfs_get_name(const zfs_handle_t *zhp)
{
return (zhp->zfs_name);
}
/*
* Returns the name of the parent pool for the given zfs handle.
*/
const char *
zfs_get_pool_name(const zfs_handle_t *zhp)
{
return (zhp->zpool_hdl->zpool_name);
}
/*
* Returns the type of the given zfs handle.
*/
zfs_type_t
zfs_get_type(const zfs_handle_t *zhp)
{
return (zhp->zfs_type);
}
/*
* Is one dataset name a child dataset of another?
*
* Needs to handle these cases:
* Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo"
* Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar"
* Descendant? No. No. No. Yes.
*/
static boolean_t
is_descendant(const char *ds1, const char *ds2)
{
size_t d1len = strlen(ds1);
/* ds2 can't be a descendant if it's smaller */
if (strlen(ds2) < d1len)
return (B_FALSE);
/* otherwise, compare strings and verify that there's a '/' char */
return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0));
}
/*
* Given a complete name, return just the portion that refers to the parent.
* Will return -1 if there is no parent (path is just the name of the
* pool).
*/
static int
parent_name(const char *path, char *buf, size_t buflen)
{
char *slashp;
(void) strlcpy(buf, path, buflen);
if ((slashp = strrchr(buf, '/')) == NULL)
return (-1);
*slashp = '\0';
return (0);
}
/*
* If accept_ancestor is false, then check to make sure that the given path has
* a parent, and that it exists. If accept_ancestor is true, then find the
* closest existing ancestor for the given path. In prefixlen return the
* length of already existing prefix of the given path. We also fetch the
* 'zoned' property, which is used to validate property settings when creating
* new datasets.
*/
static int
check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
boolean_t accept_ancestor, int *prefixlen)
{
zfs_cmd_t zc = { 0 };
char parent[ZFS_MAX_DATASET_NAME_LEN];
char *slash;
zfs_handle_t *zhp;
char errbuf[1024];
uint64_t is_zoned;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot create '%s'"), path);
/* get parent, and check to see if this is just a pool */
if (parent_name(path, parent, sizeof (parent)) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"missing dataset name"));
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
}
/* check to see if the pool exists */
if ((slash = strchr(parent, '/')) == NULL)
slash = parent + strlen(parent);
(void) strncpy(zc.zc_name, parent, slash - parent);
zc.zc_name[slash - parent] = '\0';
if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
errno == ENOENT) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"no such pool '%s'"), zc.zc_name);
return (zfs_error(hdl, EZFS_NOENT, errbuf));
}
/* check to see if the parent dataset exists */
while ((zhp = make_dataset_handle(hdl, parent)) == NULL) {
if (errno == ENOENT && accept_ancestor) {
/*
* Go deeper to find an ancestor, give up on top level.
*/
if (parent_name(parent, parent, sizeof (parent)) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"no such pool '%s'"), zc.zc_name);
return (zfs_error(hdl, EZFS_NOENT, errbuf));
}
} else if (errno == ENOENT) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"parent does not exist"));
return (zfs_error(hdl, EZFS_NOENT, errbuf));
} else
return (zfs_standard_error(hdl, errno, errbuf));
}
is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
if (zoned != NULL)
*zoned = is_zoned;
/* we are in a non-global zone, but parent is in the global zone */
if (getzoneid() != GLOBAL_ZONEID && !is_zoned) {
(void) zfs_standard_error(hdl, EPERM, errbuf);
zfs_close(zhp);
return (-1);
}
/* make sure parent is a filesystem */
if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"parent is not a filesystem"));
(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
zfs_close(zhp);
return (-1);
}
zfs_close(zhp);
if (prefixlen != NULL)
*prefixlen = strlen(parent);
return (0);
}
/*
* Finds whether the dataset of the given type(s) exists.
*/
boolean_t
zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types)
{
zfs_handle_t *zhp;
if (!zfs_validate_name(hdl, path, types, B_FALSE))
return (B_FALSE);
/*
* Try to get stats for the dataset, which will tell us if it exists.
*/
if ((zhp = make_dataset_handle(hdl, path)) != NULL) {
int ds_type = zhp->zfs_type;
zfs_close(zhp);
if (types & ds_type)
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Given a path to 'target', create all the ancestors between
* the prefixlen portion of the path, and the target itself.
* Fail if the initial prefixlen-ancestor does not already exist.
*/
int
create_parents(libzfs_handle_t *hdl, char *target, int prefixlen)
{
zfs_handle_t *h;
char *cp;
const char *opname;
/* make sure prefix exists */
cp = target + prefixlen;
if (*cp != '/') {
assert(strchr(cp, '/') == NULL);
h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
} else {
*cp = '\0';
h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
*cp = '/';
}
if (h == NULL)
return (-1);
zfs_close(h);
/*
* Attempt to create, mount, and share any ancestor filesystems,
* up to the prefixlen-long one.
*/
for (cp = target + prefixlen + 1;
(cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) {
*cp = '\0';
h = make_dataset_handle(hdl, target);
if (h) {
/* it already exists, nothing to do here */
zfs_close(h);
continue;
}
if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM,
NULL) != 0) {
opname = dgettext(TEXT_DOMAIN, "create");
goto ancestorerr;
}
h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
if (h == NULL) {
opname = dgettext(TEXT_DOMAIN, "open");
goto ancestorerr;
}
if (zfs_mount(h, NULL, 0) != 0) {
opname = dgettext(TEXT_DOMAIN, "mount");
goto ancestorerr;
}
if (zfs_share(h) != 0) {
opname = dgettext(TEXT_DOMAIN, "share");
goto ancestorerr;
}
zfs_close(h);
}
return (0);
ancestorerr:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"failed to %s ancestor '%s'"), opname, target);
return (-1);
}
/*
* Creates non-existing ancestors of the given path.
*/
int
zfs_create_ancestors(libzfs_handle_t *hdl, const char *path)
{
int prefix;
char *path_copy;
int rc = 0;
if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0)
return (-1);
if ((path_copy = strdup(path)) != NULL) {
rc = create_parents(hdl, path_copy, prefix);
free(path_copy);
}
if (path_copy == NULL || rc != 0)
return (-1);
return (0);
}
/*
* Create a new filesystem or volume.
*/
int
zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
nvlist_t *props)
{
int ret;
uint64_t size = 0;
uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
char errbuf[1024];
uint64_t zoned;
enum lzc_dataset_type ost;
zpool_handle_t *zpool_handle;
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot create '%s'"), path);
/* validate the path, taking care to note the extended error message */
if (!zfs_validate_name(hdl, path, type, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
/* validate parents exist */
if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0)
return (-1);
/*
* The failure modes when creating a dataset of a different type over
* one that already exists is a little strange. In particular, if you
* try to create a dataset on top of an existing dataset, the ioctl()
* will return ENOENT, not EEXIST. To prevent this from happening, we
* first try to see if the dataset exists.
*/
if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"dataset already exists"));
return (zfs_error(hdl, EZFS_EXISTS, errbuf));
}
if (type == ZFS_TYPE_VOLUME)
ost = LZC_DATSET_TYPE_ZVOL;
else
ost = LZC_DATSET_TYPE_ZFS;
/* open zpool handle for prop validation */
char pool_path[ZFS_MAX_DATASET_NAME_LEN];
(void) strlcpy(pool_path, path, sizeof (pool_path));
/* truncate pool_path at first slash */
char *p = strchr(pool_path, '/');
if (p != NULL)
*p = '\0';
if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL)
return (-1);
if (props && (props = zfs_valid_proplist(hdl, type, props,
zoned, NULL, zpool_handle, errbuf)) == 0) {
zpool_close(zpool_handle);
return (-1);
}
zpool_close(zpool_handle);
if (type == ZFS_TYPE_VOLUME) {
/*
* If we are creating a volume, the size and block size must
* satisfy a few restraints. First, the blocksize must be a
* valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the
* volsize must be a multiple of the block size, and cannot be
* zero.
*/
if (props == NULL || nvlist_lookup_uint64(props,
zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) {
nvlist_free(props);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"missing volume size"));
return (zfs_error(hdl, EZFS_BADPROP, errbuf));
}
if ((ret = nvlist_lookup_uint64(props,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
&blocksize)) != 0) {
if (ret == ENOENT) {
blocksize = zfs_prop_default_numeric(
ZFS_PROP_VOLBLOCKSIZE);
} else {
nvlist_free(props);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"missing volume block size"));
return (zfs_error(hdl, EZFS_BADPROP, errbuf));
}
}
if (size == 0) {
nvlist_free(props);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"volume size cannot be zero"));
return (zfs_error(hdl, EZFS_BADPROP, errbuf));
}
if (size % blocksize != 0) {
nvlist_free(props);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"volume size must be a multiple of volume block "
"size"));
return (zfs_error(hdl, EZFS_BADPROP, errbuf));
}
}
/* create the dataset */
ret = lzc_create(path, ost, props);
nvlist_free(props);
/* check for failure */
if (ret != 0) {
char parent[ZFS_MAX_DATASET_NAME_LEN];
(void) parent_name(path, parent, sizeof (parent));
switch (errno) {
case ENOENT:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"no such parent '%s'"), parent);
return (zfs_error(hdl, EZFS_NOENT, errbuf));
case EINVAL:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"parent '%s' is not a filesystem"), parent);
return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
case ENOTSUP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded to set this "
"property or value"));
return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
case ERANGE:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid property value(s) specified"));
return (zfs_error(hdl, EZFS_BADPROP, errbuf));
#ifdef _ILP32
case EOVERFLOW:
/*
* This platform can't address a volume this big.
*/
if (type == ZFS_TYPE_VOLUME)
return (zfs_error(hdl, EZFS_VOLTOOBIG,
errbuf));
#endif
/* FALLTHROUGH */
default:
return (zfs_standard_error(hdl, errno, errbuf));
}
}
return (0);
}
/*
* Destroys the given dataset. The caller must make sure that the filesystem
* isn't mounted, and that there are no active dependents. If the file system
* does not exist this function does nothing.
*/
int
zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
{
zfs_cmd_t zc = { 0 };
if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) {
nvlist_t *nv = fnvlist_alloc();
fnvlist_add_boolean(nv, zhp->zfs_name);
int error = lzc_destroy_bookmarks(nv, NULL);
fnvlist_free(nv);
if (error != 0) {
return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
zhp->zfs_name));
}
return (0);
}
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
if (ZFS_IS_VOLUME(zhp)) {
zc.zc_objset_type = DMU_OST_ZVOL;
} else {
zc.zc_objset_type = DMU_OST_ZFS;
}
zc.zc_defer_destroy = defer;
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0 &&
errno != ENOENT) {
return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
zhp->zfs_name));
}
remove_mountpoint(zhp);
return (0);
}
struct destroydata {
nvlist_t *nvl;
const char *snapname;
};
static int
zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
{
struct destroydata *dd = arg;
char name[ZFS_MAX_DATASET_NAME_LEN];
int rv = 0;
(void) snprintf(name, sizeof (name),
"%s@%s", zhp->zfs_name, dd->snapname);
if (lzc_exists(name))
verify(nvlist_add_boolean(dd->nvl, name) == 0);
rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd);
zfs_close(zhp);
return (rv);
}
/*
* Destroys all snapshots with the given name in zhp & descendants.
*/
int
zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
{
int ret;
struct destroydata dd = { 0 };
dd.snapname = snapname;
verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0);
(void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd);
if (nvlist_empty(dd.nvl)) {
ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
zhp->zfs_name, snapname);
} else {
ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer);
}
nvlist_free(dd.nvl);
return (ret);
}
/*
* Destroys all the snapshots named in the nvlist.
*/
int
zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer)
{
int ret;
nvlist_t *errlist = NULL;
ret = lzc_destroy_snaps(snaps, defer, &errlist);
if (ret == 0) {
nvlist_free(errlist);
return (0);
}
if (nvlist_empty(errlist)) {
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot destroy snapshots"));
ret = zfs_standard_error(hdl, ret, errbuf);
}
for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL);
pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
nvpair_name(pair));
switch (fnvpair_value_int32(pair)) {
case EEXIST:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "snapshot is cloned"));
ret = zfs_error(hdl, EZFS_EXISTS, errbuf);
break;
default:
ret = zfs_standard_error(hdl, errno, errbuf);
break;
}
}
nvlist_free(errlist);
return (ret);
}
/*
* Clones the given dataset. The target must be of the same type as the source.
*/
int
zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
{
char parent[ZFS_MAX_DATASET_NAME_LEN];
int ret;
char errbuf[1024];
libzfs_handle_t *hdl = zhp->zfs_hdl;
uint64_t zoned;
assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot create '%s'"), target);
/* validate the target/clone name */
if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
/* validate parents exist */
if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0)
return (-1);
(void) parent_name(target, parent, sizeof (parent));
/* do the clone */
if (props) {
zfs_type_t type;
if (ZFS_IS_VOLUME(zhp)) {
type = ZFS_TYPE_VOLUME;
} else {
type = ZFS_TYPE_FILESYSTEM;
}
if ((props = zfs_valid_proplist(hdl, type, props, zoned,
zhp, zhp->zpool_hdl, errbuf)) == NULL)
return (-1);
}
ret = lzc_clone(target, zhp->zfs_name, props);
nvlist_free(props);
if (ret != 0) {
switch (errno) {
case ENOENT:
/*
* The parent doesn't exist. We should have caught this
* above, but there may a race condition that has since
* destroyed the parent.
*
* At this point, we don't know whether it's the source
* that doesn't exist anymore, or whether the target
* dataset doesn't exist.
*/
zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
"no such parent '%s'"), parent);
return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
case EXDEV:
zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
"source and target pools differ"));
return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET,
errbuf));
default:
return (zfs_standard_error(zhp->zfs_hdl, errno,
errbuf));
}
}
return (ret);
}
/*
* Promotes the given clone fs to be the clone parent.
*/
int
zfs_promote(zfs_handle_t *zhp)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
char snapname[ZFS_MAX_DATASET_NAME_LEN];
int ret;
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot promote '%s'"), zhp->zfs_name);
if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"snapshots can not be promoted"));
return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
}
if (zhp->zfs_dmustats.dds_origin[0] == '\0') {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"not a cloned filesystem"));
return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
}
ret = lzc_promote(zhp->zfs_name, snapname, sizeof (snapname));
if (ret != 0) {
switch (ret) {
case EEXIST:
/* There is a conflicting snapshot name. */
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"conflicting snapshot '%s' from parent '%s'"),
snapname, zhp->zfs_dmustats.dds_origin);
return (zfs_error(hdl, EZFS_EXISTS, errbuf));
default:
return (zfs_standard_error(hdl, ret, errbuf));
}
}
return (ret);
}
typedef struct snapdata {
nvlist_t *sd_nvl;
const char *sd_snapname;
} snapdata_t;
static int
zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
{
snapdata_t *sd = arg;
char name[ZFS_MAX_DATASET_NAME_LEN];
int rv = 0;
if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) {
(void) snprintf(name, sizeof (name),
"%s@%s", zfs_get_name(zhp), sd->sd_snapname);
fnvlist_add_boolean(sd->sd_nvl, name);
rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
}
zfs_close(zhp);
return (rv);
}
+int
+zfs_remap_indirects(libzfs_handle_t *hdl, const char *fs)
+{
+ int err;
+ char errbuf[1024];
+
+ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+ "cannot remap filesystem '%s' "), fs);
+
+ err = lzc_remap(fs);
+
+ if (err != 0) {
+ (void) zfs_standard_error(hdl, err, errbuf);
+ }
+
+ return (err);
+}
+
/*
* Creates snapshots. The keys in the snaps nvlist are the snapshots to be
* created.
*/
int
zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props)
{
int ret;
char errbuf[1024];
nvpair_t *elem;
nvlist_t *errors;
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot create snapshots "));
elem = NULL;
while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) {
const char *snapname = nvpair_name(elem);
/* validate the target name */
if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT,
B_TRUE)) {
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN,
"cannot create snapshot '%s'"), snapname);
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
}
}
/*
* get pool handle for prop validation. assumes all snaps are in the
* same pool, as does lzc_snapshot (below).
*/
char pool[ZFS_MAX_DATASET_NAME_LEN];
elem = nvlist_next_nvpair(snaps, NULL);
(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
pool[strcspn(pool, "/@")] = '\0';
zpool_handle_t *zpool_hdl = zpool_open(hdl, pool);
if (props != NULL &&
(props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT,
props, B_FALSE, NULL, zpool_hdl, errbuf)) == NULL) {
zpool_close(zpool_hdl);
return (-1);
}
zpool_close(zpool_hdl);
ret = lzc_snapshot(snaps, props, &errors);
if (ret != 0) {
boolean_t printed = B_FALSE;
for (elem = nvlist_next_nvpair(errors, NULL);
elem != NULL;
elem = nvlist_next_nvpair(errors, elem)) {
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN,
"cannot create snapshot '%s'"), nvpair_name(elem));
(void) zfs_standard_error(hdl,
fnvpair_value_int32(elem), errbuf);
printed = B_TRUE;
}
if (!printed) {
switch (ret) {
case EXDEV:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"multiple snapshots of same "
"fs not allowed"));
(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
break;
default:
(void) zfs_standard_error(hdl, ret, errbuf);
}
}
}
nvlist_free(props);
nvlist_free(errors);
return (ret);
}
int
zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
nvlist_t *props)
{
int ret;
snapdata_t sd = { 0 };
char fsname[ZFS_MAX_DATASET_NAME_LEN];
char *cp;
zfs_handle_t *zhp;
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot snapshot %s"), path);
if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
(void) strlcpy(fsname, path, sizeof (fsname));
cp = strchr(fsname, '@');
*cp = '\0';
sd.sd_snapname = cp + 1;
if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_VOLUME)) == NULL) {
return (-1);
}
verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0);
if (recursive) {
(void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd);
} else {
fnvlist_add_boolean(sd.sd_nvl, path);
}
ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props);
nvlist_free(sd.sd_nvl);
zfs_close(zhp);
return (ret);
}
/*
* Destroy any more recent snapshots. We invoke this callback on any dependents
* of the snapshot first. If the 'cb_dependent' member is non-zero, then this
* is a dependent and we should just destroy it without checking the transaction
* group.
*/
typedef struct rollback_data {
const char *cb_target; /* the snapshot */
uint64_t cb_create; /* creation time reference */
boolean_t cb_error;
boolean_t cb_force;
} rollback_data_t;
static int
rollback_destroy_dependent(zfs_handle_t *zhp, void *data)
{
rollback_data_t *cbp = data;
prop_changelist_t *clp;
/* We must destroy this clone; first unmount it */
clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
cbp->cb_force ? MS_FORCE: 0);
if (clp == NULL || changelist_prefix(clp) != 0) {
cbp->cb_error = B_TRUE;
zfs_close(zhp);
return (0);
}
if (zfs_destroy(zhp, B_FALSE) != 0)
cbp->cb_error = B_TRUE;
else
changelist_remove(clp, zhp->zfs_name);
(void) changelist_postfix(clp);
changelist_free(clp);
zfs_close(zhp);
return (0);
}
static int
rollback_destroy(zfs_handle_t *zhp, void *data)
{
rollback_data_t *cbp = data;
if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE,
rollback_destroy_dependent, cbp);
cbp->cb_error |= zfs_destroy(zhp, B_FALSE);
}
zfs_close(zhp);
return (0);
}
/*
* Given a dataset, rollback to a specific snapshot, discarding any
* data changes since then and making it the active dataset.
*
* Any snapshots and bookmarks more recent than the target are
* destroyed, along with their dependents (i.e. clones).
*/
int
zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
{
rollback_data_t cb = { 0 };
int err;
boolean_t restore_resv = 0;
uint64_t old_volsize = 0, new_volsize;
zfs_prop_t resv_prop;
assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM ||
zhp->zfs_type == ZFS_TYPE_VOLUME);
/*
* Destroy all recent snapshots and their dependents.
*/
cb.cb_force = force;
cb.cb_target = snap->zfs_name;
cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
(void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb);
(void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb);
if (cb.cb_error)
return (-1);
/*
* Now that we have verified that the snapshot is the latest,
* rollback to the given snapshot.
*/
if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
return (-1);
old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
restore_resv =
(old_volsize == zfs_prop_get_int(zhp, resv_prop));
}
/*
* Pass both the filesystem and the wanted snapshot names,
* we would get an error back if the snapshot is destroyed or
* a new snapshot is created before this request is processed.
*/
err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name);
if (err != 0) {
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot rollback '%s'"),
zhp->zfs_name);
switch (err) {
case EEXIST:
zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
"there is a snapshot or bookmark more recent "
"than '%s'"), snap->zfs_name);
(void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf);
break;
case ESRCH:
zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
"'%s' is not found among snapshots of '%s'"),
snap->zfs_name, zhp->zfs_name);
(void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf);
break;
case EINVAL:
(void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf);
break;
default:
(void) zfs_standard_error(zhp->zfs_hdl, err, errbuf);
}
return (err);
}
/*
* For volumes, if the pre-rollback volsize matched the pre-
* rollback reservation and the volsize has changed then set
* the reservation property to the post-rollback volsize.
* Make a new handle since the rollback closed the dataset.
*/
if ((zhp->zfs_type == ZFS_TYPE_VOLUME) &&
(zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) {
if (restore_resv) {
new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
if (old_volsize != new_volsize)
err = zfs_prop_set_int(zhp, resv_prop,
new_volsize);
}
zfs_close(zhp);
}
return (err);
}
/*
* Renames the given dataset.
*/
int
zfs_rename(zfs_handle_t *zhp, const char *source, const char *target,
renameflags_t flags)
{
int ret = 0;
zfs_cmd_t zc = { 0 };
char *delim;
prop_changelist_t *cl = NULL;
zfs_handle_t *zhrp = NULL;
char *parentname = NULL;
char parent[ZFS_MAX_DATASET_NAME_LEN];
char property[ZFS_MAXPROPLEN];
libzfs_handle_t *hdl = zhp->zfs_hdl;
char errbuf[1024];
/* if we have the same exact name, just return success */
if (strcmp(zhp->zfs_name, target) == 0)
return (0);
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot rename to '%s'"), target);
if (source != NULL) {
/*
* This is recursive snapshots rename, put snapshot name
* (that might not exist) into zfs_name.
*/
assert(flags.recurse);
(void) strlcat(zhp->zfs_name, "@", sizeof(zhp->zfs_name));
(void) strlcat(zhp->zfs_name, source, sizeof(zhp->zfs_name));
zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
}
/*
* Make sure the target name is valid
*/
if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
if ((strchr(target, '@') == NULL) ||
*target == '@') {
/*
* Snapshot target name is abbreviated,
* reconstruct full dataset name
*/
(void) strlcpy(parent, zhp->zfs_name,
sizeof (parent));
delim = strchr(parent, '@');
if (strchr(target, '@') == NULL)
*(++delim) = '\0';
else
*delim = '\0';
(void) strlcat(parent, target, sizeof (parent));
target = parent;
} else {
/*
* Make sure we're renaming within the same dataset.
*/
delim = strchr(target, '@');
if (strncmp(zhp->zfs_name, target, delim - target)
!= 0 || zhp->zfs_name[delim - target] != '@') {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"snapshots must be part of same "
"dataset"));
return (zfs_error(hdl, EZFS_CROSSTARGET,
errbuf));
}
}
if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
} else {
if (flags.recurse) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"recursive rename must be a snapshot"));
return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
}
if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
/* validate parents */
if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0)
return (-1);
/* make sure we're in the same pool */
verify((delim = strchr(target, '/')) != NULL);
if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
zhp->zfs_name[delim - target] != '/') {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"datasets must be within same pool"));
return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
}
/* new name cannot be a child of the current dataset name */
if (is_descendant(zhp->zfs_name, target)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"New dataset name cannot be a descendant of "
"current dataset name"));
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
}
}
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name);
if (getzoneid() == GLOBAL_ZONEID &&
zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"dataset is used in a non-global zone"));
return (zfs_error(hdl, EZFS_ZONED, errbuf));
}
/*
* Avoid unmounting file systems with mountpoint property set to
* 'legacy' or 'none' even if -u option is not given.
*/
if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
!flags.recurse && !flags.nounmount &&
zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property,
sizeof (property), NULL, NULL, 0, B_FALSE) == 0 &&
(strcmp(property, "legacy") == 0 ||
strcmp(property, "none") == 0)) {
flags.nounmount = B_TRUE;
}
if (flags.recurse) {
parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
if (parentname == NULL) {
ret = -1;
goto error;
}
delim = strchr(parentname, '@');
*delim = '\0';
zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET);
if (zhrp == NULL) {
ret = -1;
goto error;
}
} else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) {
if ((cl = changelist_gather(zhp, ZFS_PROP_NAME,
flags.nounmount ? CL_GATHER_DONT_UNMOUNT : 0,
flags.forceunmount ? MS_FORCE : 0)) == NULL) {
return (-1);
}
if (changelist_haszonedchild(cl)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"child dataset with inherited mountpoint is used "
"in a non-global zone"));
(void) zfs_error(hdl, EZFS_ZONED, errbuf);
ret = -1;
goto error;
}
if ((ret = changelist_prefix(cl)) != 0)
goto error;
}
if (ZFS_IS_VOLUME(zhp))
zc.zc_objset_type = DMU_OST_ZVOL;
else
zc.zc_objset_type = DMU_OST_ZFS;
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));
zc.zc_cookie = flags.recurse ? 1 : 0;
if (flags.nounmount)
zc.zc_cookie |= 2;
if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) {
/*
* if it was recursive, the one that actually failed will
* be in zc.zc_name
*/
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot rename '%s'"), zc.zc_name);
if (flags.recurse && errno == EEXIST) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"a child dataset already has a snapshot "
"with the new name"));
(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
} else {
(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
}
/*
* On failure, we still want to remount any filesystems that
* were previously mounted, so we don't alter the system state.
*/
if (cl != NULL)
(void) changelist_postfix(cl);
} else {
if (cl != NULL) {
changelist_rename(cl, zfs_get_name(zhp), target);
ret = changelist_postfix(cl);
}
}
error:
if (parentname != NULL) {
free(parentname);
}
if (zhrp != NULL) {
zfs_close(zhrp);
}
if (cl != NULL) {
changelist_free(cl);
}
return (ret);
}
nvlist_t *
zfs_get_user_props(zfs_handle_t *zhp)
{
return (zhp->zfs_user_props);
}
nvlist_t *
zfs_get_recvd_props(zfs_handle_t *zhp)
{
if (zhp->zfs_recvd_props == NULL)
if (get_recvd_props_ioctl(zhp) != 0)
return (NULL);
return (zhp->zfs_recvd_props);
}
/*
* This function is used by 'zfs list' to determine the exact set of columns to
* display, and their maximum widths. This does two main things:
*
* - If this is a list of all properties, then expand the list to include
* all native properties, and set a flag so that for each dataset we look
* for new unique user properties and add them to the list.
*
* - For non fixed-width properties, keep track of the maximum width seen
* so that we can size the column appropriately. If the user has
* requested received property values, we also need to compute the width
* of the RECEIVED column.
*/
int
zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received,
boolean_t literal)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
zprop_list_t *entry;
zprop_list_t **last, **start;
nvlist_t *userprops, *propval;
nvpair_t *elem;
char *strval;
char buf[ZFS_MAXPROPLEN];
if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0)
return (-1);
userprops = zfs_get_user_props(zhp);
entry = *plp;
if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) {
/*
* Go through and add any user properties as necessary. We
* start by incrementing our list pointer to the first
* non-native property.
*/
start = plp;
while (*start != NULL) {
if ((*start)->pl_prop == ZPROP_INVAL)
break;
start = &(*start)->pl_next;
}
elem = NULL;
while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) {
/*
* See if we've already found this property in our list.
*/
for (last = start; *last != NULL;
last = &(*last)->pl_next) {
if (strcmp((*last)->pl_user_prop,
nvpair_name(elem)) == 0)
break;
}
if (*last == NULL) {
if ((entry = zfs_alloc(hdl,
sizeof (zprop_list_t))) == NULL ||
((entry->pl_user_prop = zfs_strdup(hdl,
nvpair_name(elem)))) == NULL) {
free(entry);
return (-1);
}
entry->pl_prop = ZPROP_INVAL;
entry->pl_width = strlen(nvpair_name(elem));
entry->pl_all = B_TRUE;
*last = entry;
}
}
}
/*
* Now go through and check the width of any non-fixed columns
*/
for (entry = *plp; entry != NULL; entry = entry->pl_next) {
if (entry->pl_fixed && !literal)
continue;
if (entry->pl_prop != ZPROP_INVAL) {
if (zfs_prop_get(zhp, entry->pl_prop,
buf, sizeof (buf), NULL, NULL, 0, literal) == 0) {
if (strlen(buf) > entry->pl_width)
entry->pl_width = strlen(buf);
}
if (received && zfs_prop_get_recvd(zhp,
zfs_prop_to_name(entry->pl_prop),
buf, sizeof (buf), literal) == 0)
if (strlen(buf) > entry->pl_recvd_width)
entry->pl_recvd_width = strlen(buf);
} else {
if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop,
&propval) == 0) {
verify(nvlist_lookup_string(propval,
ZPROP_VALUE, &strval) == 0);
if (strlen(strval) > entry->pl_width)
entry->pl_width = strlen(strval);
}
if (received && zfs_prop_get_recvd(zhp,
entry->pl_user_prop,
buf, sizeof (buf), literal) == 0)
if (strlen(buf) > entry->pl_recvd_width)
entry->pl_recvd_width = strlen(buf);
}
}
return (0);
}
int
zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path,
char *resource, void *export, void *sharetab,
int sharemax, zfs_share_op_t operation)
{
zfs_cmd_t zc = { 0 };
int error;
(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
if (resource)
(void) strlcpy(zc.zc_string, resource, sizeof (zc.zc_string));
zc.zc_share.z_sharedata = (uint64_t)(uintptr_t)sharetab;
zc.zc_share.z_exportdata = (uint64_t)(uintptr_t)export;
zc.zc_share.z_sharetype = operation;
zc.zc_share.z_sharemax = sharemax;
error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc);
return (error);
}
void
zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props)
{
nvpair_t *curr;
/*
* Keep a reference to the props-table against which we prune the
* properties.
*/
zhp->zfs_props_table = props;
curr = nvlist_next_nvpair(zhp->zfs_props, NULL);
while (curr) {
zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr));
nvpair_t *next = nvlist_next_nvpair(zhp->zfs_props, curr);
/*
* User properties will result in ZPROP_INVAL, and since we
* only know how to prune standard ZFS properties, we always
* leave these in the list. This can also happen if we
* encounter an unknown DSL property (when running older
* software, for example).
*/
if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE)
(void) nvlist_remove(zhp->zfs_props,
nvpair_name(curr), nvpair_type(curr));
curr = next;
}
}
#ifdef illumos
static int
zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path,
zfs_smb_acl_op_t cmd, char *resource1, char *resource2)
{
zfs_cmd_t zc = { 0 };
nvlist_t *nvlist = NULL;
int error;
(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
zc.zc_cookie = (uint64_t)cmd;
if (cmd == ZFS_SMB_ACL_RENAME) {
if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) {
(void) no_memory(hdl);
return (0);
}
}
switch (cmd) {
case ZFS_SMB_ACL_ADD:
case ZFS_SMB_ACL_REMOVE:
(void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string));
break;
case ZFS_SMB_ACL_RENAME:
if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC,
resource1) != 0) {
(void) no_memory(hdl);
return (-1);
}
if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET,
resource2) != 0) {
(void) no_memory(hdl);
return (-1);
}
if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) {
nvlist_free(nvlist);
return (-1);
}
break;
case ZFS_SMB_ACL_PURGE:
break;
default:
return (-1);
}
error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc);
nvlist_free(nvlist);
return (error);
}
int
zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset,
char *path, char *resource)
{
return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD,
resource, NULL));
}
int
zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset,
char *path, char *resource)
{
return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE,
resource, NULL));
}
int
zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path)
{
return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE,
NULL, NULL));
}
int
zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path,
char *oldname, char *newname)
{
return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME,
oldname, newname));
}
#endif /* illumos */
int
zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
zfs_userspace_cb_t func, void *arg)
{
zfs_cmd_t zc = { 0 };
zfs_useracct_t buf[100];
libzfs_handle_t *hdl = zhp->zfs_hdl;
int ret;
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
zc.zc_objset_type = type;
zc.zc_nvlist_dst = (uintptr_t)buf;
for (;;) {
zfs_useracct_t *zua = buf;
zc.zc_nvlist_dst_size = sizeof (buf);
if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) {
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN,
"cannot get used/quota for %s"), zc.zc_name);
return (zfs_standard_error_fmt(hdl, errno, errbuf));
}
if (zc.zc_nvlist_dst_size == 0)
break;
while (zc.zc_nvlist_dst_size > 0) {
if ((ret = func(arg, zua->zu_domain, zua->zu_rid,
zua->zu_space)) != 0)
return (ret);
zua++;
zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t);
}
}
return (0);
}
struct holdarg {
nvlist_t *nvl;
const char *snapname;
const char *tag;
boolean_t recursive;
int error;
};
static int
zfs_hold_one(zfs_handle_t *zhp, void *arg)
{
struct holdarg *ha = arg;
char name[ZFS_MAX_DATASET_NAME_LEN];
int rv = 0;
(void) snprintf(name, sizeof (name),
"%s@%s", zhp->zfs_name, ha->snapname);
if (lzc_exists(name))
fnvlist_add_string(ha->nvl, name, ha->tag);
if (ha->recursive)
rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha);
zfs_close(zhp);
return (rv);
}
int
zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
boolean_t recursive, int cleanup_fd)
{
int ret;
struct holdarg ha;
ha.nvl = fnvlist_alloc();
ha.snapname = snapname;
ha.tag = tag;
ha.recursive = recursive;
(void) zfs_hold_one(zfs_handle_dup(zhp), &ha);
if (nvlist_empty(ha.nvl)) {
char errbuf[1024];
fnvlist_free(ha.nvl);
ret = ENOENT;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN,
"cannot hold snapshot '%s@%s'"),
zhp->zfs_name, snapname);
(void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf);
return (ret);
}
ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl);
fnvlist_free(ha.nvl);
return (ret);
}
int
zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds)
{
int ret;
nvlist_t *errors;
libzfs_handle_t *hdl = zhp->zfs_hdl;
char errbuf[1024];
nvpair_t *elem;
errors = NULL;
ret = lzc_hold(holds, cleanup_fd, &errors);
if (ret == 0) {
/* There may be errors even in the success case. */
fnvlist_free(errors);
return (0);
}
if (nvlist_empty(errors)) {
/* no hold-specific errors */
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot hold"));
switch (ret) {
case ENOTSUP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded"));
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
break;
case EINVAL:
(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
break;
default:
(void) zfs_standard_error(hdl, ret, errbuf);
}
}
for (elem = nvlist_next_nvpair(errors, NULL);
elem != NULL;
elem = nvlist_next_nvpair(errors, elem)) {
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN,
"cannot hold snapshot '%s'"), nvpair_name(elem));
switch (fnvpair_value_int32(elem)) {
case E2BIG:
/*
* Temporary tags wind up having the ds object id
* prepended. So even if we passed the length check
* above, it's still possible for the tag to wind
* up being slightly too long.
*/
(void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf);
break;
case EINVAL:
(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
break;
case EEXIST:
(void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf);
break;
default:
(void) zfs_standard_error(hdl,
fnvpair_value_int32(elem), errbuf);
}
}
fnvlist_free(errors);
return (ret);
}
static int
zfs_release_one(zfs_handle_t *zhp, void *arg)
{
struct holdarg *ha = arg;
char name[ZFS_MAX_DATASET_NAME_LEN];
int rv = 0;
nvlist_t *existing_holds;
(void) snprintf(name, sizeof (name),
"%s@%s", zhp->zfs_name, ha->snapname);
if (lzc_get_holds(name, &existing_holds) != 0) {
ha->error = ENOENT;
} else if (!nvlist_exists(existing_holds, ha->tag)) {
ha->error = ESRCH;
} else {
nvlist_t *torelease = fnvlist_alloc();
fnvlist_add_boolean(torelease, ha->tag);
fnvlist_add_nvlist(ha->nvl, name, torelease);
fnvlist_free(torelease);
}
if (ha->recursive)
rv = zfs_iter_filesystems(zhp, zfs_release_one, ha);
zfs_close(zhp);
return (rv);
}
int
zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
boolean_t recursive)
{
int ret;
struct holdarg ha;
nvlist_t *errors = NULL;
nvpair_t *elem;
libzfs_handle_t *hdl = zhp->zfs_hdl;
char errbuf[1024];
ha.nvl = fnvlist_alloc();
ha.snapname = snapname;
ha.tag = tag;
ha.recursive = recursive;
ha.error = 0;
(void) zfs_release_one(zfs_handle_dup(zhp), &ha);
if (nvlist_empty(ha.nvl)) {
fnvlist_free(ha.nvl);
ret = ha.error;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN,
"cannot release hold from snapshot '%s@%s'"),
zhp->zfs_name, snapname);
if (ret == ESRCH) {
(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
} else {
(void) zfs_standard_error(hdl, ret, errbuf);
}
return (ret);
}
ret = lzc_release(ha.nvl, &errors);
fnvlist_free(ha.nvl);
if (ret == 0) {
/* There may be errors even in the success case. */
fnvlist_free(errors);
return (0);
}
if (nvlist_empty(errors)) {
/* no hold-specific errors */
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot release"));
switch (errno) {
case ENOTSUP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded"));
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
break;
default:
(void) zfs_standard_error_fmt(hdl, errno, errbuf);
}
}
for (elem = nvlist_next_nvpair(errors, NULL);
elem != NULL;
elem = nvlist_next_nvpair(errors, elem)) {
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN,
"cannot release hold from snapshot '%s'"),
nvpair_name(elem));
switch (fnvpair_value_int32(elem)) {
case ESRCH:
(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
break;
case EINVAL:
(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
break;
default:
(void) zfs_standard_error_fmt(hdl,
fnvpair_value_int32(elem), errbuf);
}
}
fnvlist_free(errors);
return (ret);
}
int
zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl;
int nvsz = 2048;
void *nvbuf;
int err = 0;
char errbuf[1024];
assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
tryagain:
nvbuf = malloc(nvsz);
if (nvbuf == NULL) {
err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
goto out;
}
zc.zc_nvlist_dst_size = nvsz;
zc.zc_nvlist_dst = (uintptr_t)nvbuf;
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
if (ioctl(hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) {
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"),
zc.zc_name);
switch (errno) {
case ENOMEM:
free(nvbuf);
nvsz = zc.zc_nvlist_dst_size;
goto tryagain;
case ENOTSUP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded"));
err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
break;
case EINVAL:
err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
break;
case ENOENT:
err = zfs_error(hdl, EZFS_NOENT, errbuf);
break;
default:
err = zfs_standard_error_fmt(hdl, errno, errbuf);
break;
}
} else {
/* success */
int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
if (rc) {
(void) snprintf(errbuf, sizeof (errbuf), dgettext(
TEXT_DOMAIN, "cannot get permissions on '%s'"),
zc.zc_name);
err = zfs_standard_error_fmt(hdl, rc, errbuf);
}
}
free(nvbuf);
out:
return (err);
}
int
zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl;
char *nvbuf;
char errbuf[1024];
size_t nvsz;
int err;
assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE);
assert(err == 0);
nvbuf = malloc(nvsz);
err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0);
assert(err == 0);
zc.zc_nvlist_src_size = nvsz;
zc.zc_nvlist_src = (uintptr_t)nvbuf;
zc.zc_perm_action = un;
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) {
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"),
zc.zc_name);
switch (errno) {
case ENOTSUP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded"));
err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
break;
case EINVAL:
err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
break;
case ENOENT:
err = zfs_error(hdl, EZFS_NOENT, errbuf);
break;
default:
err = zfs_standard_error_fmt(hdl, errno, errbuf);
break;
}
}
free(nvbuf);
return (err);
}
int
zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
{
int err;
char errbuf[1024];
err = lzc_get_holds(zhp->zfs_name, nvl);
if (err != 0) {
libzfs_handle_t *hdl = zhp->zfs_hdl;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
zhp->zfs_name);
switch (err) {
case ENOTSUP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded"));
err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
break;
case EINVAL:
err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
break;
case ENOENT:
err = zfs_error(hdl, EZFS_NOENT, errbuf);
break;
default:
err = zfs_standard_error_fmt(hdl, errno, errbuf);
break;
}
}
return (err);
}
/*
* Convert the zvol's volume size to an appropriate reservation.
* Note: If this routine is updated, it is necessary to update the ZFS test
* suite's shell version in reservation.kshlib.
*/
uint64_t
zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
{
uint64_t numdb;
uint64_t nblocks, volblocksize;
int ncopies;
char *strval;
if (nvlist_lookup_string(props,
zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0)
ncopies = atoi(strval);
else
ncopies = 1;
if (nvlist_lookup_uint64(props,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
&volblocksize) != 0)
volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
nblocks = volsize/volblocksize;
/* start with metadnode L0-L6 */
numdb = 7;
/* calculate number of indirects */
while (nblocks > 1) {
nblocks += DNODES_PER_LEVEL - 1;
nblocks /= DNODES_PER_LEVEL;
numdb += nblocks;
}
numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1);
volsize *= ncopies;
/*
* this is exactly DN_MAX_INDBLKSHIFT when metadata isn't
* compressed, but in practice they compress down to about
* 1100 bytes
*/
numdb *= 1ULL << DN_MAX_INDBLKSHIFT;
volsize += numdb;
return (volsize);
}
/*
* Attach/detach the given filesystem to/from the given jail.
*/
int
zfs_jail(zfs_handle_t *zhp, int jailid, int attach)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
zfs_cmd_t zc = { 0 };
char errbuf[1024];
unsigned long cmd;
int ret;
if (attach) {
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name);
} else {
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name);
}
switch (zhp->zfs_type) {
case ZFS_TYPE_VOLUME:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"volumes can not be jailed"));
return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
case ZFS_TYPE_SNAPSHOT:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"snapshots can not be jailed"));
return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
}
assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
zc.zc_objset_type = DMU_OST_ZFS;
zc.zc_jailid = jailid;
cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL;
if ((ret = ioctl(hdl->libzfs_fd, cmd, &zc)) != 0)
zfs_standard_error(hdl, errno, errbuf);
return (ret);
}
Index: stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
===================================================================
--- stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c (revision 332525)
@@ -1,4284 +1,4356 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright (c) 2017 Datto Inc.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>
#include <errno.h>
#include <devid.h>
#include <fcntl.h>
#include <libintl.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <libgen.h>
#include <sys/zfs_ioctl.h>
#include <dlfcn.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "libzfs_impl.h"
#include "zfs_comutil.h"
#include "zfeature_common.h"
static int read_efi_label(nvlist_t *, diskaddr_t *, boolean_t *);
static boolean_t zpool_vdev_is_interior(const char *name);
#define BACKUP_SLICE "s2"
typedef struct prop_flags {
int create:1; /* Validate property on creation */
int import:1; /* Validate property on import */
} prop_flags_t;
/*
* ====================================================================
* zpool property functions
* ====================================================================
*/
static int
zpool_get_all_props(zpool_handle_t *zhp)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
return (-1);
while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) {
if (errno == ENOMEM) {
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
zcmd_free_nvlists(&zc);
return (-1);
}
} else {
zcmd_free_nvlists(&zc);
return (-1);
}
}
if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) {
zcmd_free_nvlists(&zc);
return (-1);
}
zcmd_free_nvlists(&zc);
return (0);
}
static int
zpool_props_refresh(zpool_handle_t *zhp)
{
nvlist_t *old_props;
old_props = zhp->zpool_props;
if (zpool_get_all_props(zhp) != 0)
return (-1);
nvlist_free(old_props);
return (0);
}
static char *
zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop,
zprop_source_t *src)
{
nvlist_t *nv, *nvl;
uint64_t ival;
char *value;
zprop_source_t source;
nvl = zhp->zpool_props;
if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0);
source = ival;
verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);
} else {
source = ZPROP_SRC_DEFAULT;
if ((value = (char *)zpool_prop_default_string(prop)) == NULL)
value = "-";
}
if (src)
*src = source;
return (value);
}
uint64_t
zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src)
{
nvlist_t *nv, *nvl;
uint64_t value;
zprop_source_t source;
if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) {
/*
* zpool_get_all_props() has most likely failed because
* the pool is faulted, but if all we need is the top level
* vdev's guid then get it from the zhp config nvlist.
*/
if ((prop == ZPOOL_PROP_GUID) &&
(nvlist_lookup_nvlist(zhp->zpool_config,
ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) &&
(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value)
== 0)) {
return (value);
}
return (zpool_prop_default_numeric(prop));
}
nvl = zhp->zpool_props;
if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0);
source = value;
verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0);
} else {
source = ZPROP_SRC_DEFAULT;
value = zpool_prop_default_numeric(prop);
}
if (src)
*src = source;
return (value);
}
/*
* Map VDEV STATE to printed strings.
*/
const char *
zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
{
switch (state) {
case VDEV_STATE_CLOSED:
case VDEV_STATE_OFFLINE:
return (gettext("OFFLINE"));
case VDEV_STATE_REMOVED:
return (gettext("REMOVED"));
case VDEV_STATE_CANT_OPEN:
if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
return (gettext("FAULTED"));
else if (aux == VDEV_AUX_SPLIT_POOL)
return (gettext("SPLIT"));
else
return (gettext("UNAVAIL"));
case VDEV_STATE_FAULTED:
return (gettext("FAULTED"));
case VDEV_STATE_DEGRADED:
return (gettext("DEGRADED"));
case VDEV_STATE_HEALTHY:
return (gettext("ONLINE"));
default:
break;
}
return (gettext("UNKNOWN"));
}
/*
* Map POOL STATE to printed strings.
*/
const char *
zpool_pool_state_to_name(pool_state_t state)
{
switch (state) {
case POOL_STATE_ACTIVE:
return (gettext("ACTIVE"));
case POOL_STATE_EXPORTED:
return (gettext("EXPORTED"));
case POOL_STATE_DESTROYED:
return (gettext("DESTROYED"));
case POOL_STATE_SPARE:
return (gettext("SPARE"));
case POOL_STATE_L2CACHE:
return (gettext("L2CACHE"));
case POOL_STATE_UNINITIALIZED:
return (gettext("UNINITIALIZED"));
case POOL_STATE_UNAVAIL:
return (gettext("UNAVAIL"));
case POOL_STATE_POTENTIALLY_ACTIVE:
return (gettext("POTENTIALLY_ACTIVE"));
}
return (gettext("UNKNOWN"));
}
/*
* Get a zpool property value for 'prop' and return the value in
* a pre-allocated buffer.
*/
int
zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
zprop_source_t *srctype, boolean_t literal)
{
uint64_t intval;
const char *strval;
zprop_source_t src = ZPROP_SRC_NONE;
nvlist_t *nvroot;
vdev_stat_t *vs;
uint_t vsc;
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
switch (prop) {
case ZPOOL_PROP_NAME:
(void) strlcpy(buf, zpool_get_name(zhp), len);
break;
case ZPOOL_PROP_HEALTH:
(void) strlcpy(buf,
zpool_pool_state_to_name(POOL_STATE_UNAVAIL), len);
break;
case ZPOOL_PROP_GUID:
intval = zpool_get_prop_int(zhp, prop, &src);
(void) snprintf(buf, len, "%llu", intval);
break;
case ZPOOL_PROP_ALTROOT:
case ZPOOL_PROP_CACHEFILE:
case ZPOOL_PROP_COMMENT:
if (zhp->zpool_props != NULL ||
zpool_get_all_props(zhp) == 0) {
(void) strlcpy(buf,
zpool_get_prop_string(zhp, prop, &src),
len);
break;
}
/* FALLTHROUGH */
default:
(void) strlcpy(buf, "-", len);
break;
}
if (srctype != NULL)
*srctype = src;
return (0);
}
if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) &&
prop != ZPOOL_PROP_NAME)
return (-1);
switch (zpool_prop_get_type(prop)) {
case PROP_TYPE_STRING:
(void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src),
len);
break;
case PROP_TYPE_NUMBER:
intval = zpool_get_prop_int(zhp, prop, &src);
switch (prop) {
case ZPOOL_PROP_SIZE:
case ZPOOL_PROP_ALLOCATED:
case ZPOOL_PROP_FREE:
case ZPOOL_PROP_FREEING:
case ZPOOL_PROP_LEAKED:
if (literal) {
(void) snprintf(buf, len, "%llu",
(u_longlong_t)intval);
} else {
(void) zfs_nicenum(intval, buf, len);
}
break;
case ZPOOL_PROP_BOOTSIZE:
case ZPOOL_PROP_EXPANDSZ:
if (intval == 0) {
(void) strlcpy(buf, "-", len);
} else if (literal) {
(void) snprintf(buf, len, "%llu",
(u_longlong_t)intval);
} else {
(void) zfs_nicenum(intval, buf, len);
}
break;
case ZPOOL_PROP_CAPACITY:
if (literal) {
(void) snprintf(buf, len, "%llu",
(u_longlong_t)intval);
} else {
(void) snprintf(buf, len, "%llu%%",
(u_longlong_t)intval);
}
break;
case ZPOOL_PROP_FRAGMENTATION:
if (intval == UINT64_MAX) {
(void) strlcpy(buf, "-", len);
} else {
(void) snprintf(buf, len, "%llu%%",
(u_longlong_t)intval);
}
break;
case ZPOOL_PROP_DEDUPRATIO:
(void) snprintf(buf, len, "%llu.%02llux",
(u_longlong_t)(intval / 100),
(u_longlong_t)(intval % 100));
break;
case ZPOOL_PROP_HEALTH:
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
verify(nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
== 0);
(void) strlcpy(buf, zpool_state_to_name(intval,
vs->vs_aux), len);
break;
case ZPOOL_PROP_VERSION:
if (intval >= SPA_VERSION_FEATURES) {
(void) snprintf(buf, len, "-");
break;
}
/* FALLTHROUGH */
default:
(void) snprintf(buf, len, "%llu", intval);
}
break;
case PROP_TYPE_INDEX:
intval = zpool_get_prop_int(zhp, prop, &src);
if (zpool_prop_index_to_string(prop, intval, &strval)
!= 0)
return (-1);
(void) strlcpy(buf, strval, len);
break;
default:
abort();
}
if (srctype)
*srctype = src;
return (0);
}
/*
* Check if the bootfs name has the same pool name as it is set to.
* Assuming bootfs is a valid dataset name.
*/
static boolean_t
bootfs_name_valid(const char *pool, char *bootfs)
{
int len = strlen(pool);
if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT))
return (B_FALSE);
if (strncmp(pool, bootfs, len) == 0 &&
(bootfs[len] == '/' || bootfs[len] == '\0'))
return (B_TRUE);
return (B_FALSE);
}
boolean_t
zpool_is_bootable(zpool_handle_t *zhp)
{
char bootfs[ZFS_MAX_DATASET_NAME_LEN];
return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs,
sizeof (bootfs), NULL, B_FALSE) == 0 && strncmp(bootfs, "-",
sizeof (bootfs)) != 0);
}
/*
* Given an nvlist of zpool properties to be set, validate that they are
* correct, and parse any numeric properties (index, boolean, etc) if they are
* specified as strings.
*/
static nvlist_t *
zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf)
{
nvpair_t *elem;
nvlist_t *retprops;
zpool_prop_t prop;
char *strval;
uint64_t intval;
char *slash, *check;
struct stat64 statbuf;
zpool_handle_t *zhp;
if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) {
(void) no_memory(hdl);
return (NULL);
}
elem = NULL;
while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
const char *propname = nvpair_name(elem);
prop = zpool_name_to_prop(propname);
if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) {
int err;
char *fname = strchr(propname, '@') + 1;
err = zfeature_lookup_name(fname, NULL);
if (err != 0) {
ASSERT3U(err, ==, ENOENT);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid feature '%s'"), fname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (nvpair_type(elem) != DATA_TYPE_STRING) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be a string"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
(void) nvpair_value_string(elem, &strval);
if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' can only be set to "
"'enabled'"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (nvlist_add_uint64(retprops, propname, 0) != 0) {
(void) no_memory(hdl);
goto error;
}
continue;
}
/*
* Make sure this property is valid and applies to this type.
*/
if (prop == ZPOOL_PROP_INVAL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid property '%s'"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (zpool_prop_readonly(prop)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
"is readonly"), propname);
(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
goto error;
}
if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops,
&strval, &intval, errbuf) != 0)
goto error;
/*
* Perform additional checking for specific properties.
*/
switch (prop) {
case ZPOOL_PROP_VERSION:
if (intval < version ||
!SPA_VERSION_IS_SUPPORTED(intval)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' number %d is invalid."),
propname, intval);
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
goto error;
}
break;
case ZPOOL_PROP_BOOTSIZE:
if (!flags.create) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' can only be set during pool "
"creation"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
break;
case ZPOOL_PROP_BOOTFS:
if (flags.create || flags.import) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' cannot be set at creation "
"or import time"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (version < SPA_VERSION_BOOTFS) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded to support "
"'%s' property"), propname);
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
goto error;
}
/*
* bootfs property value has to be a dataset name and
* the dataset has to be in the same pool as it sets to.
*/
if (strval[0] != '\0' && !bootfs_name_valid(poolname,
strval)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
"is an invalid name"), strval);
(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
goto error;
}
if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"could not open pool '%s'"), poolname);
(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
goto error;
}
zpool_close(zhp);
break;
case ZPOOL_PROP_ALTROOT:
if (!flags.create && !flags.import) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' can only be set during pool "
"creation or import"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (strval[0] != '/') {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"bad alternate root '%s'"), strval);
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
goto error;
}
break;
case ZPOOL_PROP_CACHEFILE:
if (strval[0] == '\0')
break;
if (strcmp(strval, "none") == 0)
break;
if (strval[0] != '/') {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' must be empty, an "
"absolute path, or 'none'"), propname);
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
goto error;
}
slash = strrchr(strval, '/');
if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
strcmp(slash, "/..") == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' is not a valid file"), strval);
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
goto error;
}
*slash = '\0';
if (strval[0] != '\0' &&
(stat64(strval, &statbuf) != 0 ||
!S_ISDIR(statbuf.st_mode))) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' is not a valid directory"),
strval);
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
goto error;
}
*slash = '/';
break;
case ZPOOL_PROP_COMMENT:
for (check = strval; *check != '\0'; check++) {
if (!isprint(*check)) {
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN,
"comment may only have printable "
"characters"));
(void) zfs_error(hdl, EZFS_BADPROP,
errbuf);
goto error;
}
}
if (strlen(strval) > ZPROP_MAX_COMMENT) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"comment must not exceed %d characters"),
ZPROP_MAX_COMMENT);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
break;
case ZPOOL_PROP_READONLY:
if (!flags.import) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' can only be set at "
"import time"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
break;
default:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s'(%d) not defined"), propname, prop);
break;
}
}
return (retprops);
error:
nvlist_free(retprops);
return (NULL);
}
/*
* Set zpool property : propname=propval.
*/
int
zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
{
zfs_cmd_t zc = { 0 };
int ret = -1;
char errbuf[1024];
nvlist_t *nvl = NULL;
nvlist_t *realprops;
uint64_t version;
prop_flags_t flags = { 0 };
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
zhp->zpool_name);
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
return (no_memory(zhp->zpool_hdl));
if (nvlist_add_string(nvl, propname, propval) != 0) {
nvlist_free(nvl);
return (no_memory(zhp->zpool_hdl));
}
version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
if ((realprops = zpool_valid_proplist(zhp->zpool_hdl,
zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) {
nvlist_free(nvl);
return (-1);
}
nvlist_free(nvl);
nvl = realprops;
/*
* Execute the corresponding ioctl() to set this property.
*/
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) {
nvlist_free(nvl);
return (-1);
}
ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc);
zcmd_free_nvlists(&zc);
nvlist_free(nvl);
if (ret)
(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
else
(void) zpool_props_refresh(zhp);
return (ret);
}
int
zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
{
libzfs_handle_t *hdl = zhp->zpool_hdl;
zprop_list_t *entry;
char buf[ZFS_MAXPROPLEN];
nvlist_t *features = NULL;
zprop_list_t **last;
boolean_t firstexpand = (NULL == *plp);
if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0)
return (-1);
last = plp;
while (*last != NULL)
last = &(*last)->pl_next;
if ((*plp)->pl_all)
features = zpool_get_features(zhp);
if ((*plp)->pl_all && firstexpand) {
for (int i = 0; i < SPA_FEATURES; i++) {
zprop_list_t *entry = zfs_alloc(hdl,
sizeof (zprop_list_t));
entry->pl_prop = ZPROP_INVAL;
entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s",
spa_feature_table[i].fi_uname);
entry->pl_width = strlen(entry->pl_user_prop);
entry->pl_all = B_TRUE;
*last = entry;
last = &entry->pl_next;
}
}
/* add any unsupported features */
for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL);
nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) {
char *propname;
boolean_t found;
zprop_list_t *entry;
if (zfeature_is_supported(nvpair_name(nvp)))
continue;
propname = zfs_asprintf(hdl, "unsupported@%s",
nvpair_name(nvp));
/*
* Before adding the property to the list make sure that no
* other pool already added the same property.
*/
found = B_FALSE;
entry = *plp;
while (entry != NULL) {
if (entry->pl_user_prop != NULL &&
strcmp(propname, entry->pl_user_prop) == 0) {
found = B_TRUE;
break;
}
entry = entry->pl_next;
}
if (found) {
free(propname);
continue;
}
entry = zfs_alloc(hdl, sizeof (zprop_list_t));
entry->pl_prop = ZPROP_INVAL;
entry->pl_user_prop = propname;
entry->pl_width = strlen(entry->pl_user_prop);
entry->pl_all = B_TRUE;
*last = entry;
last = &entry->pl_next;
}
for (entry = *plp; entry != NULL; entry = entry->pl_next) {
if (entry->pl_fixed)
continue;
if (entry->pl_prop != ZPROP_INVAL &&
zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf),
NULL, B_FALSE) == 0) {
if (strlen(buf) > entry->pl_width)
entry->pl_width = strlen(buf);
}
}
return (0);
}
/*
* Get the state for the given feature on the given ZFS pool.
*/
int
zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf,
size_t len)
{
uint64_t refcount;
boolean_t found = B_FALSE;
nvlist_t *features = zpool_get_features(zhp);
boolean_t supported;
const char *feature = strchr(propname, '@') + 1;
supported = zpool_prop_feature(propname);
ASSERT(supported || zpool_prop_unsupported(propname));
/*
* Convert from feature name to feature guid. This conversion is
* unecessary for unsupported@... properties because they already
* use guids.
*/
if (supported) {
int ret;
spa_feature_t fid;
ret = zfeature_lookup_name(feature, &fid);
if (ret != 0) {
(void) strlcpy(buf, "-", len);
return (ENOTSUP);
}
feature = spa_feature_table[fid].fi_guid;
}
if (nvlist_lookup_uint64(features, feature, &refcount) == 0)
found = B_TRUE;
if (supported) {
if (!found) {
(void) strlcpy(buf, ZFS_FEATURE_DISABLED, len);
} else {
if (refcount == 0)
(void) strlcpy(buf, ZFS_FEATURE_ENABLED, len);
else
(void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len);
}
} else {
if (found) {
if (refcount == 0) {
(void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE);
} else {
(void) strcpy(buf, ZFS_UNSUPPORTED_READONLY);
}
} else {
(void) strlcpy(buf, "-", len);
return (ENOTSUP);
}
}
return (0);
}
/*
* Don't start the slice at the default block of 34; many storage
* devices will use a stripe width of 128k, so start there instead.
*/
#define NEW_START_BLOCK 256
/*
* Validate the given pool name, optionally putting an extended error message in
* 'buf'.
*/
boolean_t
zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool)
{
namecheck_err_t why;
char what;
int ret;
ret = pool_namecheck(pool, &why, &what);
/*
* The rules for reserved pool names were extended at a later point.
* But we need to support users with existing pools that may now be
* invalid. So we only check for this expanded set of names during a
* create (or import), and only in userland.
*/
if (ret == 0 && !isopen &&
(strncmp(pool, "mirror", 6) == 0 ||
strncmp(pool, "raidz", 5) == 0 ||
strncmp(pool, "spare", 5) == 0 ||
strcmp(pool, "log") == 0)) {
if (hdl != NULL)
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "name is reserved"));
return (B_FALSE);
}
if (ret != 0) {
if (hdl != NULL) {
switch (why) {
case NAME_ERR_TOOLONG:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "name is too long"));
break;
case NAME_ERR_INVALCHAR:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "invalid character "
"'%c' in pool name"), what);
break;
case NAME_ERR_NOLETTER:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"name must begin with a letter"));
break;
case NAME_ERR_RESERVED:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"name is reserved"));
break;
case NAME_ERR_DISKLIKE:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool name is reserved"));
break;
case NAME_ERR_LEADING_SLASH:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"leading slash in name"));
break;
case NAME_ERR_EMPTY_COMPONENT:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"empty component in name"));
break;
case NAME_ERR_TRAILING_SLASH:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"trailing slash in name"));
break;
case NAME_ERR_MULTIPLE_DELIMITERS:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"multiple '@' and/or '#' delimiters in "
"name"));
break;
default:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"(%d) not defined"), why);
break;
}
}
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Open a handle to the given pool, even if the pool is currently in the FAULTED
* state.
*/
zpool_handle_t *
zpool_open_canfail(libzfs_handle_t *hdl, const char *pool)
{
zpool_handle_t *zhp;
boolean_t missing;
/*
* Make sure the pool name is valid.
*/
if (!zpool_name_valid(hdl, B_TRUE, pool)) {
(void) zfs_error_fmt(hdl, EZFS_INVALIDNAME,
dgettext(TEXT_DOMAIN, "cannot open '%s'"),
pool);
return (NULL);
}
if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
return (NULL);
zhp->zpool_hdl = hdl;
(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
if (zpool_refresh_stats(zhp, &missing) != 0) {
zpool_close(zhp);
return (NULL);
}
if (missing) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool"));
(void) zfs_error_fmt(hdl, EZFS_NOENT,
dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool);
zpool_close(zhp);
return (NULL);
}
return (zhp);
}
/*
* Like the above, but silent on error. Used when iterating over pools (because
* the configuration cache may be out of date).
*/
int
zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret)
{
zpool_handle_t *zhp;
boolean_t missing;
if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
return (-1);
zhp->zpool_hdl = hdl;
(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
if (zpool_refresh_stats(zhp, &missing) != 0) {
zpool_close(zhp);
return (-1);
}
if (missing) {
zpool_close(zhp);
*ret = NULL;
return (0);
}
*ret = zhp;
return (0);
}
/*
* Similar to zpool_open_canfail(), but refuses to open pools in the faulted
* state.
*/
zpool_handle_t *
zpool_open(libzfs_handle_t *hdl, const char *pool)
{
zpool_handle_t *zhp;
if ((zhp = zpool_open_canfail(hdl, pool)) == NULL)
return (NULL);
if (zhp->zpool_state == POOL_STATE_UNAVAIL) {
(void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name);
zpool_close(zhp);
return (NULL);
}
return (zhp);
}
/*
* Close the handle. Simply frees the memory associated with the handle.
*/
void
zpool_close(zpool_handle_t *zhp)
{
nvlist_free(zhp->zpool_config);
nvlist_free(zhp->zpool_old_config);
nvlist_free(zhp->zpool_props);
free(zhp);
}
/*
* Return the name of the pool.
*/
const char *
zpool_get_name(zpool_handle_t *zhp)
{
return (zhp->zpool_name);
}
/*
* Return the state of the pool (ACTIVE or UNAVAILABLE)
*/
int
zpool_get_state(zpool_handle_t *zhp)
{
return (zhp->zpool_state);
}
/*
* Create the named pool, using the provided vdev list. It is assumed
* that the consumer has already validated the contents of the nvlist, so we
* don't have to worry about error semantics.
*/
int
zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
nvlist_t *props, nvlist_t *fsprops)
{
zfs_cmd_t zc = { 0 };
nvlist_t *zc_fsprops = NULL;
nvlist_t *zc_props = NULL;
char msg[1024];
int ret = -1;
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot create '%s'"), pool);
if (!zpool_name_valid(hdl, B_FALSE, pool))
return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
return (-1);
if (props) {
prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE };
if ((zc_props = zpool_valid_proplist(hdl, pool, props,
SPA_VERSION_1, flags, msg)) == NULL) {
goto create_failed;
}
}
if (fsprops) {
uint64_t zoned;
char *zonestr;
zoned = ((nvlist_lookup_string(fsprops,
zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) &&
strcmp(zonestr, "on") == 0);
if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM,
fsprops, zoned, NULL, NULL, msg)) == NULL) {
goto create_failed;
}
if (!zc_props &&
(nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
goto create_failed;
}
if (nvlist_add_nvlist(zc_props,
ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) {
goto create_failed;
}
}
if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
goto create_failed;
(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) {
zcmd_free_nvlists(&zc);
nvlist_free(zc_props);
nvlist_free(zc_fsprops);
switch (errno) {
case EBUSY:
/*
* This can happen if the user has specified the same
* device multiple times. We can't reliably detect this
* until we try to add it and see we already have a
* label.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more vdevs refer to the same device"));
return (zfs_error(hdl, EZFS_BADDEV, msg));
case ERANGE:
/*
* This happens if the record size is smaller or larger
* than the allowed size range, or not a power of 2.
*
* NOTE: although zfs_valid_proplist is called earlier,
* this case may have slipped through since the
* pool does not exist yet and it is therefore
* impossible to read properties e.g. max blocksize
* from the pool.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"record size invalid"));
return (zfs_error(hdl, EZFS_BADPROP, msg));
case EOVERFLOW:
/*
* This occurs when one of the devices is below
* SPA_MINDEVSIZE. Unfortunately, we can't detect which
* device was the problem device since there's no
* reliable way to determine device size from userland.
*/
{
char buf[64];
zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more devices is less than the "
"minimum size (%s)"), buf);
}
return (zfs_error(hdl, EZFS_BADDEV, msg));
case ENOSPC:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more devices is out of space"));
return (zfs_error(hdl, EZFS_BADDEV, msg));
case ENOTBLK:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cache device must be a disk or disk slice"));
return (zfs_error(hdl, EZFS_BADDEV, msg));
default:
return (zpool_standard_error(hdl, errno, msg));
}
}
create_failed:
zcmd_free_nvlists(&zc);
nvlist_free(zc_props);
nvlist_free(zc_fsprops);
return (ret);
}
/*
* Destroy the given pool. It is up to the caller to ensure that there are no
* datasets left in the pool.
*/
int
zpool_destroy(zpool_handle_t *zhp, const char *log_str)
{
zfs_cmd_t zc = { 0 };
zfs_handle_t *zfp = NULL;
libzfs_handle_t *hdl = zhp->zpool_hdl;
char msg[1024];
if (zhp->zpool_state == POOL_STATE_ACTIVE &&
(zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
return (-1);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_history = (uint64_t)(uintptr_t)log_str;
if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot destroy '%s'"), zhp->zpool_name);
if (errno == EROFS) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more devices is read only"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
} else {
(void) zpool_standard_error(hdl, errno, msg);
}
if (zfp)
zfs_close(zfp);
return (-1);
}
if (zfp) {
remove_mountpoint(zfp);
zfs_close(zfp);
}
return (0);
}
/*
* Add the given vdevs to the pool. The caller must have already performed the
* necessary verification to ensure that the vdev specification is well-formed.
*/
int
zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
{
zfs_cmd_t zc = { 0 };
int ret;
libzfs_handle_t *hdl = zhp->zpool_hdl;
char msg[1024];
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot add to '%s'"), zhp->zpool_name);
if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
SPA_VERSION_SPARES &&
nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
"upgraded to add hot spares"));
return (zfs_error(hdl, EZFS_BADVERSION, msg));
}
if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
SPA_VERSION_L2CACHE &&
nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache) == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
"upgraded to add cache devices"));
return (zfs_error(hdl, EZFS_BADVERSION, msg));
}
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
return (-1);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
switch (errno) {
case EBUSY:
/*
* This can happen if the user has specified the same
* device multiple times. We can't reliably detect this
* until we try to add it and see we already have a
* label.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more vdevs refer to the same device"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
+ case EINVAL:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "invalid config; a pool with removing/removed "
+ "vdevs does not support adding raidz vdevs"));
+ (void) zfs_error(hdl, EZFS_BADDEV, msg);
+ break;
+
case EOVERFLOW:
/*
* This occurrs when one of the devices is below
* SPA_MINDEVSIZE. Unfortunately, we can't detect which
* device was the problem device since there's no
* reliable way to determine device size from userland.
*/
{
char buf[64];
zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"device is less than the minimum "
"size (%s)"), buf);
}
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
case ENOTSUP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded to add these vdevs"));
(void) zfs_error(hdl, EZFS_BADVERSION, msg);
break;
case EDOM:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"root pool can not have multiple vdevs"
" or separate logs"));
(void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg);
break;
case ENOTBLK:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cache device must be a disk or disk slice"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
default:
(void) zpool_standard_error(hdl, errno, msg);
}
ret = -1;
} else {
ret = 0;
}
zcmd_free_nvlists(&zc);
return (ret);
}
/*
* Exports the pool from the system. The caller must ensure that there are no
* mounted datasets in the pool.
*/
static int
zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce,
const char *log_str)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot export '%s'"), zhp->zpool_name);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_cookie = force;
zc.zc_guid = hardforce;
zc.zc_history = (uint64_t)(uintptr_t)log_str;
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
switch (errno) {
case EXDEV:
zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
"use '-f' to override the following errors:\n"
"'%s' has an active shared spare which could be"
" used by other pools once '%s' is exported."),
zhp->zpool_name, zhp->zpool_name);
return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE,
msg));
default:
return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
msg));
}
}
return (0);
}
int
zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str)
{
return (zpool_export_common(zhp, force, B_FALSE, log_str));
}
int
zpool_export_force(zpool_handle_t *zhp, const char *log_str)
{
return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str));
}
static void
zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
nvlist_t *config)
{
nvlist_t *nv = NULL;
uint64_t rewindto;
int64_t loss = -1;
struct tm t;
char timestr[128];
if (!hdl->libzfs_printerr || config == NULL)
return;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) {
return;
}
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
return;
(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
if (localtime_r((time_t *)&rewindto, &t) != NULL &&
strftime(timestr, 128, 0, &t) != 0) {
if (dryrun) {
(void) printf(dgettext(TEXT_DOMAIN,
"Would be able to return %s "
"to its state as of %s.\n"),
name, timestr);
} else {
(void) printf(dgettext(TEXT_DOMAIN,
"Pool %s returned to its state as of %s.\n"),
name, timestr);
}
if (loss > 120) {
(void) printf(dgettext(TEXT_DOMAIN,
"%s approximately %lld "),
dryrun ? "Would discard" : "Discarded",
(loss + 30) / 60);
(void) printf(dgettext(TEXT_DOMAIN,
"minutes of transactions.\n"));
} else if (loss > 0) {
(void) printf(dgettext(TEXT_DOMAIN,
"%s approximately %lld "),
dryrun ? "Would discard" : "Discarded", loss);
(void) printf(dgettext(TEXT_DOMAIN,
"seconds of transactions.\n"));
}
}
}
void
zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
nvlist_t *config)
{
nvlist_t *nv = NULL;
int64_t loss = -1;
uint64_t edata = UINT64_MAX;
uint64_t rewindto;
struct tm t;
char timestr[128];
if (!hdl->libzfs_printerr)
return;
if (reason >= 0)
(void) printf(dgettext(TEXT_DOMAIN, "action: "));
else
(void) printf(dgettext(TEXT_DOMAIN, "\t"));
/* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 ||
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
goto no_info;
(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
&edata);
(void) printf(dgettext(TEXT_DOMAIN,
"Recovery is possible, but will result in some data loss.\n"));
if (localtime_r((time_t *)&rewindto, &t) != NULL &&
strftime(timestr, 128, 0, &t) != 0) {
(void) printf(dgettext(TEXT_DOMAIN,
"\tReturning the pool to its state as of %s\n"
"\tshould correct the problem. "),
timestr);
} else {
(void) printf(dgettext(TEXT_DOMAIN,
"\tReverting the pool to an earlier state "
"should correct the problem.\n\t"));
}
if (loss > 120) {
(void) printf(dgettext(TEXT_DOMAIN,
"Approximately %lld minutes of data\n"
"\tmust be discarded, irreversibly. "), (loss + 30) / 60);
} else if (loss > 0) {
(void) printf(dgettext(TEXT_DOMAIN,
"Approximately %lld seconds of data\n"
"\tmust be discarded, irreversibly. "), loss);
}
if (edata != 0 && edata != UINT64_MAX) {
if (edata == 1) {
(void) printf(dgettext(TEXT_DOMAIN,
"After rewind, at least\n"
"\tone persistent user-data error will remain. "));
} else {
(void) printf(dgettext(TEXT_DOMAIN,
"After rewind, several\n"
"\tpersistent user-data errors will remain. "));
}
}
(void) printf(dgettext(TEXT_DOMAIN,
"Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "),
reason >= 0 ? "clear" : "import", name);
(void) printf(dgettext(TEXT_DOMAIN,
"A scrub of the pool\n"
"\tis strongly recommended after recovery.\n"));
return;
no_info:
(void) printf(dgettext(TEXT_DOMAIN,
"Destroy and re-create the pool from\n\ta backup source.\n"));
}
/*
* zpool_import() is a contracted interface. Should be kept the same
* if possible.
*
* Applications should use zpool_import_props() to import a pool with
* new properties value to be set.
*/
int
zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
char *altroot)
{
nvlist_t *props = NULL;
int ret;
if (altroot != NULL) {
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
return (zfs_error_fmt(hdl, EZFS_NOMEM,
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
newname));
}
if (nvlist_add_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 ||
nvlist_add_string(props,
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) {
nvlist_free(props);
return (zfs_error_fmt(hdl, EZFS_NOMEM,
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
newname));
}
}
ret = zpool_import_props(hdl, config, newname, props,
ZFS_IMPORT_NORMAL);
nvlist_free(props);
return (ret);
}
static void
print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
int indent)
{
nvlist_t **child;
uint_t c, children;
char *vname;
uint64_t is_log = 0;
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG,
&is_log);
if (name != NULL)
(void) printf("\t%*s%s%s\n", indent, "", name,
is_log ? " [log]" : "");
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
return;
for (c = 0; c < children; c++) {
vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE);
print_vdev_tree(hdl, vname, child[c], indent + 2);
free(vname);
}
}
void
zpool_print_unsup_feat(nvlist_t *config)
{
nvlist_t *nvinfo, *unsup_feat;
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) ==
0);
verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT,
&unsup_feat) == 0);
for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL;
nvp = nvlist_next_nvpair(unsup_feat, nvp)) {
char *desc;
verify(nvpair_type(nvp) == DATA_TYPE_STRING);
verify(nvpair_value_string(nvp, &desc) == 0);
if (strlen(desc) > 0)
(void) printf("\t%s (%s)\n", nvpair_name(nvp), desc);
else
(void) printf("\t%s\n", nvpair_name(nvp));
}
}
/*
* Import the given pool using the known configuration and a list of
* properties to be set. The configuration should have come from
* zpool_find_import(). The 'newname' parameters control whether the pool
* is imported with a different name.
*/
int
zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
nvlist_t *props, int flags)
{
zfs_cmd_t zc = { 0 };
zpool_rewind_policy_t policy;
nvlist_t *nv = NULL;
nvlist_t *nvinfo = NULL;
nvlist_t *missing = NULL;
char *thename;
char *origname;
int ret;
int error = 0;
char errbuf[1024];
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
&origname) == 0);
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot import pool '%s'"), origname);
if (newname != NULL) {
if (!zpool_name_valid(hdl, B_FALSE, newname))
return (zfs_error_fmt(hdl, EZFS_INVALIDNAME,
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
newname));
thename = (char *)newname;
} else {
thename = origname;
}
if (props != NULL) {
uint64_t version;
prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&version) == 0);
if ((props = zpool_valid_proplist(hdl, origname,
props, version, flags, errbuf)) == NULL)
return (-1);
if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
nvlist_free(props);
return (-1);
}
nvlist_free(props);
}
(void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name));
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
&zc.zc_guid) == 0);
if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) {
zcmd_free_nvlists(&zc);
return (-1);
}
if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) {
zcmd_free_nvlists(&zc);
return (-1);
}
zc.zc_cookie = flags;
while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 &&
errno == ENOMEM) {
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
zcmd_free_nvlists(&zc);
return (-1);
}
}
if (ret != 0)
error = errno;
(void) zcmd_read_dst_nvlist(hdl, &zc, &nv);
zcmd_free_nvlists(&zc);
zpool_get_rewind_policy(config, &policy);
if (error) {
char desc[1024];
/*
* Dry-run failed, but we print out what success
* looks like if we found a best txg
*/
if (policy.zrp_request & ZPOOL_TRY_REWIND) {
zpool_rewind_exclaim(hdl, newname ? origname : thename,
B_TRUE, nv);
nvlist_free(nv);
return (-1);
}
if (newname == NULL)
(void) snprintf(desc, sizeof (desc),
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
thename);
else
(void) snprintf(desc, sizeof (desc),
dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
origname, thename);
switch (error) {
case ENOTSUP:
if (nv != NULL && nvlist_lookup_nvlist(nv,
ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) {
(void) printf(dgettext(TEXT_DOMAIN, "This "
"pool uses the following feature(s) not "
"supported by this system:\n"));
zpool_print_unsup_feat(nv);
if (nvlist_exists(nvinfo,
ZPOOL_CONFIG_CAN_RDONLY)) {
(void) printf(dgettext(TEXT_DOMAIN,
"All unsupported features are only "
"required for writing to the pool."
"\nThe pool can be imported using "
"'-o readonly=on'.\n"));
}
}
/*
* Unsupported version.
*/
(void) zfs_error(hdl, EZFS_BADVERSION, desc);
break;
case EINVAL:
(void) zfs_error(hdl, EZFS_INVALCONFIG, desc);
break;
case EROFS:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more devices is read only"));
(void) zfs_error(hdl, EZFS_BADDEV, desc);
break;
case ENXIO:
if (nv && nvlist_lookup_nvlist(nv,
ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
nvlist_lookup_nvlist(nvinfo,
ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
(void) printf(dgettext(TEXT_DOMAIN,
"The devices below are missing, use "
"'-m' to import the pool anyway:\n"));
print_vdev_tree(hdl, NULL, missing, 2);
(void) printf("\n");
}
(void) zpool_standard_error(hdl, error, desc);
break;
case EEXIST:
(void) zpool_standard_error(hdl, error, desc);
break;
case ENAMETOOLONG:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"new name of at least one dataset is longer than "
"the maximum allowable length"));
(void) zfs_error(hdl, EZFS_NAMETOOLONG, desc);
break;
default:
(void) zpool_standard_error(hdl, error, desc);
zpool_explain_recover(hdl,
newname ? origname : thename, -error, nv);
break;
}
nvlist_free(nv);
ret = -1;
} else {
zpool_handle_t *zhp;
/*
* This should never fail, but play it safe anyway.
*/
if (zpool_open_silent(hdl, thename, &zhp) != 0)
ret = -1;
else if (zhp != NULL)
zpool_close(zhp);
if (policy.zrp_request &
(ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
zpool_rewind_exclaim(hdl, newname ? origname : thename,
((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv);
}
nvlist_free(nv);
return (0);
}
return (ret);
}
/*
* Scan the pool.
*/
int
zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
int err;
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_cookie = func;
zc.zc_flags = cmd;
if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0)
return (0);
err = errno;
/* ECANCELED on a scrub means we resumed a paused scrub */
if (err == ECANCELED && func == POOL_SCAN_SCRUB &&
cmd == POOL_SCRUB_NORMAL)
return (0);
if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL)
return (0);
if (func == POOL_SCAN_SCRUB) {
if (cmd == POOL_SCRUB_PAUSE) {
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot pause scrubbing %s"), zc.zc_name);
} else {
assert(cmd == POOL_SCRUB_NORMAL);
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot scrub %s"), zc.zc_name);
}
} else if (func == POOL_SCAN_NONE) {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
zc.zc_name);
} else {
assert(!"unexpected result");
}
if (err == EBUSY) {
nvlist_t *nvroot;
pool_scan_stat_t *ps = NULL;
uint_t psc;
verify(nvlist_lookup_nvlist(zhp->zpool_config,
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
if (ps && ps->pss_func == POOL_SCAN_SCRUB) {
if (cmd == POOL_SCRUB_PAUSE)
return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg));
else
return (zfs_error(hdl, EZFS_SCRUBBING, msg));
} else {
return (zfs_error(hdl, EZFS_RESILVERING, msg));
}
} else if (err == ENOENT) {
return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
} else {
return (zpool_standard_error(hdl, err, msg));
}
}
#ifdef illumos
/*
* This provides a very minimal check whether a given string is likely a
* c#t#d# style string. Users of this are expected to do their own
* verification of the s# part.
*/
#define CTD_CHECK(str) (str && str[0] == 'c' && isdigit(str[1]))
/*
* More elaborate version for ones which may start with "/dev/dsk/"
* and the like.
*/
static int
ctd_check_path(char *str)
{
/*
* If it starts with a slash, check the last component.
*/
if (str && str[0] == '/') {
char *tmp = strrchr(str, '/');
/*
* If it ends in "/old", check the second-to-last
* component of the string instead.
*/
if (tmp != str && strcmp(tmp, "/old") == 0) {
for (tmp--; *tmp != '/'; tmp--)
;
}
str = tmp + 1;
}
return (CTD_CHECK(str));
}
#endif
/*
* Find a vdev that matches the search criteria specified. We use the
* the nvpair name to determine how we should look for the device.
* 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL
* spare; but FALSE if its an INUSE spare.
*/
static nvlist_t *
vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
boolean_t *l2cache, boolean_t *log)
{
uint_t c, children;
nvlist_t **child;
nvlist_t *ret;
uint64_t is_log;
char *srchkey;
nvpair_t *pair = nvlist_next_nvpair(search, NULL);
/* Nothing to look for */
if (search == NULL || pair == NULL)
return (NULL);
/* Obtain the key we will use to search */
srchkey = nvpair_name(pair);
switch (nvpair_type(pair)) {
case DATA_TYPE_UINT64:
if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
uint64_t srchval, theguid;
verify(nvpair_value_uint64(pair, &srchval) == 0);
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
&theguid) == 0);
if (theguid == srchval)
return (nv);
}
break;
case DATA_TYPE_STRING: {
char *srchval, *val;
verify(nvpair_value_string(pair, &srchval) == 0);
if (nvlist_lookup_string(nv, srchkey, &val) != 0)
break;
/*
* Search for the requested value. Special cases:
*
* - ZPOOL_CONFIG_PATH for whole disk entries. To support
* UEFI boot, these end in "s0" or "s0/old" or "s1" or
* "s1/old". The "s0" or "s1" part is hidden from the user,
* but included in the string, so this matches around it.
* - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
*
* Otherwise, all other searches are simple string compares.
*/
#ifdef illumos
if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 &&
ctd_check_path(val)) {
uint64_t wholedisk = 0;
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk);
if (wholedisk) {
int slen = strlen(srchval);
int vlen = strlen(val);
if (slen != vlen - 2)
break;
/*
* make_leaf_vdev() should only set
* wholedisk for ZPOOL_CONFIG_PATHs which
* will include "/dev/dsk/", giving plenty of
* room for the indices used next.
*/
ASSERT(vlen >= 6);
/*
* strings identical except trailing "s0"
*/
if ((strcmp(&val[vlen - 2], "s0") == 0 ||
strcmp(&val[vlen - 2], "s1") == 0) &&
strncmp(srchval, val, slen) == 0)
return (nv);
/*
* strings identical except trailing "s0/old"
*/
if ((strcmp(&val[vlen - 6], "s0/old") == 0 ||
strcmp(&val[vlen - 6], "s1/old") == 0) &&
strcmp(&srchval[slen - 4], "/old") == 0 &&
strncmp(srchval, val, slen - 4) == 0)
return (nv);
break;
}
} else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
#else
if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
#endif
char *type, *idx, *end, *p;
uint64_t id, vdev_id;
/*
* Determine our vdev type, keeping in mind
* that the srchval is composed of a type and
* vdev id pair (i.e. mirror-4).
*/
if ((type = strdup(srchval)) == NULL)
return (NULL);
if ((p = strrchr(type, '-')) == NULL) {
free(type);
break;
}
idx = p + 1;
*p = '\0';
/*
* If the types don't match then keep looking.
*/
if (strncmp(val, type, strlen(val)) != 0) {
free(type);
break;
}
verify(zpool_vdev_is_interior(type));
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
&id) == 0);
errno = 0;
vdev_id = strtoull(idx, &end, 10);
free(type);
if (errno != 0)
return (NULL);
/*
* Now verify that we have the correct vdev id.
*/
if (vdev_id == id)
return (nv);
}
/*
* Common case
*/
if (strcmp(srchval, val) == 0)
return (nv);
break;
}
default:
break;
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
return (NULL);
for (c = 0; c < children; c++) {
if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) {
/*
* The 'is_log' value is only set for the toplevel
* vdev, not the leaf vdevs. So we always lookup the
* log device from the root of the vdev tree (where
* 'log' is non-NULL).
*/
if (log != NULL &&
nvlist_lookup_uint64(child[c],
ZPOOL_CONFIG_IS_LOG, &is_log) == 0 &&
is_log) {
*log = B_TRUE;
}
return (ret);
}
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
&child, &children) == 0) {
for (c = 0; c < children; c++) {
if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) {
*avail_spare = B_TRUE;
return (ret);
}
}
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) == 0) {
for (c = 0; c < children; c++) {
if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) {
*l2cache = B_TRUE;
return (ret);
}
}
}
return (NULL);
}
/*
* Given a physical path (minus the "/devices" prefix), find the
* associated vdev.
*/
nvlist_t *
zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
{
nvlist_t *search, *nvroot, *ret;
verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
*avail_spare = B_FALSE;
*l2cache = B_FALSE;
if (log != NULL)
*log = B_FALSE;
ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
nvlist_free(search);
return (ret);
}
/*
* Determine if we have an "interior" top-level vdev (i.e mirror/raidz).
*/
static boolean_t
zpool_vdev_is_interior(const char *name)
{
if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 ||
strncmp(name,
VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 ||
strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
return (B_TRUE);
return (B_FALSE);
}
nvlist_t *
zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
boolean_t *l2cache, boolean_t *log)
{
char buf[MAXPATHLEN];
char *end;
nvlist_t *nvroot, *search, *ret;
uint64_t guid;
verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
guid = strtoull(path, &end, 10);
if (guid != 0 && *end == '\0') {
verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
} else if (zpool_vdev_is_interior(path)) {
verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
} else if (path[0] != '/') {
(void) snprintf(buf, sizeof (buf), "%s%s", _PATH_DEV, path);
verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
} else {
verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
}
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
*avail_spare = B_FALSE;
*l2cache = B_FALSE;
if (log != NULL)
*log = B_FALSE;
ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
nvlist_free(search);
return (ret);
}
static int
vdev_online(nvlist_t *nv)
{
uint64_t ival;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
return (0);
return (1);
}
/*
* Helper function for zpool_get_physpaths().
*/
static int
vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size,
size_t *bytes_written)
{
size_t bytes_left, pos, rsz;
char *tmppath;
const char *format;
if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH,
&tmppath) != 0)
return (EZFS_NODEVICE);
pos = *bytes_written;
bytes_left = physpath_size - pos;
format = (pos == 0) ? "%s" : " %s";
rsz = snprintf(physpath + pos, bytes_left, format, tmppath);
*bytes_written += rsz;
if (rsz >= bytes_left) {
/* if physpath was not copied properly, clear it */
if (bytes_left != 0) {
physpath[pos] = 0;
}
return (EZFS_NOSPC);
}
return (0);
}
static int
vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size,
size_t *rsz, boolean_t is_spare)
{
char *type;
int ret;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
return (EZFS_INVALCONFIG);
if (strcmp(type, VDEV_TYPE_DISK) == 0) {
/*
* An active spare device has ZPOOL_CONFIG_IS_SPARE set.
* For a spare vdev, we only want to boot from the active
* spare device.
*/
if (is_spare) {
uint64_t spare = 0;
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
&spare);
if (!spare)
return (EZFS_INVALCONFIG);
}
if (vdev_online(nv)) {
if ((ret = vdev_get_one_physpath(nv, physpath,
phypath_size, rsz)) != 0)
return (ret);
}
} else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
(is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
nvlist_t **child;
uint_t count;
int i, ret;
if (nvlist_lookup_nvlist_array(nv,
ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
return (EZFS_INVALCONFIG);
for (i = 0; i < count; i++) {
ret = vdev_get_physpaths(child[i], physpath,
phypath_size, rsz, is_spare);
if (ret == EZFS_NOSPC)
return (ret);
}
}
return (EZFS_POOL_INVALARG);
}
/*
* Get phys_path for a root pool config.
* Return 0 on success; non-zero on failure.
*/
static int
zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size)
{
size_t rsz;
nvlist_t *vdev_root;
nvlist_t **child;
uint_t count;
char *type;
rsz = 0;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&vdev_root) != 0)
return (EZFS_INVALCONFIG);
if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 ||
nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
&child, &count) != 0)
return (EZFS_INVALCONFIG);
/*
* root pool can only have a single top-level vdev.
*/
if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1)
return (EZFS_POOL_INVALARG);
(void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
B_FALSE);
/* No online devices */
if (rsz == 0)
return (EZFS_NODEVICE);
return (0);
}
/*
* Get phys_path for a root pool
* Return 0 on success; non-zero on failure.
*/
int
zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
{
return (zpool_get_config_physpath(zhp->zpool_config, physpath,
phypath_size));
}
/*
* If the device has being dynamically expanded then we need to relabel
* the disk to use the new unallocated space.
*/
static int
zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
{
#ifdef illumos
char path[MAXPATHLEN];
char errbuf[1024];
int fd, error;
int (*_efi_use_whole_disk)(int);
if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT,
"efi_use_whole_disk")) == NULL)
return (-1);
(void) snprintf(path, sizeof (path), "%s/%s", ZFS_RDISK_ROOT, name);
if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
"relabel '%s': unable to open device"), name);
return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
}
/*
* It's possible that we might encounter an error if the device
* does not have any unallocated space left. If so, we simply
* ignore that error and continue on.
*/
error = _efi_use_whole_disk(fd);
(void) close(fd);
if (error && error != VT_ENOSPC) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
"relabel '%s': unable to read disk capacity"), name);
return (zfs_error(hdl, EZFS_NOCAP, errbuf));
}
#endif /* illumos */
return (0);
}
/*
* Bring the specified vdev online. The 'flags' parameter is a set of the
* ZFS_ONLINE_* flags.
*/
int
zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
vdev_state_t *newstate)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
char *pathname;
nvlist_t *tgt;
boolean_t avail_spare, l2cache, islog;
libzfs_handle_t *hdl = zhp->zpool_hdl;
if (flags & ZFS_ONLINE_EXPAND) {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot expand %s"), path);
} else {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot online %s"), path);
}
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
&islog)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
if ((flags & ZFS_ONLINE_EXPAND ||
zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) &&
nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) {
uint64_t wholedisk = 0;
(void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk);
/*
* XXX - L2ARC 1.0 devices can't support expansion.
*/
if (l2cache) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot expand cache devices"));
return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg));
}
if (wholedisk) {
pathname += strlen(ZFS_DISK_ROOT) + 1;
(void) zpool_relabel_disk(hdl, pathname);
}
}
zc.zc_cookie = VDEV_STATE_ONLINE;
zc.zc_obj = flags;
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
if (errno == EINVAL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
"from this pool into a new one. Use '%s' "
"instead"), "zpool detach");
return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg));
}
return (zpool_standard_error(hdl, errno, msg));
}
*newstate = zc.zc_cookie;
return (0);
}
/*
* Take the specified vdev offline
*/
int
zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot offline %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
NULL)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
zc.zc_cookie = VDEV_STATE_OFFLINE;
zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
return (0);
switch (errno) {
case EBUSY:
/*
* There are no other replicas of this device.
*/
return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
case EEXIST:
/*
* The log device has unplayed logs
*/
return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));
default:
return (zpool_standard_error(hdl, errno, msg));
}
}
/*
* Mark the given vdev faulted.
*/
int
zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot fault %llu"), guid);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
zc.zc_cookie = VDEV_STATE_FAULTED;
zc.zc_obj = aux;
if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
return (0);
switch (errno) {
case EBUSY:
/*
* There are no other replicas of this device.
*/
return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
default:
return (zpool_standard_error(hdl, errno, msg));
}
}
/*
* Mark the given vdev degraded.
*/
int
zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot degrade %llu"), guid);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
zc.zc_cookie = VDEV_STATE_DEGRADED;
zc.zc_obj = aux;
if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
return (0);
return (zpool_standard_error(hdl, errno, msg));
}
/*
* Returns TRUE if the given nvlist is a vdev that was originally swapped in as
* a hot spare.
*/
static boolean_t
is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
{
nvlist_t **child;
uint_t c, children;
char *type;
if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child,
&children) == 0) {
verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
&type) == 0);
if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
children == 2 && child[which] == tgt)
return (B_TRUE);
for (c = 0; c < children; c++)
if (is_replacing_spare(child[c], tgt, which))
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Attach new_disk (fully described by nvroot) to old_disk.
* If 'replacing' is specified, the new disk will replace the old one.
*/
int
zpool_vdev_attach(zpool_handle_t *zhp,
const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
int ret;
nvlist_t *tgt;
boolean_t avail_spare, l2cache, islog;
uint64_t val;
char *newname;
nvlist_t **child;
uint_t children;
nvlist_t *config_root;
libzfs_handle_t *hdl = zhp->zpool_hdl;
boolean_t rootpool = zpool_is_bootable(zhp);
if (replacing)
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot replace %s with %s"), old_disk, new_disk);
else
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot attach %s to %s"), new_disk, old_disk);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache,
- &islog)) == 0)
+ &islog)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
if (l2cache)
return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
zc.zc_cookie = replacing;
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0 || children != 1) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"new device must be a single disk"));
return (zfs_error(hdl, EZFS_INVALCONFIG, msg));
}
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL)
return (-1);
/*
* If the target is a hot spare that has been swapped in, we can only
* replace it with another hot spare.
*/
if (replacing &&
nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 &&
(zpool_find_vdev(zhp, newname, &avail_spare, &l2cache,
NULL) == NULL || !avail_spare) &&
is_replacing_spare(config_root, tgt, 1)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"can only be replaced by another hot spare"));
free(newname);
return (zfs_error(hdl, EZFS_BADTARGET, msg));
}
free(newname);
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
return (-1);
ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);
zcmd_free_nvlists(&zc);
if (ret == 0) {
if (rootpool) {
/*
* XXX need a better way to prevent user from
* booting up a half-baked vdev.
*/
(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make "
"sure to wait until resilver is done "
"before rebooting.\n"));
(void) fprintf(stderr, "\n");
(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "If "
"you boot from pool '%s', you may need to update\n"
"boot code on newly attached disk '%s'.\n\n"
"Assuming you use GPT partitioning and 'da0' is "
"your new boot disk\n"
"you may use the following command:\n\n"
"\tgpart bootcode -b /boot/pmbr -p "
"/boot/gptzfsboot -i 1 da0\n\n"),
zhp->zpool_name, new_disk);
}
return (0);
}
switch (errno) {
case ENOTSUP:
/*
* Can't attach to or replace this type of vdev.
*/
if (replacing) {
uint64_t version = zpool_get_prop_int(zhp,
ZPOOL_PROP_VERSION, NULL);
if (islog)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a log with a spare"));
else if (version >= SPA_VERSION_MULTI_REPLACE)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"already in replacing/spare config; wait "
"for completion or use 'zpool detach'"));
else
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a replacing device"));
} else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"can only attach to mirrors and top-level "
"disks"));
}
(void) zfs_error(hdl, EZFS_BADTARGET, msg);
break;
case EINVAL:
/*
* The new device must be a single disk.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"new device must be a single disk"));
(void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
break;
case EBUSY:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"),
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
+ "or pool has removing/removed vdevs"),
new_disk);
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
case EOVERFLOW:
/*
* The new device is too small.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"device is too small"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
case EDOM:
/*
* The new device has a different alignment requirement.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"devices have different sector alignment"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
case ENAMETOOLONG:
/*
* The resulting top-level vdev spec won't fit in the label.
*/
(void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg);
break;
default:
(void) zpool_standard_error(hdl, errno, msg);
}
return (-1);
}
/*
* Detach the specified device.
*/
int
zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot detach %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- NULL)) == 0)
+ NULL)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
if (l2cache)
return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0)
return (0);
switch (errno) {
case ENOTSUP:
/*
* Can't detach from this type of vdev.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
"applicable to mirror and replacing vdevs"));
(void) zfs_error(hdl, EZFS_BADTARGET, msg);
break;
case EBUSY:
/*
* There are no other replicas of this device.
*/
(void) zfs_error(hdl, EZFS_NOREPLICAS, msg);
break;
default:
(void) zpool_standard_error(hdl, errno, msg);
}
return (-1);
}
/*
* Find a mirror vdev in the source nvlist.
*
* The mchild array contains a list of disks in one of the top-level mirrors
* of the source pool. The schild array contains a list of disks that the
* user specified on the command line. We loop over the mchild array to
* see if any entry in the schild array matches.
*
* If a disk in the mchild array is found in the schild array, we return
* the index of that entry. Otherwise we return -1.
*/
static int
find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
nvlist_t **schild, uint_t schildren)
{
uint_t mc;
for (mc = 0; mc < mchildren; mc++) {
uint_t sc;
char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
mchild[mc], B_FALSE);
for (sc = 0; sc < schildren; sc++) {
char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
schild[sc], B_FALSE);
boolean_t result = (strcmp(mpath, spath) == 0);
free(spath);
if (result) {
free(mpath);
return (mc);
}
}
free(mpath);
}
return (-1);
}
/*
* Split a mirror pool. If newroot points to null, then a new nvlist
* is generated and it is the responsibility of the caller to free it.
*/
int
zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
nvlist_t *props, splitflags_t flags)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL;
nvlist_t **varray = NULL, *zc_props = NULL;
uint_t c, children, newchildren, lastlog = 0, vcount, found = 0;
libzfs_handle_t *hdl = zhp->zpool_hdl;
uint64_t vers;
boolean_t freelist = B_FALSE, memory_err = B_TRUE;
int retval = 0;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name);
if (!zpool_name_valid(hdl, B_FALSE, newname))
return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
if ((config = zpool_get_config(zhp, NULL)) == NULL) {
(void) fprintf(stderr, gettext("Internal error: unable to "
"retrieve pool configuration\n"));
return (-1);
}
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree)
== 0);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
if (props) {
prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
props, vers, flags, msg)) == NULL)
return (-1);
}
if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
&children) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Source pool is missing vdev tree"));
nvlist_free(zc_props);
return (-1);
}
varray = zfs_alloc(hdl, children * sizeof (nvlist_t *));
vcount = 0;
if (*newroot == NULL ||
nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN,
&newchild, &newchildren) != 0)
newchildren = 0;
for (c = 0; c < children; c++) {
uint64_t is_log = B_FALSE, is_hole = B_FALSE;
char *type;
nvlist_t **mchild, *vdev;
uint_t mchildren;
int entry;
/*
* Unlike cache & spares, slogs are stored in the
* ZPOOL_CONFIG_CHILDREN array. We filter them out here.
*/
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&is_log);
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
&is_hole);
if (is_log || is_hole) {
/*
* Create a hole vdev and put it in the config.
*/
if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
goto out;
if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_HOLE) != 0)
goto out;
if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE,
1) != 0)
goto out;
if (lastlog == 0)
lastlog = vcount;
varray[vcount++] = vdev;
continue;
}
lastlog = 0;
verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type)
== 0);
if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Source pool must be composed only of mirrors\n"));
retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
goto out;
}
verify(nvlist_lookup_nvlist_array(child[c],
ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
/* find or add an entry for this top-level vdev */
if (newchildren > 0 &&
(entry = find_vdev_entry(zhp, mchild, mchildren,
newchild, newchildren)) >= 0) {
/* We found a disk that the user specified. */
vdev = mchild[entry];
++found;
} else {
/* User didn't specify a disk for this vdev. */
vdev = mchild[mchildren - 1];
}
if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
goto out;
}
/* did we find every disk the user specified? */
if (found != newchildren) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must "
"include at most one disk from each mirror"));
retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
goto out;
}
/* Prepare the nvlist for populating. */
if (*newroot == NULL) {
if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0)
goto out;
freelist = B_TRUE;
if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_ROOT) != 0)
goto out;
} else {
verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0);
}
/* Add all the children we found */
if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray,
lastlog == 0 ? vcount : lastlog) != 0)
goto out;
/*
* If we're just doing a dry run, exit now with success.
*/
if (flags.dryrun) {
memory_err = B_FALSE;
freelist = B_FALSE;
goto out;
}
/* now build up the config list & call the ioctl */
if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0)
goto out;
if (nvlist_add_nvlist(newconfig,
ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 ||
nvlist_add_string(newconfig,
ZPOOL_CONFIG_POOL_NAME, newname) != 0 ||
nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0)
goto out;
/*
* The new pool is automatically part of the namespace unless we
* explicitly export it.
*/
if (!flags.import)
zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string));
if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0)
goto out;
if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
goto out;
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) {
retval = zpool_standard_error(hdl, errno, msg);
goto out;
}
freelist = B_FALSE;
memory_err = B_FALSE;
out:
if (varray != NULL) {
int v;
for (v = 0; v < vcount; v++)
nvlist_free(varray[v]);
free(varray);
}
zcmd_free_nvlists(&zc);
nvlist_free(zc_props);
nvlist_free(newconfig);
if (freelist) {
nvlist_free(*newroot);
*newroot = NULL;
}
if (retval != 0)
return (retval);
if (memory_err)
return (no_memory(hdl));
return (0);
}
/*
- * Remove the given device. Currently, this is supported only for hot spares
- * and level 2 cache devices.
+ * Remove the given device.
*/
int
zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
boolean_t avail_spare, l2cache, islog;
libzfs_handle_t *hdl = zhp->zpool_hdl;
uint64_t version;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- &islog)) == 0)
+ &islog)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
- /*
- * XXX - this should just go away.
- */
- if (!avail_spare && !l2cache && !islog) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "only inactive hot spares, cache, top-level, "
- "or log devices can be removed"));
- return (zfs_error(hdl, EZFS_NODEVICE, msg));
- }
version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
if (islog && version < SPA_VERSION_HOLES) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgrade to support log removal"));
+ "pool must be upgraded to support log removal"));
return (zfs_error(hdl, EZFS_BADVERSION, msg));
}
- verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+ if (!islog && !avail_spare && !l2cache && zpool_is_bootable(zhp)) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "root pool can not have removed devices, "
+ "because GRUB does not understand them"));
+ return (zfs_error(hdl, EINVAL, msg));
+ }
+ zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
+
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
return (0);
+ switch (errno) {
+
+ case EINVAL:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "invalid config; all top-level vdevs must "
+ "have the same sector size and not be raidz."));
+ (void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
+ break;
+
+ case EBUSY:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "Pool busy; removal may already be in progress"));
+ (void) zfs_error(hdl, EZFS_BUSY, msg);
+ break;
+
+ default:
+ (void) zpool_standard_error(hdl, errno, msg);
+ }
+ return (-1);
+}
+
+int
+zpool_vdev_remove_cancel(zpool_handle_t *zhp)
+{
+ zfs_cmd_t zc = { 0 };
+ char msg[1024];
+ libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot cancel removal"));
+
+ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+ zc.zc_cookie = 1;
+
+ if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
+ return (0);
+
return (zpool_standard_error(hdl, errno, msg));
}
+int
+zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path,
+ uint64_t *sizep)
+{
+ char msg[1024];
+ nvlist_t *tgt;
+ boolean_t avail_spare, l2cache, islog;
+ libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"),
+ path);
+
+ if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
+ &islog)) == NULL)
+ return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+ if (avail_spare || l2cache || islog) {
+ *sizep = 0;
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "indirect size not available"));
+ return (zfs_error(hdl, EINVAL, msg));
+ }
+ return (0);
+}
+
/*
* Clear the errors for the pool, or the particular device if specified.
*/
int
zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
zpool_rewind_policy_t policy;
boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
nvlist_t *nvi = NULL;
int error;
if (path)
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
path);
else
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
zhp->zpool_name);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (path) {
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare,
- &l2cache, NULL)) == 0)
+ &l2cache, NULL)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
/*
* Don't allow error clearing for hot spares. Do allow
* error clearing for l2cache devices.
*/
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
&zc.zc_guid) == 0);
}
zpool_get_rewind_policy(rewindnvl, &policy);
zc.zc_cookie = policy.zrp_request;
if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
return (-1);
if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0)
return (-1);
while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
errno == ENOMEM) {
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
zcmd_free_nvlists(&zc);
return (-1);
}
}
if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
errno != EPERM && errno != EACCES)) {
if (policy.zrp_request &
(ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
zpool_rewind_exclaim(hdl, zc.zc_name,
((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
nvi);
nvlist_free(nvi);
}
zcmd_free_nvlists(&zc);
return (0);
}
zcmd_free_nvlists(&zc);
return (zpool_standard_error(hdl, errno, msg));
}
/*
* Similar to zpool_clear(), but takes a GUID (used by fmd).
*/
int
zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"),
guid);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
zc.zc_cookie = ZPOOL_NO_REWIND;
if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0)
return (0);
return (zpool_standard_error(hdl, errno, msg));
}
/*
* Change the GUID for a pool.
*/
int
zpool_reguid(zpool_handle_t *zhp)
{
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
zfs_cmd_t zc = { 0 };
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
return (0);
return (zpool_standard_error(hdl, errno, msg));
}
/*
* Reopen the pool.
*/
int
zpool_reopen(zpool_handle_t *zhp)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot reopen '%s'"),
zhp->zpool_name);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zfs_ioctl(hdl, ZFS_IOC_POOL_REOPEN, &zc) == 0)
return (0);
return (zpool_standard_error(hdl, errno, msg));
}
/*
* Convert from a devid string to a path.
*/
static char *
devid_to_path(char *devid_str)
{
ddi_devid_t devid;
char *minor;
char *path;
devid_nmlist_t *list = NULL;
int ret;
if (devid_str_decode(devid_str, &devid, &minor) != 0)
return (NULL);
ret = devid_deviceid_to_nmlist("/dev", devid, minor, &list);
devid_str_free(minor);
devid_free(devid);
if (ret != 0)
return (NULL);
/*
* In a case the strdup() fails, we will just return NULL below.
*/
path = strdup(list[0].devname);
devid_free_nmlist(list);
return (path);
}
/*
* Convert from a path to a devid string.
*/
static char *
path_to_devid(const char *path)
{
#ifdef have_devid
int fd;
ddi_devid_t devid;
char *minor, *ret;
if ((fd = open(path, O_RDONLY)) < 0)
return (NULL);
minor = NULL;
ret = NULL;
if (devid_get(fd, &devid) == 0) {
if (devid_get_minor_name(fd, &minor) == 0)
ret = devid_str_encode(devid, minor);
if (minor != NULL)
devid_str_free(minor);
devid_free(devid);
}
(void) close(fd);
return (ret);
#else
return (NULL);
#endif
}
/*
* Issue the necessary ioctl() to update the stored path value for the vdev. We
* ignore any failure here, since a common case is for an unprivileged user to
* type 'zpool status', and we'll display the correct information anyway.
*/
static void
set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
{
zfs_cmd_t zc = { 0 };
(void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
(void) strncpy(zc.zc_value, path, sizeof (zc.zc_value));
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
&zc.zc_guid) == 0);
(void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc);
}
/*
* Given a vdev, return the name to display in iostat. If the vdev has a path,
* we use that, stripping off any leading "/dev/dsk/"; if not, we use the type.
* We also check if this is a whole disk, in which case we strip off the
* trailing 's0' slice name.
*
* This routine is also responsible for identifying when disks have been
* reconfigured in a new location. The kernel will have opened the device by
* devid, but the path will still refer to the old location. To catch this, we
* first do a path -> devid translation (which is fast for the common case). If
* the devid matches, we're done. If not, we do a reverse devid -> path
* translation and issue the appropriate ioctl() to update the path of the vdev.
* If 'zhp' is NULL, then this is an exported pool, and we don't need to do any
* of these checks.
*/
char *
zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
boolean_t verbose)
{
char *path, *devid;
uint64_t value;
char buf[64];
vdev_stat_t *vs;
uint_t vsc;
int have_stats;
int have_path;
have_stats = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0;
have_path = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0;
/*
* If the device is not currently present, assume it will not
* come back at the same device path. Display the device by GUID.
*/
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 ||
have_path && have_stats && vs->vs_state <= VDEV_STATE_CANT_OPEN) {
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
&value) == 0);
(void) snprintf(buf, sizeof (buf), "%llu",
(u_longlong_t)value);
path = buf;
} else if (have_path) {
/*
* If the device is dead (faulted, offline, etc) then don't
* bother opening it. Otherwise we may be forcing the user to
* open a misbehaving device, which can have undesirable
* effects.
*/
if ((have_stats == 0 ||
vs->vs_state >= VDEV_STATE_DEGRADED) &&
zhp != NULL &&
nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) {
/*
* Determine if the current path is correct.
*/
char *newdevid = path_to_devid(path);
if (newdevid == NULL ||
strcmp(devid, newdevid) != 0) {
char *newpath;
if ((newpath = devid_to_path(devid)) != NULL) {
/*
* Update the path appropriately.
*/
set_path(zhp, nv, newpath);
if (nvlist_add_string(nv,
ZPOOL_CONFIG_PATH, newpath) == 0)
verify(nvlist_lookup_string(nv,
ZPOOL_CONFIG_PATH,
&path) == 0);
free(newpath);
}
}
if (newdevid)
devid_str_free(newdevid);
}
#ifdef illumos
if (strncmp(path, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0)
path += strlen(ZFS_DISK_ROOTD);
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&value) == 0 && value) {
int pathlen = strlen(path);
char *tmp = zfs_strdup(hdl, path);
/*
* If it starts with c#, and ends with "s0" or "s1",
* chop the slice off, or if it ends with "s0/old" or
* "s1/old", remove the slice from the middle.
*/
if (CTD_CHECK(tmp)) {
if (strcmp(&tmp[pathlen - 2], "s0") == 0 ||
strcmp(&tmp[pathlen - 2], "s1") == 0) {
tmp[pathlen - 2] = '\0';
} else if (pathlen > 6 &&
(strcmp(&tmp[pathlen - 6], "s0/old") == 0 ||
strcmp(&tmp[pathlen - 6], "s1/old") == 0)) {
(void) strcpy(&tmp[pathlen - 6],
"/old");
}
}
return (tmp);
}
#else /* !illumos */
if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
path += sizeof(_PATH_DEV) - 1;
#endif /* illumos */
} else {
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
/*
* If it's a raidz device, we need to stick in the parity level.
*/
if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&value) == 0);
(void) snprintf(buf, sizeof (buf), "%s%llu", path,
(u_longlong_t)value);
path = buf;
}
/*
* We identify each top-level vdev by using a <type-id>
* naming convention.
*/
if (verbose) {
uint64_t id;
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
&id) == 0);
(void) snprintf(buf, sizeof (buf), "%s-%llu", path,
(u_longlong_t)id);
path = buf;
}
}
return (zfs_strdup(hdl, path));
}
static int
zbookmark_mem_compare(const void *a, const void *b)
{
return (memcmp(a, b, sizeof (zbookmark_phys_t)));
}
/*
* Retrieve the persistent error log, uniquify the members, and return to the
* caller.
*/
int
zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
{
zfs_cmd_t zc = { 0 };
uint64_t count;
zbookmark_phys_t *zb = NULL;
int i;
/*
* Retrieve the raw error list from the kernel. If the number of errors
* has increased, allocate more space and continue until we get the
* entire list.
*/
verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT,
&count) == 0);
if (count == 0)
return (0);
if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
count * sizeof (zbookmark_phys_t))) == (uintptr_t)NULL)
return (-1);
zc.zc_nvlist_dst_size = count;
(void) strcpy(zc.zc_name, zhp->zpool_name);
for (;;) {
if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG,
&zc) != 0) {
free((void *)(uintptr_t)zc.zc_nvlist_dst);
if (errno == ENOMEM) {
void *dst;
count = zc.zc_nvlist_dst_size;
dst = zfs_alloc(zhp->zpool_hdl, count *
sizeof (zbookmark_phys_t));
if (dst == NULL)
return (-1);
zc.zc_nvlist_dst = (uintptr_t)dst;
} else {
return (-1);
}
} else {
break;
}
}
/*
* Sort the resulting bookmarks. This is a little confusing due to the
* implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last
* to first, and 'zc_nvlist_dst_size' indicates the number of boomarks
* _not_ copied as part of the process. So we point the start of our
* array appropriate and decrement the total number of elements.
*/
zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
zc.zc_nvlist_dst_size;
count -= zc.zc_nvlist_dst_size;
qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
/*
* Fill in the nverrlistp with nvlist's of dataset and object numbers.
*/
for (i = 0; i < count; i++) {
nvlist_t *nv;
/* ignoring zb_blkid and zb_level for now */
if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset &&
zb[i-1].zb_object == zb[i].zb_object)
continue;
if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0)
goto nomem;
if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET,
zb[i].zb_objset) != 0) {
nvlist_free(nv);
goto nomem;
}
if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT,
zb[i].zb_object) != 0) {
nvlist_free(nv);
goto nomem;
}
if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) {
nvlist_free(nv);
goto nomem;
}
nvlist_free(nv);
}
free((void *)(uintptr_t)zc.zc_nvlist_dst);
return (0);
nomem:
free((void *)(uintptr_t)zc.zc_nvlist_dst);
return (no_memory(zhp->zpool_hdl));
}
/*
* Upgrade a ZFS pool to the latest on-disk version.
*/
int
zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strcpy(zc.zc_name, zhp->zpool_name);
zc.zc_cookie = new_version;
if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0)
return (zpool_standard_error_fmt(hdl, errno,
dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"),
zhp->zpool_name));
return (0);
}
void
zfs_save_arguments(int argc, char **argv, char *string, int len)
{
(void) strlcpy(string, basename(argv[0]), len);
for (int i = 1; i < argc; i++) {
(void) strlcat(string, " ", len);
(void) strlcat(string, argv[i], len);
}
}
int
zpool_log_history(libzfs_handle_t *hdl, const char *message)
{
zfs_cmd_t zc = { 0 };
nvlist_t *args;
int err;
args = fnvlist_alloc();
fnvlist_add_string(args, "message", message);
err = zcmd_write_src_nvlist(hdl, &zc, args);
if (err == 0)
err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc);
nvlist_free(args);
zcmd_free_nvlists(&zc);
return (err);
}
/*
* Perform ioctl to get some command history of a pool.
*
* 'buf' is the buffer to fill up to 'len' bytes. 'off' is the
* logical offset of the history buffer to start reading from.
*
* Upon return, 'off' is the next logical offset to read from and
* 'len' is the actual amount of bytes read into 'buf'.
*/
static int
get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_history = (uint64_t)(uintptr_t)buf;
zc.zc_history_len = *len;
zc.zc_history_offset = *off;
if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) {
switch (errno) {
case EPERM:
return (zfs_error_fmt(hdl, EZFS_PERM,
dgettext(TEXT_DOMAIN,
"cannot show history for pool '%s'"),
zhp->zpool_name));
case ENOENT:
return (zfs_error_fmt(hdl, EZFS_NOHISTORY,
dgettext(TEXT_DOMAIN, "cannot get history for pool "
"'%s'"), zhp->zpool_name));
case ENOTSUP:
return (zfs_error_fmt(hdl, EZFS_BADVERSION,
dgettext(TEXT_DOMAIN, "cannot get history for pool "
"'%s', pool must be upgraded"), zhp->zpool_name));
default:
return (zpool_standard_error_fmt(hdl, errno,
dgettext(TEXT_DOMAIN,
"cannot get history for '%s'"), zhp->zpool_name));
}
}
*len = zc.zc_history_len;
*off = zc.zc_history_offset;
return (0);
}
/*
* Process the buffer of nvlists, unpacking and storing each nvlist record
* into 'records'. 'leftover' is set to the number of bytes that weren't
* processed as there wasn't a complete record.
*/
int
zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
nvlist_t ***records, uint_t *numrecords)
{
uint64_t reclen;
nvlist_t *nv;
int i;
while (bytes_read > sizeof (reclen)) {
/* get length of packed record (stored as little endian) */
for (i = 0, reclen = 0; i < sizeof (reclen); i++)
reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i);
if (bytes_read < sizeof (reclen) + reclen)
break;
/* unpack record */
if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
return (ENOMEM);
bytes_read -= sizeof (reclen) + reclen;
buf += sizeof (reclen) + reclen;
/* add record to nvlist array */
(*numrecords)++;
if (ISP2(*numrecords + 1)) {
*records = realloc(*records,
*numrecords * 2 * sizeof (nvlist_t *));
}
(*records)[*numrecords - 1] = nv;
}
*leftover = bytes_read;
return (0);
}
/* from spa_history.c: spa_history_create_obj() */
#define HIS_BUF_LEN_DEF (128 << 10)
#define HIS_BUF_LEN_MAX (1 << 30)
/*
* Retrieve the command history of a pool.
*/
int
zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp)
{
char *buf;
uint64_t buflen = HIS_BUF_LEN_DEF;
uint64_t off = 0;
nvlist_t **records = NULL;
uint_t numrecords = 0;
int err, i;
buf = malloc(buflen);
if (buf == NULL)
return (ENOMEM);
do {
uint64_t bytes_read = buflen;
uint64_t leftover;
if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0)
break;
/* if nothing else was read in, we're at EOF, just return */
if (bytes_read == 0)
break;
if ((err = zpool_history_unpack(buf, bytes_read,
&leftover, &records, &numrecords)) != 0)
break;
off -= leftover;
if (leftover == bytes_read) {
/*
* no progress made, because buffer is not big enough
* to hold this record; resize and retry.
*/
buflen *= 2;
free(buf);
buf = NULL;
if ((buflen >= HIS_BUF_LEN_MAX) ||
((buf = malloc(buflen)) == NULL)) {
err = ENOMEM;
break;
}
}
/* CONSTCOND */
} while (1);
free(buf);
if (!err) {
verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0);
verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD,
records, numrecords) == 0);
}
for (i = 0; i < numrecords; i++)
nvlist_free(records[i]);
free(records);
return (err);
}
void
zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
char *pathname, size_t len)
{
zfs_cmd_t zc = { 0 };
boolean_t mounted = B_FALSE;
char *mntpnt = NULL;
char dsname[ZFS_MAX_DATASET_NAME_LEN];
if (dsobj == 0) {
/* special case for the MOS */
(void) snprintf(pathname, len, "<metadata>:<0x%llx>", obj);
return;
}
/* get the dataset's name */
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_obj = dsobj;
if (ioctl(zhp->zpool_hdl->libzfs_fd,
ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) {
/* just write out a path of two object numbers */
(void) snprintf(pathname, len, "<0x%llx>:<0x%llx>",
dsobj, obj);
return;
}
(void) strlcpy(dsname, zc.zc_value, sizeof (dsname));
/* find out if the dataset is mounted */
mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt);
/* get the corrupted object's path */
(void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
zc.zc_obj = obj;
if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH,
&zc) == 0) {
if (mounted) {
(void) snprintf(pathname, len, "%s%s", mntpnt,
zc.zc_value);
} else {
(void) snprintf(pathname, len, "%s:%s",
dsname, zc.zc_value);
}
} else {
(void) snprintf(pathname, len, "%s:<0x%llx>", dsname, obj);
}
free(mntpnt);
}
#ifdef illumos
/*
* Read the EFI label from the config, if a label does not exist then
* pass back the error to the caller. If the caller has passed a non-NULL
* diskaddr argument then we set it to the starting address of the EFI
* partition. If the caller has passed a non-NULL boolean argument, then
* we set it to indicate if the disk does have efi system partition.
*/
static int
read_efi_label(nvlist_t *config, diskaddr_t *sb, boolean_t *system)
{
char *path;
int fd;
char diskname[MAXPATHLEN];
boolean_t boot = B_FALSE;
int err = -1;
int slice;
if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0)
return (err);
(void) snprintf(diskname, sizeof (diskname), "%s%s", ZFS_RDISK_ROOT,
strrchr(path, '/'));
if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) {
struct dk_gpt *vtoc;
if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) {
for (slice = 0; slice < vtoc->efi_nparts; slice++) {
if (vtoc->efi_parts[slice].p_tag == V_SYSTEM)
boot = B_TRUE;
if (vtoc->efi_parts[slice].p_tag == V_USR)
break;
}
if (sb != NULL && vtoc->efi_parts[slice].p_tag == V_USR)
*sb = vtoc->efi_parts[slice].p_start;
if (system != NULL)
*system = boot;
efi_free(vtoc);
}
(void) close(fd);
}
return (err);
}
/*
* determine where a partition starts on a disk in the current
* configuration
*/
static diskaddr_t
find_start_block(nvlist_t *config)
{
nvlist_t **child;
uint_t c, children;
diskaddr_t sb = MAXOFFSET_T;
uint64_t wholedisk;
if (nvlist_lookup_nvlist_array(config,
ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) {
if (nvlist_lookup_uint64(config,
ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk) != 0 || !wholedisk) {
return (MAXOFFSET_T);
}
if (read_efi_label(config, &sb, NULL) < 0)
sb = MAXOFFSET_T;
return (sb);
}
for (c = 0; c < children; c++) {
sb = find_start_block(child[c]);
if (sb != MAXOFFSET_T) {
return (sb);
}
}
return (MAXOFFSET_T);
}
#endif /* illumos */
/*
* Label an individual disk. The name provided is the short name,
* stripped of any leading /dev path.
*/
int
zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name,
zpool_boot_label_t boot_type, uint64_t boot_size, int *slice)
{
#ifdef illumos
char path[MAXPATHLEN];
struct dk_gpt *vtoc;
int fd;
size_t resv = EFI_MIN_RESV_SIZE;
uint64_t slice_size;
diskaddr_t start_block;
char errbuf[1024];
/* prepare an error message just in case */
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);
if (zhp) {
nvlist_t *nvroot;
verify(nvlist_lookup_nvlist(zhp->zpool_config,
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
if (zhp->zpool_start_block == 0)
start_block = find_start_block(nvroot);
else
start_block = zhp->zpool_start_block;
zhp->zpool_start_block = start_block;
} else {
/* new pool */
start_block = NEW_START_BLOCK;
}
(void) snprintf(path, sizeof (path), "%s/%s%s", ZFS_RDISK_ROOT, name,
BACKUP_SLICE);
if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
/*
* This shouldn't happen. We've long since verified that this
* is a valid device.
*/
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "unable to open device"));
return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
}
if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) {
/*
* The only way this can fail is if we run out of memory, or we
* were unable to read the disk's capacity
*/
if (errno == ENOMEM)
(void) no_memory(hdl);
(void) close(fd);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"unable to read disk capacity"), name);
return (zfs_error(hdl, EZFS_NOCAP, errbuf));
}
/*
* Why we use V_USR: V_BACKUP confuses users, and is considered
* disposable by some EFI utilities (since EFI doesn't have a backup
* slice). V_UNASSIGNED is supposed to be used only for zero size
* partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT,
* etc. were all pretty specific. V_USR is as close to reality as we
* can get, in the absence of V_OTHER.
*/
/* first fix the partition start block */
if (start_block == MAXOFFSET_T)
start_block = NEW_START_BLOCK;
/*
* EFI System partition is using slice 0.
* ZFS is on slice 1 and slice 8 is reserved.
* We assume the GPT partition table without system
* partition has zfs p_start == NEW_START_BLOCK.
* If start_block != NEW_START_BLOCK, it means we have
* system partition. Correct solution would be to query/cache vtoc
* from existing vdev member.
*/
if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
if (boot_size % vtoc->efi_lbasize != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"boot partition size must be a multiple of %d"),
vtoc->efi_lbasize);
(void) close(fd);
efi_free(vtoc);
return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
}
/*
* System partition size checks.
* Note the 1MB is quite arbitrary value, since we
* are creating dedicated pool, it should be enough
* to hold fat + efi bootloader. May need to be
* adjusted if the bootloader size will grow.
*/
if (boot_size < 1024 * 1024) {
char buf[64];
zfs_nicenum(boot_size, buf, sizeof (buf));
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Specified size %s for EFI System partition is too "
"small, the minimum size is 1MB."), buf);
(void) close(fd);
efi_free(vtoc);
return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
}
/* 33MB is tested with mkfs -F pcfs */
if (hdl->libzfs_printerr &&
((vtoc->efi_lbasize == 512 &&
boot_size < 33 * 1024 * 1024) ||
(vtoc->efi_lbasize == 4096 &&
boot_size < 256 * 1024 * 1024))) {
char buf[64];
zfs_nicenum(boot_size, buf, sizeof (buf));
(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
"Warning: EFI System partition size %s is "
"not allowing to create FAT32 file\nsystem, which "
"may result in unbootable system.\n"), buf);
}
/* Adjust zfs partition start by size of system partition. */
start_block += boot_size / vtoc->efi_lbasize;
}
if (start_block == NEW_START_BLOCK) {
/*
* Use default layout.
* ZFS is on slice 0 and slice 8 is reserved.
*/
slice_size = vtoc->efi_last_u_lba + 1;
slice_size -= EFI_MIN_RESV_SIZE;
slice_size -= start_block;
if (slice != NULL)
*slice = 0;
vtoc->efi_parts[0].p_start = start_block;
vtoc->efi_parts[0].p_size = slice_size;
vtoc->efi_parts[0].p_tag = V_USR;
(void) strcpy(vtoc->efi_parts[0].p_name, "zfs");
vtoc->efi_parts[8].p_start = slice_size + start_block;
vtoc->efi_parts[8].p_size = resv;
vtoc->efi_parts[8].p_tag = V_RESERVED;
} else {
slice_size = start_block - NEW_START_BLOCK;
vtoc->efi_parts[0].p_start = NEW_START_BLOCK;
vtoc->efi_parts[0].p_size = slice_size;
vtoc->efi_parts[0].p_tag = V_SYSTEM;
(void) strcpy(vtoc->efi_parts[0].p_name, "loader");
if (slice != NULL)
*slice = 1;
/* prepare slice 1 */
slice_size = vtoc->efi_last_u_lba + 1 - slice_size;
slice_size -= resv;
slice_size -= NEW_START_BLOCK;
vtoc->efi_parts[1].p_start = start_block;
vtoc->efi_parts[1].p_size = slice_size;
vtoc->efi_parts[1].p_tag = V_USR;
(void) strcpy(vtoc->efi_parts[1].p_name, "zfs");
vtoc->efi_parts[8].p_start = slice_size + start_block;
vtoc->efi_parts[8].p_size = resv;
vtoc->efi_parts[8].p_tag = V_RESERVED;
}
if (efi_write(fd, vtoc) != 0) {
/*
* Some block drivers (like pcata) may not support EFI
* GPT labels. Print out a helpful error message dir-
* ecting the user to manually label the disk and give
* a specific slice.
*/
(void) close(fd);
efi_free(vtoc);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"try using fdisk(1M) and then provide a specific slice"));
return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
}
(void) close(fd);
efi_free(vtoc);
#endif /* illumos */
return (0);
}
static boolean_t
supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
{
char *type;
nvlist_t **child;
uint_t children, c;
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0);
if (strcmp(type, VDEV_TYPE_FILE) == 0 ||
strcmp(type, VDEV_TYPE_HOLE) == 0 ||
strcmp(type, VDEV_TYPE_MISSING) == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"vdev type '%s' is not supported"), type);
(void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf);
return (B_FALSE);
}
if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
&child, &children) == 0) {
for (c = 0; c < children; c++) {
if (!supported_dump_vdev_type(hdl, child[c], errbuf))
return (B_FALSE);
}
}
return (B_TRUE);
}
/*
* Check if this zvol is allowable for use as a dump device; zero if
* it is, > 0 if it isn't, < 0 if it isn't a zvol.
*
* Allowable storage configurations include mirrors, all raidz variants, and
* pools with log, cache, and spare devices. Pools which are backed by files or
* have missing/hole vdevs are not suitable.
*/
int
zvol_check_dump_config(char *arg)
{
zpool_handle_t *zhp = NULL;
nvlist_t *config, *nvroot;
char *p, *volname;
nvlist_t **top;
uint_t toplevels;
libzfs_handle_t *hdl;
char errbuf[1024];
char poolname[ZFS_MAX_DATASET_NAME_LEN];
int pathlen = strlen(ZVOL_FULL_DEV_DIR);
int ret = 1;
if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) {
return (-1);
}
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"dump is not supported on device '%s'"), arg);
if ((hdl = libzfs_init()) == NULL)
return (1);
libzfs_print_on_error(hdl, B_TRUE);
volname = arg + pathlen;
/* check the configuration of the pool */
if ((p = strchr(volname, '/')) == NULL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"malformed dataset name"));
(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
return (1);
} else if (p - volname >= ZFS_MAX_DATASET_NAME_LEN) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"dataset name is too long"));
(void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf);
return (1);
} else {
(void) strncpy(poolname, volname, p - volname);
poolname[p - volname] = '\0';
}
if ((zhp = zpool_open(hdl, poolname)) == NULL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"could not open pool '%s'"), poolname);
(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
goto out;
}
config = zpool_get_config(zhp, NULL);
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"could not obtain vdev configuration for '%s'"), poolname);
(void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf);
goto out;
}
verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&top, &toplevels) == 0);
if (!supported_dump_vdev_type(hdl, top[0], errbuf)) {
goto out;
}
ret = 0;
out:
if (zhp)
zpool_close(zhp);
libzfs_fini(hdl);
return (ret);
}
int
zpool_nextboot(libzfs_handle_t *hdl, uint64_t pool_guid, uint64_t dev_guid,
const char *command)
{
zfs_cmd_t zc = { 0 };
nvlist_t *args;
char *packed;
size_t size;
int error;
args = fnvlist_alloc();
fnvlist_add_uint64(args, ZPOOL_CONFIG_POOL_GUID, pool_guid);
fnvlist_add_uint64(args, ZPOOL_CONFIG_GUID, dev_guid);
fnvlist_add_string(args, "command", command);
error = zcmd_write_src_nvlist(hdl, &zc, args);
if (error == 0)
error = ioctl(hdl->libzfs_fd, ZFS_IOC_NEXTBOOT, &zc);
zcmd_free_nvlists(&zc);
nvlist_free(args);
return (error);
}
Index: stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
===================================================================
--- stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c (revision 332525)
@@ -1,1528 +1,1535 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright (c) 2017 Datto Inc.
*/
/*
* Internal utility routines for the ZFS library.
*/
#include <sys/param.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include <libintl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <ctype.h>
#include <math.h>
#include <sys/mnttab.h>
#include <sys/mntent.h>
#include <sys/types.h>
#include <libcmdutils.h>
#include <libzfs.h>
#include <libzfs_core.h>
#include "libzfs_impl.h"
#include "zfs_prop.h"
#include "zfeature_common.h"
int
libzfs_errno(libzfs_handle_t *hdl)
{
return (hdl->libzfs_error);
}
const char *
libzfs_error_action(libzfs_handle_t *hdl)
{
return (hdl->libzfs_action);
}
const char *
libzfs_error_description(libzfs_handle_t *hdl)
{
if (hdl->libzfs_desc[0] != '\0')
return (hdl->libzfs_desc);
switch (hdl->libzfs_error) {
case EZFS_NOMEM:
return (dgettext(TEXT_DOMAIN, "out of memory"));
case EZFS_BADPROP:
return (dgettext(TEXT_DOMAIN, "invalid property value"));
case EZFS_PROPREADONLY:
return (dgettext(TEXT_DOMAIN, "read-only property"));
case EZFS_PROPTYPE:
return (dgettext(TEXT_DOMAIN, "property doesn't apply to "
"datasets of this type"));
case EZFS_PROPNONINHERIT:
return (dgettext(TEXT_DOMAIN, "property cannot be inherited"));
case EZFS_PROPSPACE:
return (dgettext(TEXT_DOMAIN, "invalid quota or reservation"));
case EZFS_BADTYPE:
return (dgettext(TEXT_DOMAIN, "operation not applicable to "
"datasets of this type"));
case EZFS_BUSY:
return (dgettext(TEXT_DOMAIN, "pool or dataset is busy"));
case EZFS_EXISTS:
return (dgettext(TEXT_DOMAIN, "pool or dataset exists"));
case EZFS_NOENT:
return (dgettext(TEXT_DOMAIN, "no such pool or dataset"));
case EZFS_BADSTREAM:
return (dgettext(TEXT_DOMAIN, "invalid backup stream"));
case EZFS_DSREADONLY:
return (dgettext(TEXT_DOMAIN, "dataset is read-only"));
case EZFS_VOLTOOBIG:
return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
"this system"));
case EZFS_INVALIDNAME:
return (dgettext(TEXT_DOMAIN, "invalid name"));
case EZFS_BADRESTORE:
return (dgettext(TEXT_DOMAIN, "unable to restore to "
"destination"));
case EZFS_BADBACKUP:
return (dgettext(TEXT_DOMAIN, "backup failed"));
case EZFS_BADTARGET:
return (dgettext(TEXT_DOMAIN, "invalid target vdev"));
case EZFS_NODEVICE:
return (dgettext(TEXT_DOMAIN, "no such device in pool"));
case EZFS_BADDEV:
return (dgettext(TEXT_DOMAIN, "invalid device"));
case EZFS_NOREPLICAS:
return (dgettext(TEXT_DOMAIN, "no valid replicas"));
case EZFS_RESILVERING:
return (dgettext(TEXT_DOMAIN, "currently resilvering"));
case EZFS_BADVERSION:
return (dgettext(TEXT_DOMAIN, "unsupported version or "
"feature"));
case EZFS_POOLUNAVAIL:
return (dgettext(TEXT_DOMAIN, "pool is unavailable"));
case EZFS_DEVOVERFLOW:
return (dgettext(TEXT_DOMAIN, "too many devices in one vdev"));
case EZFS_BADPATH:
return (dgettext(TEXT_DOMAIN, "must be an absolute path"));
case EZFS_CROSSTARGET:
return (dgettext(TEXT_DOMAIN, "operation crosses datasets or "
"pools"));
case EZFS_ZONED:
return (dgettext(TEXT_DOMAIN, "dataset in use by local zone"));
case EZFS_MOUNTFAILED:
return (dgettext(TEXT_DOMAIN, "mount failed"));
case EZFS_UMOUNTFAILED:
return (dgettext(TEXT_DOMAIN, "umount failed"));
case EZFS_UNSHARENFSFAILED:
return (dgettext(TEXT_DOMAIN, "unshare(1M) failed"));
case EZFS_SHARENFSFAILED:
return (dgettext(TEXT_DOMAIN, "share(1M) failed"));
case EZFS_UNSHARESMBFAILED:
return (dgettext(TEXT_DOMAIN, "smb remove share failed"));
case EZFS_SHARESMBFAILED:
return (dgettext(TEXT_DOMAIN, "smb add share failed"));
case EZFS_PERM:
return (dgettext(TEXT_DOMAIN, "permission denied"));
case EZFS_NOSPC:
return (dgettext(TEXT_DOMAIN, "out of space"));
case EZFS_FAULT:
return (dgettext(TEXT_DOMAIN, "bad address"));
case EZFS_IO:
return (dgettext(TEXT_DOMAIN, "I/O error"));
case EZFS_INTR:
return (dgettext(TEXT_DOMAIN, "signal received"));
case EZFS_ISSPARE:
return (dgettext(TEXT_DOMAIN, "device is reserved as a hot "
"spare"));
case EZFS_INVALCONFIG:
return (dgettext(TEXT_DOMAIN, "invalid vdev configuration"));
case EZFS_RECURSIVE:
return (dgettext(TEXT_DOMAIN, "recursive dataset dependency"));
case EZFS_NOHISTORY:
return (dgettext(TEXT_DOMAIN, "no history available"));
case EZFS_POOLPROPS:
return (dgettext(TEXT_DOMAIN, "failed to retrieve "
"pool properties"));
case EZFS_POOL_NOTSUP:
return (dgettext(TEXT_DOMAIN, "operation not supported "
"on this type of pool"));
case EZFS_POOL_INVALARG:
return (dgettext(TEXT_DOMAIN, "invalid argument for "
"this pool operation"));
case EZFS_NAMETOOLONG:
return (dgettext(TEXT_DOMAIN, "dataset name is too long"));
case EZFS_OPENFAILED:
return (dgettext(TEXT_DOMAIN, "open failed"));
case EZFS_NOCAP:
return (dgettext(TEXT_DOMAIN,
"disk capacity information could not be retrieved"));
case EZFS_LABELFAILED:
return (dgettext(TEXT_DOMAIN, "write of label failed"));
case EZFS_BADWHO:
return (dgettext(TEXT_DOMAIN, "invalid user/group"));
case EZFS_BADPERM:
return (dgettext(TEXT_DOMAIN, "invalid permission"));
case EZFS_BADPERMSET:
return (dgettext(TEXT_DOMAIN, "invalid permission set name"));
case EZFS_NODELEGATION:
return (dgettext(TEXT_DOMAIN, "delegated administration is "
"disabled on pool"));
case EZFS_BADCACHE:
return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
case EZFS_ISL2CACHE:
return (dgettext(TEXT_DOMAIN, "device is in use as a cache"));
case EZFS_VDEVNOTSUP:
return (dgettext(TEXT_DOMAIN, "vdev specification is not "
"supported"));
case EZFS_NOTSUP:
return (dgettext(TEXT_DOMAIN, "operation not supported "
"on this dataset"));
case EZFS_ACTIVE_SPARE:
return (dgettext(TEXT_DOMAIN, "pool has active shared spare "
"device"));
case EZFS_UNPLAYED_LOGS:
return (dgettext(TEXT_DOMAIN, "log device has unplayed intent "
"logs"));
case EZFS_REFTAG_RELE:
return (dgettext(TEXT_DOMAIN, "no such tag on this dataset"));
case EZFS_REFTAG_HOLD:
return (dgettext(TEXT_DOMAIN, "tag already exists on this "
"dataset"));
case EZFS_TAGTOOLONG:
return (dgettext(TEXT_DOMAIN, "tag too long"));
case EZFS_PIPEFAILED:
return (dgettext(TEXT_DOMAIN, "pipe create failed"));
case EZFS_THREADCREATEFAILED:
return (dgettext(TEXT_DOMAIN, "thread create failed"));
case EZFS_POSTSPLIT_ONLINE:
return (dgettext(TEXT_DOMAIN, "disk was split from this pool "
"into a new one"));
case EZFS_SCRUB_PAUSED:
return (dgettext(TEXT_DOMAIN, "scrub is paused; "
"use 'zpool scrub' to resume"));
case EZFS_SCRUBBING:
return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
"use 'zpool scrub -s' to cancel current scrub"));
case EZFS_NO_SCRUB:
return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
case EZFS_DIFF:
return (dgettext(TEXT_DOMAIN, "unable to generate diffs"));
case EZFS_DIFFDATA:
return (dgettext(TEXT_DOMAIN, "invalid diff data"));
case EZFS_POOLREADONLY:
return (dgettext(TEXT_DOMAIN, "pool is read-only"));
+ case EZFS_NO_PENDING:
+ return (dgettext(TEXT_DOMAIN, "operation is not "
+ "in progress"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
assert(hdl->libzfs_error == 0);
return (dgettext(TEXT_DOMAIN, "no error"));
}
}
/*PRINTFLIKE2*/
void
zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
(void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc),
fmt, ap);
hdl->libzfs_desc_active = 1;
va_end(ap);
}
static void
zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap)
{
(void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action),
fmt, ap);
hdl->libzfs_error = error;
if (hdl->libzfs_desc_active)
hdl->libzfs_desc_active = 0;
else
hdl->libzfs_desc[0] = '\0';
if (hdl->libzfs_printerr) {
if (error == EZFS_UNKNOWN) {
(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal "
"error: %s\n"), libzfs_error_description(hdl));
abort();
}
(void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action,
libzfs_error_description(hdl));
if (error == EZFS_NOMEM)
exit(1);
}
}
int
zfs_error(libzfs_handle_t *hdl, int error, const char *msg)
{
return (zfs_error_fmt(hdl, error, "%s", msg));
}
/*PRINTFLIKE3*/
int
zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
zfs_verror(hdl, error, fmt, ap);
va_end(ap);
return (-1);
}
static int
zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt,
va_list ap)
{
switch (error) {
case EPERM:
case EACCES:
zfs_verror(hdl, EZFS_PERM, fmt, ap);
return (-1);
case ECANCELED:
zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap);
return (-1);
case EIO:
zfs_verror(hdl, EZFS_IO, fmt, ap);
return (-1);
case EFAULT:
zfs_verror(hdl, EZFS_FAULT, fmt, ap);
return (-1);
case EINTR:
zfs_verror(hdl, EZFS_INTR, fmt, ap);
return (-1);
}
return (0);
}
int
zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg)
{
return (zfs_standard_error_fmt(hdl, error, "%s", msg));
}
/*PRINTFLIKE3*/
int
zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
if (zfs_common_error(hdl, error, fmt, ap) != 0) {
va_end(ap);
return (-1);
}
switch (error) {
case ENXIO:
case ENODEV:
case EPIPE:
zfs_verror(hdl, EZFS_IO, fmt, ap);
break;
case ENOENT:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"dataset does not exist"));
zfs_verror(hdl, EZFS_NOENT, fmt, ap);
break;
case ENOSPC:
case EDQUOT:
zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
va_end(ap);
return (-1);
case EEXIST:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"dataset already exists"));
zfs_verror(hdl, EZFS_EXISTS, fmt, ap);
break;
case EBUSY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"dataset is busy"));
zfs_verror(hdl, EZFS_BUSY, fmt, ap);
break;
case EROFS:
zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
break;
case ENAMETOOLONG:
zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap);
break;
case ENOTSUP:
zfs_verror(hdl, EZFS_BADVERSION, fmt, ap);
break;
case EAGAIN:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool I/O is currently suspended"));
zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
break;
default:
zfs_error_aux(hdl, strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
break;
}
va_end(ap);
return (-1);
}
int
zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg)
{
return (zpool_standard_error_fmt(hdl, error, "%s", msg));
}
/*PRINTFLIKE3*/
int
zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
if (zfs_common_error(hdl, error, fmt, ap) != 0) {
va_end(ap);
return (-1);
}
switch (error) {
case ENODEV:
zfs_verror(hdl, EZFS_NODEVICE, fmt, ap);
break;
case ENOENT:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "no such pool or dataset"));
zfs_verror(hdl, EZFS_NOENT, fmt, ap);
break;
case EEXIST:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool already exists"));
zfs_verror(hdl, EZFS_EXISTS, fmt, ap);
break;
case EBUSY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy"));
zfs_verror(hdl, EZFS_BUSY, fmt, ap);
break;
case ENXIO:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more devices is currently unavailable"));
zfs_verror(hdl, EZFS_BADDEV, fmt, ap);
break;
case ENAMETOOLONG:
zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap);
break;
case ENOTSUP:
zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap);
break;
case EINVAL:
zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap);
break;
case ENOSPC:
case EDQUOT:
zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
va_end(ap);
return (-1);
case EAGAIN:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool I/O is currently suspended"));
zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
break;
case EROFS:
zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
+ break;
+ /* There is no pending operation to cancel */
+ case ESRCH:
+ zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap);
break;
default:
zfs_error_aux(hdl, strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
}
va_end(ap);
return (-1);
}
/*
* Display an out of memory error message and abort the current program.
*/
int
no_memory(libzfs_handle_t *hdl)
{
return (zfs_error(hdl, EZFS_NOMEM, "internal error"));
}
/*
* A safe form of malloc() which will die if the allocation fails.
*/
void *
zfs_alloc(libzfs_handle_t *hdl, size_t size)
{
void *data;
if ((data = calloc(1, size)) == NULL)
(void) no_memory(hdl);
return (data);
}
/*
* A safe form of asprintf() which will die if the allocation fails.
*/
/*PRINTFLIKE2*/
char *
zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...)
{
va_list ap;
char *ret;
int err;
va_start(ap, fmt);
err = vasprintf(&ret, fmt, ap);
va_end(ap);
if (err < 0)
(void) no_memory(hdl);
return (ret);
}
/*
* A safe form of realloc(), which also zeroes newly allocated space.
*/
void *
zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize)
{
void *ret;
if ((ret = realloc(ptr, newsize)) == NULL) {
(void) no_memory(hdl);
return (NULL);
}
bzero((char *)ret + oldsize, (newsize - oldsize));
return (ret);
}
/*
* A safe form of strdup() which will die if the allocation fails.
*/
char *
zfs_strdup(libzfs_handle_t *hdl, const char *str)
{
char *ret;
if ((ret = strdup(str)) == NULL)
(void) no_memory(hdl);
return (ret);
}
/*
* Convert a number to an appropriately human-readable output.
*/
void
zfs_nicenum(uint64_t num, char *buf, size_t buflen)
{
nicenum(num, buf, buflen);
}
void
libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr)
{
hdl->libzfs_printerr = printerr;
}
static int
libzfs_load(void)
{
int error;
if (modfind("zfs") < 0) {
/* Not present in kernel, try loading it. */
if (kldload("zfs") < 0 || modfind("zfs") < 0) {
if (errno != EEXIST)
return (-1);
}
}
return (0);
}
libzfs_handle_t *
libzfs_init(void)
{
libzfs_handle_t *hdl;
if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) {
return (NULL);
}
if (libzfs_load() < 0) {
free(hdl);
return (NULL);
}
if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
free(hdl);
return (NULL);
}
if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) {
(void) close(hdl->libzfs_fd);
free(hdl);
return (NULL);
}
hdl->libzfs_sharetab = fopen(ZFS_EXPORTS_PATH, "r");
if (libzfs_core_init() != 0) {
(void) close(hdl->libzfs_fd);
(void) fclose(hdl->libzfs_mnttab);
(void) fclose(hdl->libzfs_sharetab);
free(hdl);
return (NULL);
}
zfs_prop_init();
zpool_prop_init();
zpool_feature_init();
libzfs_mnttab_init(hdl);
if (getenv("ZFS_PROP_DEBUG") != NULL) {
hdl->libzfs_prop_debug = B_TRUE;
}
return (hdl);
}
void
libzfs_fini(libzfs_handle_t *hdl)
{
(void) close(hdl->libzfs_fd);
if (hdl->libzfs_mnttab)
(void) fclose(hdl->libzfs_mnttab);
if (hdl->libzfs_sharetab)
(void) fclose(hdl->libzfs_sharetab);
zfs_uninit_libshare(hdl);
zpool_free_handles(hdl);
#ifdef illumos
libzfs_fru_clear(hdl, B_TRUE);
#endif
namespace_clear(hdl);
libzfs_mnttab_fini(hdl);
libzfs_core_fini();
free(hdl);
}
libzfs_handle_t *
zpool_get_handle(zpool_handle_t *zhp)
{
return (zhp->zpool_hdl);
}
libzfs_handle_t *
zfs_get_handle(zfs_handle_t *zhp)
{
return (zhp->zfs_hdl);
}
zpool_handle_t *
zfs_get_pool_handle(const zfs_handle_t *zhp)
{
return (zhp->zpool_hdl);
}
/*
* Given a name, determine whether or not it's a valid path
* (starts with '/' or "./"). If so, walk the mnttab trying
* to match the device number. If not, treat the path as an
* fs/vol/snap/bkmark name.
*/
zfs_handle_t *
zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype)
{
struct stat64 statbuf;
struct extmnttab entry;
int ret;
if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) {
/*
* It's not a valid path, assume it's a name of type 'argtype'.
*/
return (zfs_open(hdl, path, argtype));
}
if (stat64(path, &statbuf) != 0) {
(void) fprintf(stderr, "%s: %s\n", path, strerror(errno));
return (NULL);
}
#ifdef illumos
rewind(hdl->libzfs_mnttab);
while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) {
if (makedevice(entry.mnt_major, entry.mnt_minor) ==
statbuf.st_dev) {
break;
}
}
#else
{
struct statfs sfs;
ret = statfs(path, &sfs);
if (ret == 0)
statfs2mnttab(&sfs, &entry);
else {
(void) fprintf(stderr, "%s: %s\n", path,
strerror(errno));
}
}
#endif /* illumos */
if (ret != 0) {
return (NULL);
}
if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
(void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"),
path);
return (NULL);
}
return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM));
}
/*
* Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from
* an ioctl().
*/
int
zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
{
if (len == 0)
len = 16 * 1024;
zc->zc_nvlist_dst_size = len;
zc->zc_nvlist_dst =
(uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);
if (zc->zc_nvlist_dst == 0)
return (-1);
return (0);
}
/*
* Called when an ioctl() which returns an nvlist fails with ENOMEM. This will
* expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was
* filled in by the kernel to indicate the actual required size.
*/
int
zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc)
{
free((void *)(uintptr_t)zc->zc_nvlist_dst);
zc->zc_nvlist_dst =
(uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);
if (zc->zc_nvlist_dst == 0)
return (-1);
return (0);
}
/*
* Called to free the src and dst nvlists stored in the command structure.
*/
void
zcmd_free_nvlists(zfs_cmd_t *zc)
{
free((void *)(uintptr_t)zc->zc_nvlist_conf);
free((void *)(uintptr_t)zc->zc_nvlist_src);
free((void *)(uintptr_t)zc->zc_nvlist_dst);
zc->zc_nvlist_conf = NULL;
zc->zc_nvlist_src = NULL;
zc->zc_nvlist_dst = NULL;
}
static int
zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen,
nvlist_t *nvl)
{
char *packed;
size_t len;
verify(nvlist_size(nvl, &len, NV_ENCODE_NATIVE) == 0);
if ((packed = zfs_alloc(hdl, len)) == NULL)
return (-1);
verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0);
*outnv = (uint64_t)(uintptr_t)packed;
*outlen = len;
return (0);
}
int
zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl)
{
return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf,
&zc->zc_nvlist_conf_size, nvl));
}
int
zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl)
{
return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src,
&zc->zc_nvlist_src_size, nvl));
}
/*
* Unpacks an nvlist from the ZFS ioctl command structure.
*/
int
zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp)
{
if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst,
zc->zc_nvlist_dst_size, nvlp, 0) != 0)
return (no_memory(hdl));
return (0);
}
int
zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc)
{
return (ioctl(hdl->libzfs_fd, request, zc));
}
/*
* ================================================================
* API shared by zfs and zpool property management
* ================================================================
*/
static void
zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
{
zprop_list_t *pl = cbp->cb_proplist;
int i;
char *title;
size_t len;
cbp->cb_first = B_FALSE;
if (cbp->cb_scripted)
return;
/*
* Start with the length of the column headers.
*/
cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME"));
cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN,
"PROPERTY"));
cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN,
"VALUE"));
cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN,
"RECEIVED"));
cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN,
"SOURCE"));
/* first property is always NAME */
assert(cbp->cb_proplist->pl_prop ==
((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME));
/*
* Go through and calculate the widths for each column. For the
* 'source' column, we kludge it up by taking the worst-case scenario of
* inheriting from the longest name. This is acceptable because in the
* majority of cases 'SOURCE' is the last column displayed, and we don't
* use the width anyway. Note that the 'VALUE' column can be oversized,
* if the name of the property is much longer than any values we find.
*/
for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {
/*
* 'PROPERTY' column
*/
if (pl->pl_prop != ZPROP_INVAL) {
const char *propname = (type == ZFS_TYPE_POOL) ?
zpool_prop_to_name(pl->pl_prop) :
zfs_prop_to_name(pl->pl_prop);
len = strlen(propname);
if (len > cbp->cb_colwidths[GET_COL_PROPERTY])
cbp->cb_colwidths[GET_COL_PROPERTY] = len;
} else {
len = strlen(pl->pl_user_prop);
if (len > cbp->cb_colwidths[GET_COL_PROPERTY])
cbp->cb_colwidths[GET_COL_PROPERTY] = len;
}
/*
* 'VALUE' column. The first property is always the 'name'
* property that was tacked on either by /sbin/zfs's
* zfs_do_get() or when calling zprop_expand_list(), so we
* ignore its width. If the user specified the name property
* to display, then it will be later in the list in any case.
*/
if (pl != cbp->cb_proplist &&
pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE])
cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width;
/* 'RECEIVED' column. */
if (pl != cbp->cb_proplist &&
pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD])
cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width;
/*
* 'NAME' and 'SOURCE' columns
*/
if (pl->pl_prop == (type == ZFS_TYPE_POOL ? ZPOOL_PROP_NAME :
ZFS_PROP_NAME) &&
pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) {
cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width;
cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width +
strlen(dgettext(TEXT_DOMAIN, "inherited from"));
}
}
/*
* Now go through and print the headers.
*/
for (i = 0; i < ZFS_GET_NCOLS; i++) {
switch (cbp->cb_columns[i]) {
case GET_COL_NAME:
title = dgettext(TEXT_DOMAIN, "NAME");
break;
case GET_COL_PROPERTY:
title = dgettext(TEXT_DOMAIN, "PROPERTY");
break;
case GET_COL_VALUE:
title = dgettext(TEXT_DOMAIN, "VALUE");
break;
case GET_COL_RECVD:
title = dgettext(TEXT_DOMAIN, "RECEIVED");
break;
case GET_COL_SOURCE:
title = dgettext(TEXT_DOMAIN, "SOURCE");
break;
default:
title = NULL;
}
if (title != NULL) {
if (i == (ZFS_GET_NCOLS - 1) ||
cbp->cb_columns[i + 1] == GET_COL_NONE)
(void) printf("%s", title);
else
(void) printf("%-*s ",
cbp->cb_colwidths[cbp->cb_columns[i]],
title);
}
}
(void) printf("\n");
}
/*
* Display a single line of output, according to the settings in the callback
* structure.
*/
void
zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
const char *propname, const char *value, zprop_source_t sourcetype,
const char *source, const char *recvd_value)
{
int i;
const char *str = NULL;
char buf[128];
/*
* Ignore those source types that the user has chosen to ignore.
*/
if ((sourcetype & cbp->cb_sources) == 0)
return;
if (cbp->cb_first)
zprop_print_headers(cbp, cbp->cb_type);
for (i = 0; i < ZFS_GET_NCOLS; i++) {
switch (cbp->cb_columns[i]) {
case GET_COL_NAME:
str = name;
break;
case GET_COL_PROPERTY:
str = propname;
break;
case GET_COL_VALUE:
str = value;
break;
case GET_COL_SOURCE:
switch (sourcetype) {
case ZPROP_SRC_NONE:
str = "-";
break;
case ZPROP_SRC_DEFAULT:
str = "default";
break;
case ZPROP_SRC_LOCAL:
str = "local";
break;
case ZPROP_SRC_TEMPORARY:
str = "temporary";
break;
case ZPROP_SRC_INHERITED:
(void) snprintf(buf, sizeof (buf),
"inherited from %s", source);
str = buf;
break;
case ZPROP_SRC_RECEIVED:
str = "received";
break;
default:
str = NULL;
assert(!"unhandled zprop_source_t");
}
break;
case GET_COL_RECVD:
str = (recvd_value == NULL ? "-" : recvd_value);
break;
default:
continue;
}
if (cbp->cb_columns[i + 1] == GET_COL_NONE)
(void) printf("%s", str);
else if (cbp->cb_scripted)
(void) printf("%s\t", str);
else
(void) printf("%-*s ",
cbp->cb_colwidths[cbp->cb_columns[i]],
str);
}
(void) printf("\n");
}
/*
* Given a numeric suffix, convert the value into a number of bits that the
* resulting value must be shifted.
*/
static int
str2shift(libzfs_handle_t *hdl, const char *buf)
{
const char *ends = "BKMGTPEZ";
int i;
if (buf[0] == '\0')
return (0);
for (i = 0; i < strlen(ends); i++) {
if (toupper(buf[0]) == ends[i])
break;
}
if (i == strlen(ends)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid numeric suffix '%s'"), buf);
return (-1);
}
/*
* We want to allow trailing 'b' characters for 'GB' or 'Mb'. But don't
* allow 'BB' - that's just weird.
*/
if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0' &&
toupper(buf[0]) != 'B'))
return (10*i);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid numeric suffix '%s'"), buf);
return (-1);
}
/*
* Convert a string of the form '100G' into a real number. Used when setting
* properties or creating a volume. 'buf' is used to place an extended error
* message for the caller to use.
*/
int
zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
{
char *end;
int shift;
*num = 0;
/* Check to see if this looks like a number. */
if ((value[0] < '0' || value[0] > '9') && value[0] != '.') {
if (hdl)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"bad numeric value '%s'"), value);
return (-1);
}
/* Rely on strtoull() to process the numeric portion. */
errno = 0;
*num = strtoull(value, &end, 10);
/*
* Check for ERANGE, which indicates that the value is too large to fit
* in a 64-bit value.
*/
if (errno == ERANGE) {
if (hdl)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"numeric value is too large"));
return (-1);
}
/*
* If we have a decimal value, then do the computation with floating
* point arithmetic. Otherwise, use standard arithmetic.
*/
if (*end == '.') {
double fval = strtod(value, &end);
if ((shift = str2shift(hdl, end)) == -1)
return (-1);
fval *= pow(2, shift);
if (fval > UINT64_MAX) {
if (hdl)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"numeric value is too large"));
return (-1);
}
*num = (uint64_t)fval;
} else {
if ((shift = str2shift(hdl, end)) == -1)
return (-1);
/* Check for overflow */
if (shift >= 64 || (*num << shift) >> shift != *num) {
if (hdl)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"numeric value is too large"));
return (-1);
}
*num <<= shift;
}
return (0);
}
/*
* Given a propname=value nvpair to set, parse any numeric properties
* (index, boolean, etc) if they are specified as strings and add the
* resulting nvpair to the returned nvlist.
*
* At the DSL layer, all properties are either 64-bit numbers or strings.
* We want the user to be able to ignore this fact and specify properties
* as native values (numbers, for example) or as strings (to simplify
* command line utilities). This also handles converting index types
* (compression, checksum, etc) from strings to their on-disk index.
*/
int
zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp,
const char *errbuf)
{
data_type_t datatype = nvpair_type(elem);
zprop_type_t proptype;
const char *propname;
char *value;
boolean_t isnone = B_FALSE;
if (type == ZFS_TYPE_POOL) {
proptype = zpool_prop_get_type(prop);
propname = zpool_prop_to_name(prop);
} else {
proptype = zfs_prop_get_type(prop);
propname = zfs_prop_to_name(prop);
}
/*
* Convert any properties to the internal DSL value types.
*/
*svalp = NULL;
*ivalp = 0;
switch (proptype) {
case PROP_TYPE_STRING:
if (datatype != DATA_TYPE_STRING) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be a string"), nvpair_name(elem));
goto error;
}
(void) nvpair_value_string(elem, svalp);
if (strlen(*svalp) >= ZFS_MAXPROPLEN) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' is too long"), nvpair_name(elem));
goto error;
}
break;
case PROP_TYPE_NUMBER:
if (datatype == DATA_TYPE_STRING) {
(void) nvpair_value_string(elem, &value);
if (strcmp(value, "none") == 0) {
isnone = B_TRUE;
} else if (zfs_nicestrtonum(hdl, value, ivalp)
!= 0) {
goto error;
}
} else if (datatype == DATA_TYPE_UINT64) {
(void) nvpair_value_uint64(elem, ivalp);
} else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be a number"), nvpair_name(elem));
goto error;
}
/*
* Quota special: force 'none' and don't allow 0.
*/
if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone &&
(prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"use 'none' to disable quota/refquota"));
goto error;
}
/*
* Special handling for "*_limit=none". In this case it's not
* 0 but UINT64_MAX.
*/
if ((type & ZFS_TYPE_DATASET) && isnone &&
(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
prop == ZFS_PROP_SNAPSHOT_LIMIT)) {
*ivalp = UINT64_MAX;
}
break;
case PROP_TYPE_INDEX:
if (datatype != DATA_TYPE_STRING) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be a string"), nvpair_name(elem));
goto error;
}
(void) nvpair_value_string(elem, &value);
if (zprop_string_to_index(prop, value, ivalp, type) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' must be one of '%s'"), propname,
zprop_values(prop, type));
goto error;
}
break;
default:
abort();
}
/*
* Add the result to our return set of properties.
*/
if (*svalp != NULL) {
if (nvlist_add_string(ret, propname, *svalp) != 0) {
(void) no_memory(hdl);
return (-1);
}
} else {
if (nvlist_add_uint64(ret, propname, *ivalp) != 0) {
(void) no_memory(hdl);
return (-1);
}
}
return (0);
error:
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
return (-1);
}
static int
addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp,
zfs_type_t type)
{
int prop;
zprop_list_t *entry;
prop = zprop_name_to_prop(propname, type);
if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type))
prop = ZPROP_INVAL;
/*
* When no property table entry can be found, return failure if
* this is a pool property or if this isn't a user-defined
* dataset property,
*/
if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL &&
!zpool_prop_feature(propname) &&
!zpool_prop_unsupported(propname)) ||
(type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) &&
!zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid property '%s'"), propname);
return (zfs_error(hdl, EZFS_BADPROP,
dgettext(TEXT_DOMAIN, "bad property list")));
}
if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL)
return (-1);
entry->pl_prop = prop;
if (prop == ZPROP_INVAL) {
if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) ==
NULL) {
free(entry);
return (-1);
}
entry->pl_width = strlen(propname);
} else {
entry->pl_width = zprop_width(prop, &entry->pl_fixed,
type);
}
*listp = entry;
return (0);
}
/*
* Given a comma-separated list of properties, construct a property list
* containing both user-defined and native properties. This function will
* return a NULL list if 'all' is specified, which can later be expanded
* by zprop_expand_list().
*/
int
zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp,
zfs_type_t type)
{
*listp = NULL;
/*
* If 'all' is specified, return a NULL list.
*/
if (strcmp(props, "all") == 0)
return (0);
/*
* If no props were specified, return an error.
*/
if (props[0] == '\0') {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"no properties specified"));
return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN,
"bad property list")));
}
/*
* It would be nice to use getsubopt() here, but the inclusion of column
* aliases makes this more effort than it's worth.
*/
while (*props != '\0') {
size_t len;
char *p;
char c;
if ((p = strchr(props, ',')) == NULL) {
len = strlen(props);
p = props + len;
} else {
len = p - props;
}
/*
* Check for empty options.
*/
if (len == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"empty property name"));
return (zfs_error(hdl, EZFS_BADPROP,
dgettext(TEXT_DOMAIN, "bad property list")));
}
/*
* Check all regular property names.
*/
c = props[len];
props[len] = '\0';
if (strcmp(props, "space") == 0) {
static char *spaceprops[] = {
"name", "avail", "used", "usedbysnapshots",
"usedbydataset", "usedbyrefreservation",
"usedbychildren", NULL
};
int i;
for (i = 0; spaceprops[i]; i++) {
if (addlist(hdl, spaceprops[i], listp, type))
return (-1);
listp = &(*listp)->pl_next;
}
} else {
if (addlist(hdl, props, listp, type))
return (-1);
listp = &(*listp)->pl_next;
}
props = p;
if (c == ',')
props++;
}
return (0);
}
void
zprop_free_list(zprop_list_t *pl)
{
zprop_list_t *next;
while (pl != NULL) {
next = pl->pl_next;
free(pl->pl_user_prop);
free(pl);
pl = next;
}
}
typedef struct expand_data {
zprop_list_t **last;
libzfs_handle_t *hdl;
zfs_type_t type;
} expand_data_t;
int
zprop_expand_list_cb(int prop, void *cb)
{
zprop_list_t *entry;
expand_data_t *edp = cb;
if ((entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t))) == NULL)
return (ZPROP_INVAL);
entry->pl_prop = prop;
entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type);
entry->pl_all = B_TRUE;
*(edp->last) = entry;
edp->last = &entry->pl_next;
return (ZPROP_CONT);
}
int
zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type)
{
zprop_list_t *entry;
zprop_list_t **last;
expand_data_t exp;
if (*plp == NULL) {
/*
* If this is the very first time we've been called for an 'all'
* specification, expand the list to include all native
* properties.
*/
last = plp;
exp.last = last;
exp.hdl = hdl;
exp.type = type;
if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE,
B_FALSE, type) == ZPROP_INVAL)
return (-1);
/*
* Add 'name' to the beginning of the list, which is handled
* specially.
*/
if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL)
return (-1);
entry->pl_prop = (type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME :
ZFS_PROP_NAME;
entry->pl_width = zprop_width(entry->pl_prop,
&entry->pl_fixed, type);
entry->pl_all = B_TRUE;
entry->pl_next = *plp;
*plp = entry;
}
return (0);
}
int
zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered,
zfs_type_t type)
{
return (zprop_iter_common(func, cb, show_all, ordered, type));
}
Index: stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
===================================================================
--- stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c (revision 332525)
@@ -1,1009 +1,1019 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 RackTop Systems.
*/
/*
* LibZFS_Core (lzc) is intended to replace most functionality in libzfs.
* It has the following characteristics:
*
* - Thread Safe. libzfs_core is accessible concurrently from multiple
* threads. This is accomplished primarily by avoiding global data
* (e.g. caching). Since it's thread-safe, there is no reason for a
* process to have multiple libzfs "instances". Therefore, we store
* our few pieces of data (e.g. the file descriptor) in global
* variables. The fd is reference-counted so that the libzfs_core
* library can be "initialized" multiple times (e.g. by different
* consumers within the same process).
*
* - Committed Interface. The libzfs_core interface will be committed,
* therefore consumers can compile against it and be confident that
* their code will continue to work on future releases of this code.
* Currently, the interface is Evolving (not Committed), but we intend
* to commit to it once it is more complete and we determine that it
* meets the needs of all consumers.
*
* - Programatic Error Handling. libzfs_core communicates errors with
* defined error numbers, and doesn't print anything to stdout/stderr.
*
* - Thin Layer. libzfs_core is a thin layer, marshaling arguments
* to/from the kernel ioctls. There is generally a 1:1 correspondence
* between libzfs_core functions and ioctls to /dev/zfs.
*
* - Clear Atomicity. Because libzfs_core functions are generally 1:1
* with kernel ioctls, and kernel ioctls are general atomic, each
* libzfs_core function is atomic. For example, creating multiple
* snapshots with a single call to lzc_snapshot() is atomic -- it
* can't fail with only some of the requested snapshots created, even
* in the event of power loss or system crash.
*
* - Continued libzfs Support. Some higher-level operations (e.g.
* support for "zfs send -R") are too complicated to fit the scope of
* libzfs_core. This functionality will continue to live in libzfs.
* Where appropriate, libzfs will use the underlying atomic operations
* of libzfs_core. For example, libzfs may implement "zfs send -R |
* zfs receive" by using individual "send one snapshot", rename,
* destroy, and "receive one snapshot" operations in libzfs_core.
* /sbin/zfs and /zbin/zpool will link with both libzfs and
* libzfs_core. Other consumers should aim to use only libzfs_core,
* since that will be the supported, stable interface going forwards.
*/
#define _IN_LIBZFS_CORE_
#include <libzfs_core.h>
#include <ctype.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sys/nvpair.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/zfs_ioctl.h>
#include "libzfs_core_compat.h"
#include "libzfs_compat.h"
#ifdef __FreeBSD__
extern int zfs_ioctl_version;
#endif
static int g_fd = -1;
static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_refcount;
int
libzfs_core_init(void)
{
(void) pthread_mutex_lock(&g_lock);
if (g_refcount == 0) {
g_fd = open("/dev/zfs", O_RDWR);
if (g_fd < 0) {
(void) pthread_mutex_unlock(&g_lock);
return (errno);
}
}
g_refcount++;
(void) pthread_mutex_unlock(&g_lock);
return (0);
}
void
libzfs_core_fini(void)
{
(void) pthread_mutex_lock(&g_lock);
ASSERT3S(g_refcount, >, 0);
if (g_refcount > 0)
g_refcount--;
if (g_refcount == 0 && g_fd != -1) {
(void) close(g_fd);
g_fd = -1;
}
(void) pthread_mutex_unlock(&g_lock);
}
static int
lzc_ioctl(zfs_ioc_t ioc, const char *name,
nvlist_t *source, nvlist_t **resultp)
{
zfs_cmd_t zc = { 0 };
int error = 0;
char *packed;
#ifdef __FreeBSD__
nvlist_t *oldsource;
#endif
size_t size;
ASSERT3S(g_refcount, >, 0);
VERIFY3S(g_fd, !=, -1);
(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
#ifdef __FreeBSD__
if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
zfs_ioctl_version = get_zfs_ioctl_version();
if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
oldsource = source;
error = lzc_compat_pre(&zc, &ioc, &source);
if (error)
return (error);
}
#endif
packed = fnvlist_pack(source, &size);
zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
zc.zc_nvlist_src_size = size;
if (resultp != NULL) {
*resultp = NULL;
if (ioc == ZFS_IOC_CHANNEL_PROGRAM) {
zc.zc_nvlist_dst_size = fnvlist_lookup_uint64(source,
ZCP_ARG_MEMLIMIT);
} else {
zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
}
zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
malloc(zc.zc_nvlist_dst_size);
#ifdef illumos
if (zc.zc_nvlist_dst == NULL) {
#else
if (zc.zc_nvlist_dst == 0) {
#endif
error = ENOMEM;
goto out;
}
}
while (ioctl(g_fd, ioc, &zc) != 0) {
/*
* If ioctl exited with ENOMEM, we retry the ioctl after
* increasing the size of the destination nvlist.
*
* Channel programs that exit with ENOMEM ran over the
* lua memory sandbox; they should not be retried.
*/
if (errno == ENOMEM && resultp != NULL &&
ioc != ZFS_IOC_CHANNEL_PROGRAM) {
free((void *)(uintptr_t)zc.zc_nvlist_dst);
zc.zc_nvlist_dst_size *= 2;
zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
malloc(zc.zc_nvlist_dst_size);
#ifdef illumos
if (zc.zc_nvlist_dst == NULL) {
#else
if (zc.zc_nvlist_dst == 0) {
#endif
error = ENOMEM;
goto out;
}
} else {
error = errno;
break;
}
}
#ifdef __FreeBSD__
if (zfs_ioctl_version < ZFS_IOCVER_LZC)
lzc_compat_post(&zc, ioc);
#endif
if (zc.zc_nvlist_dst_filled) {
*resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
zc.zc_nvlist_dst_size);
}
#ifdef __FreeBSD__
if (zfs_ioctl_version < ZFS_IOCVER_LZC)
lzc_compat_outnvl(&zc, ioc, resultp);
#endif
out:
#ifdef __FreeBSD__
if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
if (source != oldsource)
nvlist_free(source);
source = oldsource;
}
#endif
fnvlist_pack_free(packed, size);
free((void *)(uintptr_t)zc.zc_nvlist_dst);
return (error);
}
int
lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props)
{
int error;
nvlist_t *args = fnvlist_alloc();
fnvlist_add_int32(args, "type", (dmu_objset_type_t)type);
if (props != NULL)
fnvlist_add_nvlist(args, "props", props);
error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL);
nvlist_free(args);
return (error);
}
int
lzc_clone(const char *fsname, const char *origin,
nvlist_t *props)
{
int error;
nvlist_t *args = fnvlist_alloc();
fnvlist_add_string(args, "origin", origin);
if (props != NULL)
fnvlist_add_nvlist(args, "props", props);
error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL);
nvlist_free(args);
return (error);
}
int
lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen)
{
/*
* The promote ioctl is still legacy, so we need to construct our
* own zfs_cmd_t rather than using lzc_ioctl().
*/
zfs_cmd_t zc = { 0 };
ASSERT3S(g_refcount, >, 0);
VERIFY3S(g_fd, !=, -1);
(void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name));
if (ioctl(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) {
int error = errno;
if (error == EEXIST && snapnamebuf != NULL)
(void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen);
return (error);
}
return (0);
}
+int
+lzc_remap(const char *fsname)
+{
+ int error;
+ nvlist_t *args = fnvlist_alloc();
+ error = lzc_ioctl(ZFS_IOC_REMAP, fsname, args, NULL);
+ nvlist_free(args);
+ return (error);
+}
+
/*
* Creates snapshots.
*
* The keys in the snaps nvlist are the snapshots to be created.
* They must all be in the same pool.
*
* The props nvlist is properties to set. Currently only user properties
* are supported. { user:prop_name -> string value }
*
* The returned results nvlist will have an entry for each snapshot that failed.
* The value will be the (int32) error code.
*
* The return value will be 0 if all snapshots were created, otherwise it will
* be the errno of a (unspecified) snapshot that failed.
*/
int
lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
{
nvpair_t *elem;
nvlist_t *args;
int error;
char pool[ZFS_MAX_DATASET_NAME_LEN];
*errlist = NULL;
/* determine the pool name */
elem = nvlist_next_nvpair(snaps, NULL);
if (elem == NULL)
return (0);
(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
pool[strcspn(pool, "/@")] = '\0';
args = fnvlist_alloc();
fnvlist_add_nvlist(args, "snaps", snaps);
if (props != NULL)
fnvlist_add_nvlist(args, "props", props);
error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist);
nvlist_free(args);
return (error);
}
/*
* Destroys snapshots.
*
* The keys in the snaps nvlist are the snapshots to be destroyed.
* They must all be in the same pool.
*
* Snapshots that do not exist will be silently ignored.
*
* If 'defer' is not set, and a snapshot has user holds or clones, the
* destroy operation will fail and none of the snapshots will be
* destroyed.
*
* If 'defer' is set, and a snapshot has user holds or clones, it will be
* marked for deferred destruction, and will be destroyed when the last hold
* or clone is removed/destroyed.
*
* The return value will be 0 if all snapshots were destroyed (or marked for
* later destruction if 'defer' is set) or didn't exist to begin with.
*
* Otherwise the return value will be the errno of a (unspecified) snapshot
* that failed, no snapshots will be destroyed, and the errlist will have an
* entry for each snapshot that failed. The value in the errlist will be
* the (int32) error code.
*/
int
lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist)
{
nvpair_t *elem;
nvlist_t *args;
int error;
char pool[ZFS_MAX_DATASET_NAME_LEN];
/* determine the pool name */
elem = nvlist_next_nvpair(snaps, NULL);
if (elem == NULL)
return (0);
(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
pool[strcspn(pool, "/@")] = '\0';
args = fnvlist_alloc();
fnvlist_add_nvlist(args, "snaps", snaps);
if (defer)
fnvlist_add_boolean(args, "defer");
error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist);
nvlist_free(args);
return (error);
}
int
lzc_snaprange_space(const char *firstsnap, const char *lastsnap,
uint64_t *usedp)
{
nvlist_t *args;
nvlist_t *result;
int err;
char fs[ZFS_MAX_DATASET_NAME_LEN];
char *atp;
/* determine the fs name */
(void) strlcpy(fs, firstsnap, sizeof (fs));
atp = strchr(fs, '@');
if (atp == NULL)
return (EINVAL);
*atp = '\0';
args = fnvlist_alloc();
fnvlist_add_string(args, "firstsnap", firstsnap);
err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result);
nvlist_free(args);
if (err == 0)
*usedp = fnvlist_lookup_uint64(result, "used");
fnvlist_free(result);
return (err);
}
boolean_t
lzc_exists(const char *dataset)
{
/*
* The objset_stats ioctl is still legacy, so we need to construct our
* own zfs_cmd_t rather than using lzc_ioctl().
*/
zfs_cmd_t zc = { 0 };
ASSERT3S(g_refcount, >, 0);
VERIFY3S(g_fd, !=, -1);
(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0);
}
/*
* Create "user holds" on snapshots. If there is a hold on a snapshot,
* the snapshot can not be destroyed. (However, it can be marked for deletion
* by lzc_destroy_snaps(defer=B_TRUE).)
*
* The keys in the nvlist are snapshot names.
* The snapshots must all be in the same pool.
* The value is the name of the hold (string type).
*
* If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
* In this case, when the cleanup_fd is closed (including on process
* termination), the holds will be released. If the system is shut down
* uncleanly, the holds will be released when the pool is next opened
* or imported.
*
* Holds for snapshots which don't exist will be skipped and have an entry
* added to errlist, but will not cause an overall failure.
*
* The return value will be 0 if all holds, for snapshots that existed,
* were succesfully created.
*
* Otherwise the return value will be the errno of a (unspecified) hold that
* failed and no holds will be created.
*
* In all cases the errlist will have an entry for each hold that failed
* (name = snapshot), with its value being the error code (int32).
*/
int
lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist)
{
char pool[ZFS_MAX_DATASET_NAME_LEN];
nvlist_t *args;
nvpair_t *elem;
int error;
/* determine the pool name */
elem = nvlist_next_nvpair(holds, NULL);
if (elem == NULL)
return (0);
(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
pool[strcspn(pool, "/@")] = '\0';
args = fnvlist_alloc();
fnvlist_add_nvlist(args, "holds", holds);
if (cleanup_fd != -1)
fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);
error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist);
nvlist_free(args);
return (error);
}
/*
* Release "user holds" on snapshots. If the snapshot has been marked for
* deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
* any clones, and all the user holds are removed, then the snapshot will be
* destroyed.
*
* The keys in the nvlist are snapshot names.
* The snapshots must all be in the same pool.
* The value is a nvlist whose keys are the holds to remove.
*
* Holds which failed to release because they didn't exist will have an entry
* added to errlist, but will not cause an overall failure.
*
* The return value will be 0 if the nvl holds was empty or all holds that
* existed, were successfully removed.
*
* Otherwise the return value will be the errno of a (unspecified) hold that
* failed to release and no holds will be released.
*
* In all cases the errlist will have an entry for each hold that failed to
* to release.
*/
int
lzc_release(nvlist_t *holds, nvlist_t **errlist)
{
char pool[ZFS_MAX_DATASET_NAME_LEN];
nvpair_t *elem;
/* determine the pool name */
elem = nvlist_next_nvpair(holds, NULL);
if (elem == NULL)
return (0);
(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
pool[strcspn(pool, "/@")] = '\0';
return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist));
}
/*
* Retrieve list of user holds on the specified snapshot.
*
* On success, *holdsp will be set to a nvlist which the caller must free.
* The keys are the names of the holds, and the value is the creation time
* of the hold (uint64) in seconds since the epoch.
*/
int
lzc_get_holds(const char *snapname, nvlist_t **holdsp)
{
int error;
nvlist_t *innvl = fnvlist_alloc();
error = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, innvl, holdsp);
fnvlist_free(innvl);
return (error);
}
/*
* Generate a zfs send stream for the specified snapshot and write it to
* the specified file descriptor.
*
* "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
*
* If "from" is NULL, a full (non-incremental) stream will be sent.
* If "from" is non-NULL, it must be the full name of a snapshot or
* bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or
* "pool/fs#earlier_bmark"). If non-NULL, the specified snapshot or
* bookmark must represent an earlier point in the history of "snapname").
* It can be an earlier snapshot in the same filesystem or zvol as "snapname",
* or it can be the origin of "snapname"'s filesystem, or an earlier
* snapshot in the origin, etc.
*
* "fd" is the file descriptor to write the send stream to.
*
* If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
* to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
* records with drr_blksz > 128K.
*
* If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
* to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
* which the receiving system must support (as indicated by support
* for the "embedded_data" feature).
*/
int
lzc_send(const char *snapname, const char *from, int fd,
enum lzc_send_flags flags)
{
return (lzc_send_resume(snapname, from, fd, flags, 0, 0));
}
int
lzc_send_resume(const char *snapname, const char *from, int fd,
enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff)
{
nvlist_t *args;
int err;
args = fnvlist_alloc();
fnvlist_add_int32(args, "fd", fd);
if (from != NULL)
fnvlist_add_string(args, "fromsnap", from);
if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
fnvlist_add_boolean(args, "largeblockok");
if (flags & LZC_SEND_FLAG_EMBED_DATA)
fnvlist_add_boolean(args, "embedok");
if (flags & LZC_SEND_FLAG_COMPRESS)
fnvlist_add_boolean(args, "compressok");
if (resumeobj != 0 || resumeoff != 0) {
fnvlist_add_uint64(args, "resume_object", resumeobj);
fnvlist_add_uint64(args, "resume_offset", resumeoff);
}
err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
nvlist_free(args);
return (err);
}
/*
* "from" can be NULL, a snapshot, or a bookmark.
*
* If from is NULL, a full (non-incremental) stream will be estimated. This
* is calculated very efficiently.
*
* If from is a snapshot, lzc_send_space uses the deadlists attached to
* each snapshot to efficiently estimate the stream size.
*
* If from is a bookmark, the indirect blocks in the destination snapshot
* are traversed, looking for blocks with a birth time since the creation TXG of
* the snapshot this bookmark was created from. This will result in
* significantly more I/O and be less efficient than a send space estimation on
* an equivalent snapshot.
*/
int
lzc_send_space(const char *snapname, const char *from,
enum lzc_send_flags flags, uint64_t *spacep)
{
nvlist_t *args;
nvlist_t *result;
int err;
args = fnvlist_alloc();
if (from != NULL)
fnvlist_add_string(args, "from", from);
if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
fnvlist_add_boolean(args, "largeblockok");
if (flags & LZC_SEND_FLAG_EMBED_DATA)
fnvlist_add_boolean(args, "embedok");
if (flags & LZC_SEND_FLAG_COMPRESS)
fnvlist_add_boolean(args, "compressok");
err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
nvlist_free(args);
if (err == 0)
*spacep = fnvlist_lookup_uint64(result, "space");
nvlist_free(result);
return (err);
}
static int
recv_read(int fd, void *buf, int ilen)
{
char *cp = buf;
int rv;
int len = ilen;
do {
rv = read(fd, cp, len);
cp += rv;
len -= rv;
} while (rv > 0);
if (rv < 0 || len != 0)
return (EIO);
return (0);
}
static int
recv_impl(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, boolean_t resumable, int fd,
const dmu_replay_record_t *begin_record)
{
/*
* The receive ioctl is still legacy, so we need to construct our own
* zfs_cmd_t rather than using zfsc_ioctl().
*/
zfs_cmd_t zc = { 0 };
char *atp;
char *packed = NULL;
size_t size;
int error;
ASSERT3S(g_refcount, >, 0);
VERIFY3S(g_fd, !=, -1);
/* zc_name is name of containing filesystem */
(void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name));
atp = strchr(zc.zc_name, '@');
if (atp == NULL)
return (EINVAL);
*atp = '\0';
/* if the fs does not exist, try its parent. */
if (!lzc_exists(zc.zc_name)) {
char *slashp = strrchr(zc.zc_name, '/');
if (slashp == NULL)
return (ENOENT);
*slashp = '\0';
}
/* zc_value is full name of the snapshot to create */
(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
if (props != NULL) {
/* zc_nvlist_src is props to set */
packed = fnvlist_pack(props, &size);
zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
zc.zc_nvlist_src_size = size;
}
/* zc_string is name of clone origin (if DRR_FLAG_CLONE) */
if (origin != NULL)
(void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string));
/* zc_begin_record is non-byteswapped BEGIN record */
if (begin_record == NULL) {
error = recv_read(fd, &zc.zc_begin_record,
sizeof (zc.zc_begin_record));
if (error != 0)
goto out;
} else {
zc.zc_begin_record = *begin_record;
}
/* zc_cookie is fd to read from */
zc.zc_cookie = fd;
/* zc guid is force flag */
zc.zc_guid = force;
zc.zc_resumable = resumable;
/* zc_cleanup_fd is unused */
zc.zc_cleanup_fd = -1;
error = ioctl(g_fd, ZFS_IOC_RECV, &zc);
if (error != 0)
error = errno;
out:
if (packed != NULL)
fnvlist_pack_free(packed, size);
free((void*)(uintptr_t)zc.zc_nvlist_dst);
return (error);
}
/*
* The simplest receive case: receive from the specified fd, creating the
* specified snapshot. Apply the specified properties as "received" properties
* (which can be overridden by locally-set properties). If the stream is a
* clone, its origin snapshot must be specified by 'origin'. The 'force'
* flag will cause the target filesystem to be rolled back or destroyed if
* necessary to receive.
*
* Return 0 on success or an errno on failure.
*
* Note: this interface does not work on dedup'd streams
* (those with DMU_BACKUP_FEATURE_DEDUP).
*/
int
lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, int fd)
{
return (recv_impl(snapname, props, origin, force, B_FALSE, fd, NULL));
}
/*
* Like lzc_receive, but if the receive fails due to premature stream
* termination, the intermediate state will be preserved on disk. In this
* case, ECKSUM will be returned. The receive may subsequently be resumed
* with a resuming send stream generated by lzc_send_resume().
*/
int
lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, int fd)
{
return (recv_impl(snapname, props, origin, force, B_TRUE, fd, NULL));
}
/*
* Like lzc_receive, but allows the caller to read the begin record and then to
* pass it in. That could be useful if the caller wants to derive, for example,
* the snapname or the origin parameters based on the information contained in
* the begin record.
* The begin record must be in its original form as read from the stream,
* in other words, it should not be byteswapped.
*
* The 'resumable' parameter allows to obtain the same behavior as with
* lzc_receive_resumable.
*/
int
lzc_receive_with_header(const char *snapname, nvlist_t *props,
const char *origin, boolean_t force, boolean_t resumable, int fd,
const dmu_replay_record_t *begin_record)
{
if (begin_record == NULL)
return (EINVAL);
return (recv_impl(snapname, props, origin, force, resumable, fd,
begin_record));
}
/*
* Roll back this filesystem or volume to its most recent snapshot.
* If snapnamebuf is not NULL, it will be filled in with the name
* of the most recent snapshot.
* Note that the latest snapshot may change if a new one is concurrently
* created or the current one is destroyed. lzc_rollback_to can be used
* to roll back to a specific latest snapshot.
*
* Return 0 on success or an errno on failure.
*/
int
lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen)
{
nvlist_t *args;
nvlist_t *result;
int err;
args = fnvlist_alloc();
err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
nvlist_free(args);
if (err == 0 && snapnamebuf != NULL) {
const char *snapname = fnvlist_lookup_string(result, "target");
(void) strlcpy(snapnamebuf, snapname, snapnamelen);
}
nvlist_free(result);
return (err);
}
/*
* Roll back this filesystem or volume to the specified snapshot,
* if possible.
*
* Return 0 on success or an errno on failure.
*/
int
lzc_rollback_to(const char *fsname, const char *snapname)
{
nvlist_t *args;
nvlist_t *result;
int err;
args = fnvlist_alloc();
fnvlist_add_string(args, "target", snapname);
err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
nvlist_free(args);
nvlist_free(result);
return (err);
}
/*
* Creates bookmarks.
*
* The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to
* the name of the snapshot (e.g. "pool/fs@snap"). All the bookmarks and
* snapshots must be in the same pool.
*
* The returned results nvlist will have an entry for each bookmark that failed.
* The value will be the (int32) error code.
*
* The return value will be 0 if all bookmarks were created, otherwise it will
* be the errno of a (undetermined) bookmarks that failed.
*/
int
lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist)
{
nvpair_t *elem;
int error;
char pool[ZFS_MAX_DATASET_NAME_LEN];
/* determine the pool name */
elem = nvlist_next_nvpair(bookmarks, NULL);
if (elem == NULL)
return (0);
(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
pool[strcspn(pool, "/#")] = '\0';
error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist);
return (error);
}
/*
* Retrieve bookmarks.
*
* Retrieve the list of bookmarks for the given file system. The props
* parameter is an nvlist of property names (with no values) that will be
* returned for each bookmark.
*
* The following are valid properties on bookmarks, all of which are numbers
* (represented as uint64 in the nvlist)
*
* "guid" - globally unique identifier of the snapshot it refers to
* "createtxg" - txg when the snapshot it refers to was created
* "creation" - timestamp when the snapshot it refers to was created
*
* The format of the returned nvlist as follows:
* <short name of bookmark> -> {
* <name of property> -> {
* "value" -> uint64
* }
* }
*/
int
lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks)
{
return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks));
}
/*
* Destroys bookmarks.
*
* The keys in the bmarks nvlist are the bookmarks to be destroyed.
* They must all be in the same pool. Bookmarks are specified as
* <fs>#<bmark>.
*
* Bookmarks that do not exist will be silently ignored.
*
* The return value will be 0 if all bookmarks that existed were destroyed.
*
* Otherwise the return value will be the errno of a (undetermined) bookmark
* that failed, no bookmarks will be destroyed, and the errlist will have an
* entry for each bookmarks that failed. The value in the errlist will be
* the (int32) error code.
*/
int
lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist)
{
nvpair_t *elem;
int error;
char pool[ZFS_MAX_DATASET_NAME_LEN];
/* determine the pool name */
elem = nvlist_next_nvpair(bmarks, NULL);
if (elem == NULL)
return (0);
(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
pool[strcspn(pool, "/#")] = '\0';
error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist);
return (error);
}
static int
lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync,
uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
{
int error;
nvlist_t *args;
args = fnvlist_alloc();
fnvlist_add_string(args, ZCP_ARG_PROGRAM, program);
fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl);
fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync);
fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit);
fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit);
error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl);
fnvlist_free(args);
return (error);
}
/*
* Executes a channel program.
*
* If this function returns 0 the channel program was successfully loaded and
* ran without failing. Note that individual commands the channel program ran
* may have failed and the channel program is responsible for reporting such
* errors through outnvl if they are important.
*
* This method may also return:
*
* EINVAL The program contains syntax errors, or an invalid memory or time
* limit was given. No part of the channel program was executed.
* If caused by syntax errors, 'outnvl' contains information about the
* errors.
*
* EDOM The program was executed, but encountered a runtime error, such as
* calling a function with incorrect arguments, invoking the error()
* function directly, failing an assert() command, etc. Some portion
* of the channel program may have executed and committed changes.
* Information about the failure can be found in 'outnvl'.
*
* ENOMEM The program fully executed, but the output buffer was not large
* enough to store the returned value. No output is returned through
* 'outnvl'.
*
* ENOSPC The program was terminated because it exceeded its memory usage
* limit. Some portion of the channel program may have executed and
* committed changes to disk. No output is returned through 'outnvl'.
*
* ETIMEDOUT The program was terminated because it exceeded its Lua instruction
* limit. Some portion of the channel program may have executed and
* committed changes to disk. No output is returned through 'outnvl'.
*/
int
lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit,
uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
{
return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit,
memlimit, argnvl, outnvl));
}
/*
* Executes a read-only channel program.
*
* A read-only channel program works programmatically the same way as a
* normal channel program executed with lzc_channel_program(). The only
* difference is it runs exclusively in open-context and therefore can
* return faster. The downside to that, is that the program cannot change
* on-disk state by calling functions from the zfs.sync submodule.
*
* The return values of this function (and their meaning) are exactly the
* same as the ones described in lzc_channel_program().
*/
int
lzc_channel_program_nosync(const char *pool, const char *program,
uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
{
return (lzc_channel_program_impl(pool, program, B_FALSE, timeout,
memlimit, argnvl, outnvl));
}
Index: stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
===================================================================
--- stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h (revision 332524)
+++ stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h (revision 332525)
@@ -1,98 +1,99 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2013 by Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright 2017 RackTop Systems.
*/
#ifndef _LIBZFS_CORE_H
#define _LIBZFS_CORE_H
#include <libnvpair.h>
#include <sys/param.h>
#include <sys/types.h>
#ifdef __cplusplus
extern "C" {
#endif
int libzfs_core_init(void);
void libzfs_core_fini(void);
/*
* NB: this type should be kept binary compatible with dmu_objset_type_t.
*/
enum lzc_dataset_type {
LZC_DATSET_TYPE_ZFS = 2,
LZC_DATSET_TYPE_ZVOL
};
+int lzc_remap(const char *fsname);
int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **);
int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *);
int lzc_clone(const char *, const char *, nvlist_t *);
int lzc_promote(const char *, char *, int);
int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **);
int lzc_bookmark(nvlist_t *, nvlist_t **);
int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **);
int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **);
int lzc_snaprange_space(const char *, const char *, uint64_t *);
int lzc_hold(nvlist_t *, int, nvlist_t **);
int lzc_release(nvlist_t *, nvlist_t **);
int lzc_get_holds(const char *, nvlist_t **);
enum lzc_send_flags {
LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1,
LZC_SEND_FLAG_COMPRESS = 1 << 2
};
int lzc_send(const char *, const char *, int, enum lzc_send_flags);
int lzc_send_resume(const char *, const char *, int,
enum lzc_send_flags, uint64_t, uint64_t);
int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *);
struct dmu_replay_record;
int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
int lzc_receive_resumable(const char *, nvlist_t *, const char *,
boolean_t, int);
int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t,
boolean_t, int, const struct dmu_replay_record *);
boolean_t lzc_exists(const char *);
int lzc_rollback(const char *, char *, int);
int lzc_rollback_to(const char *, const char *);
int lzc_channel_program(const char *, const char *, uint64_t,
uint64_t, nvlist_t *, nvlist_t **);
int lzc_channel_program_nosync(const char *, const char *, uint64_t,
uint64_t, nvlist_t *, nvlist_t **);
#ifdef __cplusplus
}
#endif
#endif /* _LIBZFS_CORE_H */
Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c (revision 332525)
@@ -1,250 +1,266 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <errno.h>
#include <string.h>
#endif
#include <sys/debug.h>
#include <sys/fs/zfs.h>
#include <sys/types.h>
#include "zfeature_common.h"
/*
* Set to disable all feature checks while opening pools, allowing pools with
* unsupported features to be opened. Set for testing only.
*/
boolean_t zfeature_checks_disable = B_FALSE;
zfeature_info_t spa_feature_table[SPA_FEATURES];
/*
* Valid characters for feature guids. This list is mainly for aesthetic
* purposes and could be expanded in the future. There are different allowed
* characters in the guids reverse dns portion (before the colon) and its
* short name (after the colon).
*/
static int
valid_char(char c, boolean_t after_colon)
{
return ((c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9') ||
(after_colon && c == '_') ||
(!after_colon && (c == '.' || c == '-')));
}
/*
* Every feature guid must contain exactly one colon which separates a reverse
* dns organization name from the feature's "short" name (e.g.
* "com.company:feature_name").
*/
boolean_t
zfeature_is_valid_guid(const char *name)
{
int i;
boolean_t has_colon = B_FALSE;
i = 0;
while (name[i] != '\0') {
char c = name[i++];
if (c == ':') {
if (has_colon)
return (B_FALSE);
has_colon = B_TRUE;
continue;
}
if (!valid_char(c, has_colon))
return (B_FALSE);
}
return (has_colon);
}
boolean_t
zfeature_is_supported(const char *guid)
{
if (zfeature_checks_disable)
return (B_TRUE);
for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
zfeature_info_t *feature = &spa_feature_table[i];
if (strcmp(guid, feature->fi_guid) == 0)
return (B_TRUE);
}
return (B_FALSE);
}
int
zfeature_lookup_name(const char *name, spa_feature_t *res)
{
for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
zfeature_info_t *feature = &spa_feature_table[i];
if (strcmp(name, feature->fi_uname) == 0) {
if (res != NULL)
*res = i;
return (0);
}
}
return (ENOENT);
}
boolean_t
zfeature_depends_on(spa_feature_t fid, spa_feature_t check)
{
zfeature_info_t *feature = &spa_feature_table[fid];
for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) {
if (feature->fi_depends[i] == check)
return (B_TRUE);
}
return (B_FALSE);
}
static void
zfeature_register(spa_feature_t fid, const char *guid, const char *name,
const char *desc, zfeature_flags_t flags, const spa_feature_t *deps)
{
zfeature_info_t *feature = &spa_feature_table[fid];
static spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
ASSERT(name != NULL);
ASSERT(desc != NULL);
ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
(flags & ZFEATURE_FLAG_MOS) == 0);
ASSERT3U(fid, <, SPA_FEATURES);
ASSERT(zfeature_is_valid_guid(guid));
if (deps == NULL)
deps = nodeps;
feature->fi_feature = fid;
feature->fi_guid = guid;
feature->fi_uname = name;
feature->fi_desc = desc;
feature->fi_flags = flags;
feature->fi_depends = deps;
}
void
zpool_feature_init(void)
{
zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
"com.delphix:async_destroy", "async_destroy",
"Destroy filesystems asynchronously.",
ZFEATURE_FLAG_READONLY_COMPAT, NULL);
zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
"com.delphix:empty_bpobj", "empty_bpobj",
"Snapshots use less space.",
ZFEATURE_FLAG_READONLY_COMPAT, NULL);
zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
"org.illumos:lz4_compress", "lz4_compress",
"LZ4 compression algorithm support.",
ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL);
zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
"com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
"Crash dumps to multiple vdev pools.",
0, NULL);
zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
"com.delphix:spacemap_histogram", "spacemap_histogram",
"Spacemaps maintain space histograms.",
ZFEATURE_FLAG_READONLY_COMPAT, NULL);
zfeature_register(SPA_FEATURE_ENABLED_TXG,
"com.delphix:enabled_txg", "enabled_txg",
"Record txg at which a feature is enabled",
ZFEATURE_FLAG_READONLY_COMPAT, NULL);
static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG,
SPA_FEATURE_NONE };
zfeature_register(SPA_FEATURE_HOLE_BIRTH,
"com.delphix:hole_birth", "hole_birth",
"Retain hole birth txg for more precise zfs send",
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
hole_birth_deps);
zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
"com.delphix:extensible_dataset", "extensible_dataset",
"Enhanced dataset functionality, used by other features.",
0, NULL);
static const spa_feature_t bookmarks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_BOOKMARKS,
"com.delphix:bookmarks", "bookmarks",
"\"zfs bookmark\" command",
ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps);
static const spa_feature_t filesystem_limits_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
"com.joyent:filesystem_limits", "filesystem_limits",
"Filesystem and snapshot limits.",
ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps);
zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
"com.delphix:embedded_data", "embedded_data",
"Blocks which compress very well use even less space.",
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
NULL);
static const spa_feature_t large_blocks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
"org.open-zfs:large_blocks", "large_blocks",
"Support for blocks larger than 128KB.",
ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
zfeature_register(SPA_FEATURE_SHA512,
"org.illumos:sha512", "sha512",
"SHA-512/256 hash algorithm.",
ZFEATURE_FLAG_PER_DATASET, NULL);
zfeature_register(SPA_FEATURE_SKEIN,
"org.illumos:skein", "skein",
"Skein hash algorithm.",
ZFEATURE_FLAG_PER_DATASET, NULL);
#ifdef illumos
zfeature_register(SPA_FEATURE_EDONR,
"org.illumos:edonr", "edonr",
"Edon-R hash algorithm.",
ZFEATURE_FLAG_PER_DATASET, NULL);
#endif
+
+ zfeature_register(SPA_FEATURE_DEVICE_REMOVAL,
+ "com.delphix:device_removal", "device_removal",
+ "Top-level vdevs can be removed, reducing logical pool size.",
+ ZFEATURE_FLAG_MOS, NULL);
+
+ static const spa_feature_t obsolete_counts_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_DEVICE_REMOVAL,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS,
+ "com.delphix:obsolete_counts", "obsolete_counts",
+ "Reduce memory used by removed devices when their blocks are "
+ "freed or remapped.",
+ ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps);
}
Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h (revision 332525)
@@ -1,104 +1,106 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#ifndef _ZFEATURE_COMMON_H
#define _ZFEATURE_COMMON_H
#include <sys/fs/zfs.h>
#include <sys/types.h>
#ifdef __cplusplus
extern "C" {
#endif
struct zfeature_info;
typedef enum spa_feature {
SPA_FEATURE_NONE = -1,
SPA_FEATURE_ASYNC_DESTROY,
SPA_FEATURE_EMPTY_BPOBJ,
SPA_FEATURE_LZ4_COMPRESS,
SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
SPA_FEATURE_SPACEMAP_HISTOGRAM,
SPA_FEATURE_ENABLED_TXG,
SPA_FEATURE_HOLE_BIRTH,
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_EMBEDDED_DATA,
SPA_FEATURE_BOOKMARKS,
SPA_FEATURE_FS_SS_LIMIT,
SPA_FEATURE_LARGE_BLOCKS,
SPA_FEATURE_SHA512,
SPA_FEATURE_SKEIN,
#ifdef illumos
SPA_FEATURE_EDONR,
#endif
+ SPA_FEATURE_DEVICE_REMOVAL,
+ SPA_FEATURE_OBSOLETE_COUNTS,
SPA_FEATURES
} spa_feature_t;
#define SPA_FEATURE_DISABLED (-1ULL)
typedef enum zfeature_flags {
/* Can open pool readonly even if this feature is not supported. */
ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0),
/* Is this feature necessary to read the MOS? */
ZFEATURE_FLAG_MOS = (1 << 1),
/* Activate this feature at the same time it is enabled. */
ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2),
/* Each dataset has a field set if it has ever used this feature. */
ZFEATURE_FLAG_PER_DATASET = (1 << 3)
} zfeature_flags_t;
typedef struct zfeature_info {
spa_feature_t fi_feature;
const char *fi_uname; /* User-facing feature name */
const char *fi_guid; /* On-disk feature identifier */
const char *fi_desc; /* Feature description */
zfeature_flags_t fi_flags;
/* array of dependencies, terminated by SPA_FEATURE_NONE */
const spa_feature_t *fi_depends;
} zfeature_info_t;
typedef int (zfeature_func_t)(zfeature_info_t *, void *);
#define ZFS_FEATURE_DEBUG
extern zfeature_info_t spa_feature_table[SPA_FEATURES];
extern boolean_t zfeature_is_valid_guid(const char *);
extern boolean_t zfeature_is_supported(const char *);
extern int zfeature_lookup_name(const char *, spa_feature_t *);
extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
extern void zpool_feature_init(void);
#ifdef __cplusplus
}
#endif
#endif /* _ZFEATURE_COMMON_H */
Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c (revision 332525)
@@ -1,234 +1,235 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
*/
#include <sys/zfs_context.h>
#if defined(_KERNEL)
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/ctype.h>
#else
#include <stdio.h>
#include <unistd.h>
#include <strings.h>
#include <libnvpair.h>
#include <ctype.h>
#endif
#include <sys/dsl_deleg.h>
#include "zfs_prop.h"
#include "zfs_deleg.h"
#include "zfs_namecheck.h"
zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
{ZFS_DELEG_PERM_ALLOW},
{ZFS_DELEG_PERM_BOOKMARK},
{ZFS_DELEG_PERM_CLONE},
{ZFS_DELEG_PERM_CREATE},
{ZFS_DELEG_PERM_DESTROY},
{ZFS_DELEG_PERM_DIFF},
{ZFS_DELEG_PERM_MOUNT},
{ZFS_DELEG_PERM_PROMOTE},
{ZFS_DELEG_PERM_RECEIVE},
+ {ZFS_DELEG_PERM_REMAP},
{ZFS_DELEG_PERM_RENAME},
{ZFS_DELEG_PERM_ROLLBACK},
{ZFS_DELEG_PERM_SNAPSHOT},
{ZFS_DELEG_PERM_SHARE},
{ZFS_DELEG_PERM_SEND},
{ZFS_DELEG_PERM_USERPROP},
{ZFS_DELEG_PERM_USERQUOTA},
{ZFS_DELEG_PERM_GROUPQUOTA},
{ZFS_DELEG_PERM_USERUSED},
{ZFS_DELEG_PERM_GROUPUSED},
{ZFS_DELEG_PERM_HOLD},
{ZFS_DELEG_PERM_RELEASE},
{NULL}
};
static int
zfs_valid_permission_name(const char *perm)
{
if (zfs_deleg_canonicalize_perm(perm))
return (0);
return (permset_namecheck(perm, NULL, NULL));
}
const char *
zfs_deleg_canonicalize_perm(const char *perm)
{
int i;
zfs_prop_t prop;
for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) {
if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0)
return (perm);
}
prop = zfs_name_to_prop(perm);
if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop))
return (zfs_prop_to_name(prop));
return (NULL);
}
static int
zfs_validate_who(char *who)
{
char *p;
if (who[2] != ZFS_DELEG_FIELD_SEP_CHR)
return (-1);
switch (who[0]) {
case ZFS_DELEG_USER:
case ZFS_DELEG_GROUP:
case ZFS_DELEG_USER_SETS:
case ZFS_DELEG_GROUP_SETS:
if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
return (-1);
for (p = &who[3]; *p; p++)
if (!isdigit(*p))
return (-1);
break;
case ZFS_DELEG_NAMED_SET:
case ZFS_DELEG_NAMED_SET_SETS:
if (who[1] != ZFS_DELEG_NA)
return (-1);
return (permset_namecheck(&who[3], NULL, NULL));
case ZFS_DELEG_CREATE:
case ZFS_DELEG_CREATE_SETS:
if (who[1] != ZFS_DELEG_NA)
return (-1);
if (who[3] != '\0')
return (-1);
break;
case ZFS_DELEG_EVERYONE:
case ZFS_DELEG_EVERYONE_SETS:
if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
return (-1);
if (who[3] != '\0')
return (-1);
break;
default:
return (-1);
}
return (0);
}
int
zfs_deleg_verify_nvlist(nvlist_t *nvp)
{
nvpair_t *who, *perm_name;
nvlist_t *perms;
int error;
if (nvp == NULL)
return (-1);
who = nvlist_next_nvpair(nvp, NULL);
if (who == NULL)
return (-1);
do {
if (zfs_validate_who(nvpair_name(who)))
return (-1);
error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms);
if (error && error != ENOENT)
return (-1);
if (error == ENOENT)
continue;
perm_name = nvlist_next_nvpair(perms, NULL);
if (perm_name == NULL) {
return (-1);
}
do {
error = zfs_valid_permission_name(
nvpair_name(perm_name));
if (error)
return (-1);
} while ((perm_name = nvlist_next_nvpair(perms, perm_name))
!= NULL);
} while ((who = nvlist_next_nvpair(nvp, who)) != NULL);
return (0);
}
/*
* Construct the base attribute name. The base attribute names
* are the "key" to locate the jump objects which contain the actual
* permissions. The base attribute names are encoded based on
* type of entry and whether it is a local or descendent permission.
*
* Arguments:
* attr - attribute name return string, attribute is assumed to be
* ZFS_MAX_DELEG_NAME long.
* type - type of entry to construct
* inheritchr - inheritance type (local,descendent, or NA for create and
* permission set definitions
* data - is either a permission set name or a 64 bit uid/gid.
*/
void
zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
char inheritchr, void *data)
{
int len = ZFS_MAX_DELEG_NAME;
uint64_t *id = data;
switch (type) {
case ZFS_DELEG_USER:
case ZFS_DELEG_GROUP:
case ZFS_DELEG_USER_SETS:
case ZFS_DELEG_GROUP_SETS:
(void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr,
ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id);
break;
case ZFS_DELEG_NAMED_SET_SETS:
case ZFS_DELEG_NAMED_SET:
(void) snprintf(attr, len, "%c-%c%s", type,
ZFS_DELEG_FIELD_SEP_CHR, (char *)data);
break;
case ZFS_DELEG_CREATE:
case ZFS_DELEG_CREATE_SETS:
(void) snprintf(attr, len, "%c-%c", type,
ZFS_DELEG_FIELD_SEP_CHR);
break;
case ZFS_DELEG_EVERYONE:
case ZFS_DELEG_EVERYONE_SETS:
(void) snprintf(attr, len, "%c%c%c", type, inheritchr,
ZFS_DELEG_FIELD_SEP_CHR);
break;
default:
ASSERT(!"bad zfs_deleg_who_type_t");
}
}
Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h (revision 332525)
@@ -1,89 +1,90 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#ifndef _ZFS_DELEG_H
#define _ZFS_DELEG_H
#include <sys/fs/zfs.h>
#ifdef __cplusplus
extern "C" {
#endif
#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */
#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */
/*
* Max name length for a delegation attribute
*/
#define ZFS_MAX_DELEG_NAME 128
#define ZFS_DELEG_LOCAL 'l'
#define ZFS_DELEG_DESCENDENT 'd'
#define ZFS_DELEG_NA '-'
typedef enum {
ZFS_DELEG_NOTE_CREATE,
ZFS_DELEG_NOTE_DESTROY,
ZFS_DELEG_NOTE_SNAPSHOT,
ZFS_DELEG_NOTE_ROLLBACK,
ZFS_DELEG_NOTE_CLONE,
ZFS_DELEG_NOTE_PROMOTE,
ZFS_DELEG_NOTE_RENAME,
ZFS_DELEG_NOTE_SEND,
ZFS_DELEG_NOTE_RECEIVE,
ZFS_DELEG_NOTE_ALLOW,
ZFS_DELEG_NOTE_USERPROP,
ZFS_DELEG_NOTE_MOUNT,
ZFS_DELEG_NOTE_SHARE,
ZFS_DELEG_NOTE_USERQUOTA,
ZFS_DELEG_NOTE_GROUPQUOTA,
ZFS_DELEG_NOTE_USERUSED,
ZFS_DELEG_NOTE_GROUPUSED,
ZFS_DELEG_NOTE_HOLD,
ZFS_DELEG_NOTE_RELEASE,
ZFS_DELEG_NOTE_DIFF,
ZFS_DELEG_NOTE_BOOKMARK,
+ ZFS_DELEG_NOTE_REMAP,
ZFS_DELEG_NOTE_NONE
} zfs_deleg_note_t;
typedef struct zfs_deleg_perm_tab {
char *z_perm;
zfs_deleg_note_t z_note;
} zfs_deleg_perm_tab_t;
extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[];
int zfs_deleg_verify_nvlist(nvlist_t *nvlist);
void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
char checkflag, void *data);
const char *zfs_deleg_canonicalize_perm(const char *perm);
#ifdef __cplusplus
}
#endif
#endif /* _ZFS_DELEG_H */
Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c (revision 332525)
@@ -1,698 +1,700 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/u8_textprep.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include "zfs_prop.h"
#include "zfs_deleg.h"
#if defined(_KERNEL)
#include <sys/systm.h>
#else
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#endif
static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];
/* Note this is indexed by zfs_userquota_prop_t, keep the order the same */
const char *zfs_userquota_prop_prefixes[] = {
"userused@",
"userquota@",
"groupused@",
"groupquota@"
};
zprop_desc_t *
zfs_prop_get_table(void)
{
return (zfs_prop_table);
}
void
zfs_prop_init(void)
{
static zprop_index_t checksum_table[] = {
{ "on", ZIO_CHECKSUM_ON },
{ "off", ZIO_CHECKSUM_OFF },
{ "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
{ "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
{ "sha256", ZIO_CHECKSUM_SHA256 },
{ "noparity", ZIO_CHECKSUM_NOPARITY },
{ "sha512", ZIO_CHECKSUM_SHA512 },
{ "skein", ZIO_CHECKSUM_SKEIN },
#ifdef illumos
{ "edonr", ZIO_CHECKSUM_EDONR },
#endif
{ NULL }
};
static zprop_index_t dedup_table[] = {
{ "on", ZIO_CHECKSUM_ON },
{ "off", ZIO_CHECKSUM_OFF },
{ "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
{ "sha256", ZIO_CHECKSUM_SHA256 },
{ "sha256,verify",
ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
{ "sha512", ZIO_CHECKSUM_SHA512 },
{ "sha512,verify",
ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY },
{ "skein", ZIO_CHECKSUM_SKEIN },
{ "skein,verify",
ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
#ifdef illumos
{ "edonr,verify",
ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
#endif
{ NULL }
};
static zprop_index_t compress_table[] = {
{ "on", ZIO_COMPRESS_ON },
{ "off", ZIO_COMPRESS_OFF },
{ "lzjb", ZIO_COMPRESS_LZJB },
{ "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */
{ "gzip-1", ZIO_COMPRESS_GZIP_1 },
{ "gzip-2", ZIO_COMPRESS_GZIP_2 },
{ "gzip-3", ZIO_COMPRESS_GZIP_3 },
{ "gzip-4", ZIO_COMPRESS_GZIP_4 },
{ "gzip-5", ZIO_COMPRESS_GZIP_5 },
{ "gzip-6", ZIO_COMPRESS_GZIP_6 },
{ "gzip-7", ZIO_COMPRESS_GZIP_7 },
{ "gzip-8", ZIO_COMPRESS_GZIP_8 },
{ "gzip-9", ZIO_COMPRESS_GZIP_9 },
{ "zle", ZIO_COMPRESS_ZLE },
{ "lz4", ZIO_COMPRESS_LZ4 },
{ NULL }
};
static zprop_index_t snapdir_table[] = {
{ "hidden", ZFS_SNAPDIR_HIDDEN },
{ "visible", ZFS_SNAPDIR_VISIBLE },
{ NULL }
};
static zprop_index_t acl_mode_table[] = {
{ "discard", ZFS_ACL_DISCARD },
{ "groupmask", ZFS_ACL_GROUPMASK },
{ "passthrough", ZFS_ACL_PASSTHROUGH },
{ "restricted", ZFS_ACL_RESTRICTED },
{ NULL }
};
static zprop_index_t acl_inherit_table[] = {
{ "discard", ZFS_ACL_DISCARD },
{ "noallow", ZFS_ACL_NOALLOW },
{ "restricted", ZFS_ACL_RESTRICTED },
{ "passthrough", ZFS_ACL_PASSTHROUGH },
{ "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */
{ "passthrough-x", ZFS_ACL_PASSTHROUGH_X },
{ NULL }
};
static zprop_index_t case_table[] = {
{ "sensitive", ZFS_CASE_SENSITIVE },
{ "insensitive", ZFS_CASE_INSENSITIVE },
{ "mixed", ZFS_CASE_MIXED },
{ NULL }
};
static zprop_index_t copies_table[] = {
{ "1", 1 },
{ "2", 2 },
{ "3", 3 },
{ NULL }
};
/*
* Use the unique flags we have to send to u8_strcmp() and/or
* u8_textprep() to represent the various normalization property
* values.
*/
static zprop_index_t normalize_table[] = {
{ "none", 0 },
{ "formD", U8_TEXTPREP_NFD },
{ "formKC", U8_TEXTPREP_NFKC },
{ "formC", U8_TEXTPREP_NFC },
{ "formKD", U8_TEXTPREP_NFKD },
{ NULL }
};
static zprop_index_t version_table[] = {
{ "1", 1 },
{ "2", 2 },
{ "3", 3 },
{ "4", 4 },
{ "5", 5 },
{ "current", ZPL_VERSION },
{ NULL }
};
static zprop_index_t boolean_table[] = {
{ "off", 0 },
{ "on", 1 },
{ NULL }
};
static zprop_index_t logbias_table[] = {
{ "latency", ZFS_LOGBIAS_LATENCY },
{ "throughput", ZFS_LOGBIAS_THROUGHPUT },
{ NULL }
};
static zprop_index_t canmount_table[] = {
{ "off", ZFS_CANMOUNT_OFF },
{ "on", ZFS_CANMOUNT_ON },
{ "noauto", ZFS_CANMOUNT_NOAUTO },
{ NULL }
};
static zprop_index_t cache_table[] = {
{ "none", ZFS_CACHE_NONE },
{ "metadata", ZFS_CACHE_METADATA },
{ "all", ZFS_CACHE_ALL },
{ NULL }
};
static zprop_index_t sync_table[] = {
{ "standard", ZFS_SYNC_STANDARD },
{ "always", ZFS_SYNC_ALWAYS },
{ "disabled", ZFS_SYNC_DISABLED },
{ NULL }
};
static zprop_index_t volmode_table[] = {
{ "default", ZFS_VOLMODE_DEFAULT },
{ "geom", ZFS_VOLMODE_GEOM },
{ "dev", ZFS_VOLMODE_DEV },
{ "none", ZFS_VOLMODE_NONE },
{ NULL }
};
static zprop_index_t redundant_metadata_table[] = {
{ "all", ZFS_REDUNDANT_METADATA_ALL },
{ "most", ZFS_REDUNDANT_METADATA_MOST },
{ NULL }
};
/* inherit index properties */
zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
ZFS_REDUNDANT_METADATA_ALL,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"all | most", "REDUND_MD",
redundant_metadata_table);
zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"standard | always | disabled", "SYNC",
sync_table);
zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_VOLUME,
"on | off | fletcher2 | fletcher4 | sha256 | sha512 | "
"skein", "CHECKSUM", checksum_table);
zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"on | off | verify | sha256[,verify], sha512[,verify], "
"skein[,verify]", "DEDUP", dedup_table);
zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"on | off | lzjb | gzip | gzip-[1-9] | zle | lz4",
"COMPRESS", compress_table);
zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
"hidden | visible", "SNAPDIR", snapdir_table);
zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
"discard | groupmask | passthrough | restricted", "ACLMODE",
acl_mode_table);
zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
"discard | noallow | restricted | passthrough | passthrough-x",
"ACLINHERIT", acl_inherit_table);
zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"1 | 2 | 3", "COPIES", copies_table);
zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
ZFS_CACHE_ALL, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
"all | none | metadata", "PRIMARYCACHE", cache_table);
zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
ZFS_CACHE_ALL, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
"all | none | metadata", "SECONDARYCACHE", cache_table);
zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"latency | throughput", "LOGBIAS", logbias_table);
zprop_register_index(ZFS_PROP_VOLMODE, "volmode",
ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
"default | geom | dev | none", "VOLMODE", volmode_table);
/* inherit index (boolean) properties */
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
boolean_table);
zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
boolean_table);
zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
boolean_table);
zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
boolean_table);
zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
boolean_table);
zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
boolean_table);
zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
boolean_table);
/* default index properties */
zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table);
zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
"CANMOUNT", canmount_table);
/* readonly index (boolean) properties */
zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
boolean_table);
/* set once index properties */
zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"none | formC | formD | formKC | formKD", "NORMALIZATION",
normalize_table);
zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_SNAPSHOT,
"sensitive | insensitive | mixed", "CASE", case_table);
/* set once index (boolean) properties */
zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"on | off", "UTF8ONLY", boolean_table);
/* string properties */
zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY,
ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES");
zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
"MOUNTPOINT");
zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
"SHARENFS");
zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
"filesystem | volume | snapshot | bookmark", "TYPE");
zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
"on | off | sharemgr(1M) options", "SHARESMB");
zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
"<sensitivity label>", "MLSLABEL");
zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
"receive_resume_token",
NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<string token>", "RESUMETOK");
/* readonly number properties */
zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
ZFS_TYPE_DATASET, "<size>", "USED");
zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "REFER");
zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
PROP_READONLY, ZFS_TYPE_DATASET,
"<1.00x or higher if compressed>", "RATIO");
zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0,
PROP_READONLY, ZFS_TYPE_DATASET,
"<1.00x or higher if compressed>", "REFRATIO");
zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
"USEDSNAP");
zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
"USEDDS");
zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
"USEDCHILD");
zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
PROP_READONLY,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY,
ZFS_TYPE_DATASET, "<size>", "WRITTEN");
zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0,
PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
"LUSED");
zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced",
0, PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LREFER");
/* default number properties */
zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<size> | none", "RESERV");
zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<size> | none", "REFRESERV");
zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
"<count> | none", "FSLIMIT");
zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<count> | none", "SSLIMIT");
zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
"<count>", "FSCOUNT");
zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<count>", "SSCOUNT");
/* inherit number properties */
zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
/* hidden properties */
zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG");
+ zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG");
zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME");
zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
"STMF_SBD_LU");
zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "GUID");
zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
"USERACCOUNTING");
zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent",
PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");
/* oddball properties */
zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
"<date>", "CREATION", B_FALSE, B_TRUE, NULL);
}
boolean_t
zfs_prop_delegatable(zfs_prop_t prop)
{
zprop_desc_t *pd = &zfs_prop_table[prop];
/* The mlslabel property is never delegatable. */
if (prop == ZFS_PROP_MLSLABEL)
return (B_FALSE);
return (pd->pd_attr != PROP_READONLY);
}
/*
* Given a zfs dataset property name, returns the corresponding property ID.
*/
zfs_prop_t
zfs_name_to_prop(const char *propname)
{
return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET));
}
/*
* For user property names, we allow all lowercase alphanumeric characters, plus
* a few useful punctuation characters.
*/
static int
valid_char(char c)
{
return ((c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9') ||
c == '-' || c == '_' || c == '.' || c == ':');
}
/*
* Returns true if this is a valid user-defined property (one with a ':').
*/
boolean_t
zfs_prop_user(const char *name)
{
int i;
char c;
boolean_t foundsep = B_FALSE;
for (i = 0; i < strlen(name); i++) {
c = name[i];
if (!valid_char(c))
return (B_FALSE);
if (c == ':')
foundsep = B_TRUE;
}
if (!foundsep)
return (B_FALSE);
return (B_TRUE);
}
/*
* Returns true if this is a valid userspace-type property (one with a '@').
* Note that after the @, any character is valid (eg, another @, for SID
* user@domain).
*/
boolean_t
zfs_prop_userquota(const char *name)
{
zfs_userquota_prop_t prop;
for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) {
if (strncmp(name, zfs_userquota_prop_prefixes[prop],
strlen(zfs_userquota_prop_prefixes[prop])) == 0) {
return (B_TRUE);
}
}
return (B_FALSE);
}
/*
* Returns true if this is a valid written@ property.
* Note that after the @, any character is valid (eg, another @, for
* written@pool/fs@origin).
*/
boolean_t
zfs_prop_written(const char *name)
{
static const char *prefix = "written@";
return (strncmp(name, prefix, strlen(prefix)) == 0);
}
/*
* Tables of index types, plus functions to convert between the user view
* (strings) and internal representation (uint64_t).
*/
int
zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
{
return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET));
}
int
zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
{
return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
}
uint64_t
zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
{
return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET));
}
/*
* Returns TRUE if the property applies to any of the given dataset types.
*/
boolean_t
zfs_prop_valid_for_type(int prop, zfs_type_t types)
{
return (zprop_valid_for_type(prop, types));
}
zprop_type_t
zfs_prop_get_type(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_proptype);
}
/*
* Returns TRUE if the property is readonly.
*/
boolean_t
zfs_prop_readonly(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
/*
* Returns TRUE if the property is visible (not hidden).
*/
boolean_t
zfs_prop_visible(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_visible);
}
/*
* Returns TRUE if the property is only allowed to be set once.
*/
boolean_t
zfs_prop_setonce(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
const char *
zfs_prop_default_string(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_strdefault);
}
uint64_t
zfs_prop_default_numeric(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_numdefault);
}
/*
* Given a dataset property ID, returns the corresponding name.
* Assuming the zfs dataset property ID is valid.
*/
const char *
zfs_prop_to_name(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_name);
}
/*
* Returns TRUE if the property is inheritable.
*/
boolean_t
zfs_prop_inheritable(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_attr == PROP_INHERIT ||
zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
#ifndef _KERNEL
/*
* Returns a string describing the set of acceptable values for the given
* zfs property, or NULL if it cannot be set.
*/
const char *
zfs_prop_values(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_values);
}
/*
* Returns TRUE if this property is a string type. Note that index types
* (compression, checksum) are treated as strings in userland, even though they
* are stored numerically on disk.
*/
int
zfs_prop_is_string(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING ||
zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX);
}
/*
* Returns the column header for the given property. Used only in
* 'zfs list -o', but centralized here with the other property information.
*/
const char *
zfs_prop_column_name(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_colname);
}
/*
* Returns whether the given property should be displayed right-justified for
* 'zfs list'.
*/
boolean_t
zfs_prop_align_right(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_rightalign);
}
#endif
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/Makefile.files (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/Makefile.files (revision 332525)
@@ -1,173 +1,177 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
# Copyright (c) 2012 Joyent, Inc. All rights reserved.
# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
#
#
# This Makefile defines all file modules for the directory uts/common
# and its children. These are the source files which may be considered
# common to all SunOS systems.
LUA_OBJS += \
ldo.o \
lvm.o \
lbitlib.o \
lopcodes.o \
lstring.o \
ltable.o \
ltm.o \
lcorolib.o \
lauxlib.o \
ldebug.o \
lstate.o \
lgc.o \
lmem.o \
lctype.o \
lfunc.o \
ldump.o \
lundump.o \
lstrlib.o \
ltablib.o \
lapi.o \
lobject.o \
lbaselib.o \
lcompat.o \
lzio.o \
lcode.o \
llex.o \
lparser.o
ZFS_COMMON_OBJS += \
abd.o \
arc.o \
bplist.o \
blkptr.o \
bpobj.o \
bptree.o \
bqueue.o \
dbuf.o \
ddt.o \
ddt_zap.o \
dmu.o \
dmu_diff.o \
dmu_send.o \
dmu_object.o \
dmu_objset.o \
dmu_traverse.o \
dmu_tx.o \
dnode.o \
dnode_sync.o \
dsl_bookmark.o \
dsl_dir.o \
dsl_dataset.o \
dsl_deadlist.o \
dsl_destroy.o \
dsl_pool.o \
dsl_synctask.o \
dsl_userhold.o \
dmu_zfetch.o \
dsl_deleg.o \
dsl_prop.o \
dsl_scan.o \
zfeature.o \
gzip.o \
lz4.o \
lzjb.o \
metaslab.o \
multilist.o \
range_tree.o \
refcount.o \
rrwlock.o \
sa.o \
sha256.o \
skein_zfs.o \
spa.o \
spa_config.o \
spa_errlog.o \
spa_history.o \
spa_misc.o \
space_map.o \
space_reftree.o \
txg.o \
uberblock.o \
unique.o \
vdev.o \
vdev_cache.o \
vdev_file.o \
+ vdev_indirect.o \
+ vdev_indirect_births.o \
+ vdev_indirect_mapping.o \
vdev_label.o \
vdev_mirror.o \
vdev_missing.o \
vdev_queue.o \
vdev_raidz.o \
+ vdev_removal.o \
vdev_root.o \
zap.o \
zap_leaf.o \
zap_micro.o \
zcp.o \
zcp_get.o \
zcp_global.o \
zcp_iter.o \
zcp_synctask.o \
zfs_byteswap.o \
zfs_debug.o \
zfs_fm.o \
zfs_fuid.o \
zfs_sa.o \
zfs_znode.o \
zil.o \
zio.o \
zio_checksum.o \
zio_compress.o \
zio_inject.o \
zle.o \
zrlock.o
ZFS_SHARED_OBJS += \
zfeature_common.o \
zfs_comutil.o \
zfs_deleg.o \
zfs_fletcher.o \
zfs_namecheck.o \
zfs_prop.o \
zpool_prop.o \
zprop_common.o
ZFS_OBJS += \
$(ZFS_COMMON_OBJS) \
$(ZFS_SHARED_OBJS) \
zfs_acl.o \
zfs_ctldir.o \
zfs_dir.o \
zfs_ioctl.o \
zfs_ioctl_compat.o \
zfs_log.o \
zfs_onexit.o \
zfs_replay.o \
zfs_rlock.o \
zfs_vfsops.o \
zfs_vnops.o \
zvol.o
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 332525)
@@ -1,7857 +1,7857 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
*/
/*
* DVA-based Adjustable Replacement Cache
*
* While much of the theory of operation used here is
* based on the self-tuning, low overhead replacement cache
* presented by Megiddo and Modha at FAST 2003, there are some
* significant differences:
*
* 1. The Megiddo and Modha model assumes any page is evictable.
* Pages in its cache cannot be "locked" into memory. This makes
* the eviction algorithm simple: evict the last page in the list.
* This also make the performance characteristics easy to reason
* about. Our cache is not so simple. At any given moment, some
* subset of the blocks in the cache are un-evictable because we
* have handed out a reference to them. Blocks are only evictable
* when there are no external references active. This makes
* eviction far more problematic: we choose to evict the evictable
* blocks that are the "lowest" in the list.
*
* There are times when it is not possible to evict the requested
* space. In these circumstances we are unable to adjust the cache
* size. To prevent the cache growing unbounded at these times we
* implement a "cache throttle" that slows the flow of new data
* into the cache until we can make space available.
*
* 2. The Megiddo and Modha model assumes a fixed cache size.
* Pages are evicted when the cache is full and there is a cache
* miss. Our model has a variable sized cache. It grows with
* high use, but also tries to react to memory pressure from the
* operating system: decreasing its size when system memory is
* tight.
*
* 3. The Megiddo and Modha model assumes a fixed page size. All
* elements of the cache are therefore exactly the same size. So
* when adjusting the cache size following a cache miss, its simply
* a matter of choosing a single page to evict. In our model, we
* have variable sized cache blocks (rangeing from 512 bytes to
* 128K bytes). We therefore choose a set of blocks to evict to make
* space for a cache miss that approximates as closely as possible
* the space used by the new block.
*
* See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
* by N. Megiddo & D. Modha, FAST 2003
*/
/*
* The locking model:
*
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
* uses method 1, while the internal ARC algorithms for
* adjusting the cache use method 2. We therefore provide two
* types of locks: 1) the hash table lock array, and 2) the
* ARC list locks.
*
* Buffers do not have their own mutexes, rather they rely on the
* hash table mutexes for the bulk of their protection (i.e. most
* fields in the arc_buf_hdr_t are protected by these mutexes).
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
* NULL for the mutex if the buffer was not in the table.
*
* buf_hash_remove() expects the appropriate hash mutex to be
* already held before it is invoked.
*
* Each ARC state also has a mutex which is used to protect the
* buffer list associated with the state. When attempting to
* obtain a hash table lock while holding an ARC list lock you
* must use: mutex_tryenter() to avoid deadlock. Also note that
* the active state mutex must be held before the ghost state mutex.
*
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
* The L2ARC uses the l2ad_mtx on each vdev for the following:
*
* - L2ARC buflist creation
* - L2ARC buflist eviction
* - L2ARC write completion, which walks L2ARC buflists
* - ARC header destruction, as it removes from L2ARC buflists
* - ARC header release, as it removes from L2ARC buflists
*/
/*
* ARC operation:
*
* Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
* This structure can point either to a block that is still in the cache or to
* one that is only accessible in an L2 ARC device, or it can provide
* information about a block that was recently evicted. If a block is
* only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
* information to retrieve it from the L2ARC device. This information is
* stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
* that is in this state cannot access the data directly.
*
* Blocks that are actively being referenced or have not been evicted
* are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
* the arc_buf_hdr_t that will point to the data block in memory. A block can
* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
* caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
* also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
*
* The L1ARC's data pointer may or may not be uncompressed. The ARC has the
* ability to store the physical data (b_pabd) associated with the DVA of the
* arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
* it will match its on-disk compression characteristics. This behavior can be
* disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
* compressed ARC functionality is disabled, the b_pabd will point to an
* uncompressed version of the on-disk data.
*
* Data in the L1ARC is not accessed by consumers of the ARC directly. Each
* arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
* Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
* consumer. The ARC will provide references to this data and will keep it
* cached until it is no longer in use. The ARC caches only the L1ARC's physical
* data block and will evict any arc_buf_t that is no longer referenced. The
* amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
* "overhead_size" kstat.
*
* Depending on the consumer, an arc_buf_t can be requested in uncompressed or
* compressed form. The typical case is that consumers will want uncompressed
* data, and when that happens a new data buffer is allocated where the data is
* decompressed for them to use. Currently the only consumer who wants
* compressed arc_buf_t's is "zfs send", when it streams data exactly as it
* exists on disk. When this happens, the arc_buf_t's data buffer is shared
* with the arc_buf_hdr_t.
*
* Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
* first one is owned by a compressed send consumer (and therefore references
* the same compressed data buffer as the arc_buf_hdr_t) and the second could be
* used by any other consumer (and has its own uncompressed copy of the data
* buffer).
*
* arc_buf_hdr_t
* +-----------+
* | fields |
* | common to |
* | L1- and |
* | L2ARC |
* +-----------+
* | l2arc_buf_hdr_t
* | |
* +-----------+
* | l1arc_buf_hdr_t
* | | arc_buf_t
* | b_buf +------------>+-----------+ arc_buf_t
* | b_pabd +-+ |b_next +---->+-----------+
* +-----------+ | |-----------| |b_next +-->NULL
* | |b_comp = T | +-----------+
* | |b_data +-+ |b_comp = F |
* | +-----------+ | |b_data +-+
* +->+------+ | +-----------+ |
* compressed | | | |
* data | |<--------------+ | uncompressed
* +------+ compressed, | data
* shared +-->+------+
* data | |
* | |
* +------+
*
* When a consumer reads a block, the ARC must first look to see if the
* arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
* arc_buf_t and either copies uncompressed data into a new data buffer from an
* existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
* new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
* hdr is compressed and the desired compression characteristics of the
* arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
* arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
* the last buffer in the hdr's b_buf list, however a shared compressed buf can
* be anywhere in the hdr's list.
*
* The diagram below shows an example of an uncompressed ARC hdr that is
* sharing its data with an arc_buf_t (note that the shared uncompressed buf is
* the last element in the buf list):
*
* arc_buf_hdr_t
* +-----------+
* | |
* | |
* | |
* +-----------+
* l2arc_buf_hdr_t| |
* | |
* +-----------+
* l1arc_buf_hdr_t| |
* | | arc_buf_t (shared)
* | b_buf +------------>+---------+ arc_buf_t
* | | |b_next +---->+---------+
* | b_pabd +-+ |---------| |b_next +-->NULL
* +-----------+ | | | +---------+
* | |b_data +-+ | |
* | +---------+ | |b_data +-+
* +->+------+ | +---------+ |
* | | | |
* uncompressed | | | |
* data +------+ | |
* ^ +->+------+ |
* | uncompressed | | |
* | data | | |
* | +------+ |
* +---------------------------------+
*
* Writing to the ARC requires that the ARC first discard the hdr's b_pabd
* since the physical block is about to be rewritten. The new data contents
* will be contained in the arc_buf_t. As the I/O pipeline performs the write,
* it may compress the data before writing it to disk. The ARC will be called
* with the transformed data and will bcopy the transformed on-disk block into
* a newly allocated b_pabd. Writes are always done into buffers which have
* either been loaned (and hence are new and don't have other readers) or
* buffers which have been released (and hence have their own hdr, if there
* were originally other readers of the buf's original hdr). This ensures that
* the ARC only needs to update a single buf and its hdr after a write occurs.
*
* When the L2ARC is in use, it will also take advantage of the b_pabd. The
* L2ARC will always write the contents of b_pabd to the L2ARC. This means
* that when compressed ARC is enabled that the L2ARC blocks are identical
* to the on-disk block in the main data pool. This provides a significant
* advantage since the ARC can leverage the bp's checksum when reading from the
* L2ARC to determine if the contents are valid. However, if the compressed
* ARC is disabled, then the L2ARC's block must be transformed to look
* like the physical block in the main data pool before comparing the
* checksum and determining its validity.
*/
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/multilist.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>
#include <machine/vmparam.h>
#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */
static kmutex_t arc_reclaim_lock;
static kcondvar_t arc_reclaim_thread_cv;
static boolean_t arc_reclaim_thread_exit;
static kcondvar_t arc_reclaim_waiters_cv;
static kmutex_t arc_dnlc_evicts_lock;
static kcondvar_t arc_dnlc_evicts_cv;
static boolean_t arc_dnlc_evicts_thread_exit;
uint_t arc_reduce_dnlc_percent = 3;
/*
* The number of headers to evict in arc_evict_state_impl() before
* dropping the sublist lock and evicting from another sublist. A lower
* value means we're more likely to evict the "correct" header (i.e. the
* oldest header in the arc state), but comes with higher overhead
* (i.e. more invocations of arc_evict_state_impl()).
*/
int zfs_arc_evict_batch_limit = 10;
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int zfs_arc_overflow_shift = 8;
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 7;
/*
* log2(fraction of ARC which must be free to allow growing).
* I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
* when reading a new block into the ARC, we will evict an equal-sized block
* from the ARC.
*
* This must be less than arc_shrink_shift, so that when we shrink the ARC,
* we will still not allow it to grow.
*/
int arc_no_grow_shift = 5;
/*
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
static int arc_min_prefetch_lifespan;
/*
* If this percent of memory is free, don't throttle.
*/
int arc_lotsfree_percent = 10;
static int arc_dead;
extern boolean_t zfs_prefetch_disable;
/*
* The arc has filled available memory and has now warmed up.
*/
static boolean_t arc_warm;
/*
* log2 fraction of the zio arena to keep free.
*/
int arc_zio_arena_free_shift = 2;
/*
* These tunables are for performance analysis.
*/
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_no_grow_shift = 0;
int zfs_arc_p_min_shift = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;
/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;
boolean_t zfs_compressed_arc_enabled = B_TRUE;
static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);
#if defined(__FreeBSD__) && defined(_KERNEL)
static void
arc_free_target_init(void *unused __unused)
{
zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
arc_free_target_init, NULL);
TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
"log2(fraction of ARC which must be free to allow growing)");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
&zfs_arc_average_blocksize, 0,
"ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
&arc_shrink_shift, 0,
"log2(fraction of arc to reclaim)");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
&arc_grow_retry, 0,
"Wait in seconds before considering growing ARC");
SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
&zfs_compressed_arc_enabled, 0, "Enable compressed ARC");
/*
* We don't have a tunable for arc_free_target due to the dependency on
* pagedaemon initialisation.
*/
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
sysctl_vfs_zfs_arc_free_target, "IU",
"Desired number of free pages below which ARC triggers reclaim");
static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
u_int val;
int err;
val = zfs_arc_free_target;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val < minfree)
return (EINVAL);
if (val > vm_cnt.v_page_count)
return (EINVAL);
zfs_arc_free_target = val;
return (0);
}
/*
* Must be declared here, before the definition of corresponding kstat
* macro which uses the same names will confuse the compiler.
*/
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
sysctl_vfs_zfs_arc_meta_limit, "QU",
"ARC metadata limit");
#endif
/*
* Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
* ARC_mru - recently used, currently cached
* ARC_mru_ghost - recentely used, no longer in cache
* ARC_mfu - frequently used, currently cached
* ARC_mfu_ghost - frequently used, no longer in cache
* ARC_l2c_only - exists in L2ARC but not other states
* When there are no active references to the buffer, they are
* are linked onto a list in one of these arc states. These are
* the only buffers that can be evicted or deleted. Within each
* state there are multiple lists, one for meta-data and one for
* non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
* etc.) is tracked separately so that it can be managed more
* explicitly: favored over data, limited explicitly.
*
* Anonymous buffers are buffers that are not associated with
* a DVA. These are buffers that hold dirty block copies
* before they are written to stable storage. By definition,
* they are "ref'd" and are considered part of arc_mru
* that cannot be freed. Generally, they will aquire a DVA
* as they are written and migrate onto the arc_mru list.
*
* The ARC_l2c_only state is for buffers that are in the second
* level ARC but no longer in any of the ARC_m* lists. The second
* level ARC itself may also contain buffers that are in any of
* the ARC_m* states - meaning that a buffer can exist in two
* places. The reason for the ARC_l2c_only state is to keep the
* buffer header in the hash table, so that reads that hit the
* second level ARC benefit from these fast lookups.
*/
typedef struct arc_state {
/*
* list of evictable buffers
*/
multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
/*
* total amount of evictable data in this state
*/
refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
/*
* total amount of data in this state; this includes: evictable,
* non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
*/
refcount_t arcs_size;
} arc_state_t;
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;
typedef struct arc_stats {
kstat_named_t arcstat_hits;
kstat_named_t arcstat_misses;
kstat_named_t arcstat_demand_data_hits;
kstat_named_t arcstat_demand_data_misses;
kstat_named_t arcstat_demand_metadata_hits;
kstat_named_t arcstat_demand_metadata_misses;
kstat_named_t arcstat_prefetch_data_hits;
kstat_named_t arcstat_prefetch_data_misses;
kstat_named_t arcstat_prefetch_metadata_hits;
kstat_named_t arcstat_prefetch_metadata_misses;
kstat_named_t arcstat_mru_hits;
kstat_named_t arcstat_mru_ghost_hits;
kstat_named_t arcstat_mfu_hits;
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_allocated;
kstat_named_t arcstat_deleted;
/*
* Number of buffers that could not be evicted because the hash lock
* was held by another thread. The lock may not necessarily be held
* by something using the same buffer, since hash locks are shared
* by multiple buffers.
*/
kstat_named_t arcstat_mutex_miss;
/*
* Number of buffers skipped because they have I/O in progress, are
* indrect prefetch buffers that have not lived long enough, or are
* not from the spa we're trying to evict from.
*/
kstat_named_t arcstat_evict_skip;
/*
* Number of times arc_evict_state() was unable to evict enough
* buffers to reach it's target amount.
*/
kstat_named_t arcstat_evict_not_enough;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
kstat_named_t arcstat_evict_l2_ineligible;
kstat_named_t arcstat_evict_l2_skip;
kstat_named_t arcstat_hash_elements;
kstat_named_t arcstat_hash_elements_max;
kstat_named_t arcstat_hash_collisions;
kstat_named_t arcstat_hash_chains;
kstat_named_t arcstat_hash_chain_max;
kstat_named_t arcstat_p;
kstat_named_t arcstat_c;
kstat_named_t arcstat_c_min;
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
/*
* Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
* Note that the compressed bytes may match the uncompressed bytes
* if the block is either not compressed or compressed arc is disabled.
*/
kstat_named_t arcstat_compressed_size;
/*
* Uncompressed size of the data stored in b_pabd. If compressed
* arc is disabled then this value will be identical to the stat
* above.
*/
kstat_named_t arcstat_uncompressed_size;
/*
* Number of bytes stored in all the arc_buf_t's. This is classified
* as "overhead" since this data is typically short-lived and will
* be evicted from the arc when it becomes unreferenced unless the
* zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
* values have been set (see comment in dbuf.c for more information).
*/
kstat_named_t arcstat_overhead_size;
/*
* Number of bytes consumed by internal ARC structures necessary
* for tracking purposes; these structures are not actually
* backed by ARC buffers. This includes arc_buf_hdr_t structures
* (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
* caches), and arc_buf_t structures (allocated via arc_buf_t
* cache).
*/
kstat_named_t arcstat_hdr_size;
/*
* Number of bytes consumed by ARC buffers of type equal to
* ARC_BUFC_DATA. This is generally consumed by buffers backing
* on disk user data (e.g. plain file contents).
*/
kstat_named_t arcstat_data_size;
/*
* Number of bytes consumed by ARC buffers of type equal to
* ARC_BUFC_METADATA. This is generally consumed by buffers
* backing on disk data that is used for internal ZFS
* structures (e.g. ZAP, dnode, indirect blocks, etc).
*/
kstat_named_t arcstat_metadata_size;
/*
* Number of bytes consumed by various buffers and structures
* not actually backed with ARC buffers. This includes bonus
* buffers (allocated directly via zio_buf_* functions),
* dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
* cache), and dnode_t structures (allocated via dnode_t cache).
*/
kstat_named_t arcstat_other_size;
/*
* Total number of bytes consumed by ARC buffers residing in the
* arc_anon state. This includes *all* buffers in the arc_anon
* state; e.g. data, metadata, evictable, and unevictable buffers
* are all included in this value.
*/
kstat_named_t arcstat_anon_size;
/*
* Number of bytes consumed by ARC buffers that meet the
* following criteria: backing buffers of type ARC_BUFC_DATA,
* residing in the arc_anon state, and are eligible for eviction
* (e.g. have no outstanding holds on the buffer).
*/
kstat_named_t arcstat_anon_evictable_data;
/*
* Number of bytes consumed by ARC buffers that meet the
* following criteria: backing buffers of type ARC_BUFC_METADATA,
* residing in the arc_anon state, and are eligible for eviction
* (e.g. have no outstanding holds on the buffer).
*/
kstat_named_t arcstat_anon_evictable_metadata;
/*
* Total number of bytes consumed by ARC buffers residing in the
* arc_mru state. This includes *all* buffers in the arc_mru
* state; e.g. data, metadata, evictable, and unevictable buffers
* are all included in this value.
*/
kstat_named_t arcstat_mru_size;
/*
* Number of bytes consumed by ARC buffers that meet the
* following criteria: backing buffers of type ARC_BUFC_DATA,
* residing in the arc_mru state, and are eligible for eviction
* (e.g. have no outstanding holds on the buffer).
*/
kstat_named_t arcstat_mru_evictable_data;
/*
* Number of bytes consumed by ARC buffers that meet the
* following criteria: backing buffers of type ARC_BUFC_METADATA,
* residing in the arc_mru state, and are eligible for eviction
* (e.g. have no outstanding holds on the buffer).
*/
kstat_named_t arcstat_mru_evictable_metadata;
/*
* Total number of bytes that *would have been* consumed by ARC
* buffers in the arc_mru_ghost state. The key thing to note
* here, is the fact that this size doesn't actually indicate
* RAM consumption. The ghost lists only consist of headers and
* don't actually have ARC buffers linked off of these headers.
* Thus, *if* the headers had associated ARC buffers, these
* buffers *would have* consumed this number of bytes.
*/
kstat_named_t arcstat_mru_ghost_size;
/*
* Number of bytes that *would have been* consumed by ARC
* buffers that are eligible for eviction, of type
* ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
*/
kstat_named_t arcstat_mru_ghost_evictable_data;
/*
* Number of bytes that *would have been* consumed by ARC
* buffers that are eligible for eviction, of type
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
*/
kstat_named_t arcstat_mru_ghost_evictable_metadata;
/*
* Total number of bytes consumed by ARC buffers residing in the
* arc_mfu state. This includes *all* buffers in the arc_mfu
* state; e.g. data, metadata, evictable, and unevictable buffers
* are all included in this value.
*/
kstat_named_t arcstat_mfu_size;
/*
* Number of bytes consumed by ARC buffers that are eligible for
* eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
* state.
*/
kstat_named_t arcstat_mfu_evictable_data;
/*
* Number of bytes consumed by ARC buffers that are eligible for
* eviction, of type ARC_BUFC_METADATA, and reside in the
* arc_mfu state.
*/
kstat_named_t arcstat_mfu_evictable_metadata;
/*
* Total number of bytes that *would have been* consumed by ARC
* buffers in the arc_mfu_ghost state. See the comment above
* arcstat_mru_ghost_size for more details.
*/
kstat_named_t arcstat_mfu_ghost_size;
/*
* Number of bytes that *would have been* consumed by ARC
* buffers that are eligible for eviction, of type
* ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
*/
kstat_named_t arcstat_mfu_ghost_evictable_data;
/*
* Number of bytes that *would have been* consumed by ARC
* buffers that are eligible for eviction, of type
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
*/
kstat_named_t arcstat_mfu_ghost_evictable_metadata;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
kstat_named_t arcstat_l2_feeds;
kstat_named_t arcstat_l2_rw_clash;
kstat_named_t arcstat_l2_read_bytes;
kstat_named_t arcstat_l2_write_bytes;
kstat_named_t arcstat_l2_writes_sent;
kstat_named_t arcstat_l2_writes_done;
kstat_named_t arcstat_l2_writes_error;
kstat_named_t arcstat_l2_writes_lock_retry;
kstat_named_t arcstat_l2_evict_lock_retry;
kstat_named_t arcstat_l2_evict_reading;
kstat_named_t arcstat_l2_evict_l1cached;
kstat_named_t arcstat_l2_free_on_write;
kstat_named_t arcstat_l2_abort_lowmem;
kstat_named_t arcstat_l2_cksum_bad;
kstat_named_t arcstat_l2_io_error;
kstat_named_t arcstat_l2_lsize;
kstat_named_t arcstat_l2_psize;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_l2_write_trylock_fail;
kstat_named_t arcstat_l2_write_passed_headroom;
kstat_named_t arcstat_l2_write_spa_mismatch;
kstat_named_t arcstat_l2_write_in_l2;
kstat_named_t arcstat_l2_write_hdr_io_in_progress;
kstat_named_t arcstat_l2_write_not_cacheable;
kstat_named_t arcstat_l2_write_full;
kstat_named_t arcstat_l2_write_buffer_iter;
kstat_named_t arcstat_l2_write_pios;
kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
kstat_named_t arcstat_l2_write_buffer_list_iter;
kstat_named_t arcstat_l2_write_buffer_list_null_iter;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_meta_used;
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
kstat_named_t arcstat_sync_wait_for_async;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
} arc_stats_t;
static arc_stats_t arc_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "demand_data_hits", KSTAT_DATA_UINT64 },
{ "demand_data_misses", KSTAT_DATA_UINT64 },
{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
{ "mru_hits", KSTAT_DATA_UINT64 },
{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
{ "mfu_hits", KSTAT_DATA_UINT64 },
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
{ "allocated", KSTAT_DATA_UINT64 },
{ "deleted", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
{ "evict_not_enough", KSTAT_DATA_UINT64 },
{ "evict_l2_cached", KSTAT_DATA_UINT64 },
{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
{ "evict_l2_skip", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
{ "hash_elements_max", KSTAT_DATA_UINT64 },
{ "hash_collisions", KSTAT_DATA_UINT64 },
{ "hash_chains", KSTAT_DATA_UINT64 },
{ "hash_chain_max", KSTAT_DATA_UINT64 },
{ "p", KSTAT_DATA_UINT64 },
{ "c", KSTAT_DATA_UINT64 },
{ "c_min", KSTAT_DATA_UINT64 },
{ "c_max", KSTAT_DATA_UINT64 },
{ "size", KSTAT_DATA_UINT64 },
{ "compressed_size", KSTAT_DATA_UINT64 },
{ "uncompressed_size", KSTAT_DATA_UINT64 },
{ "overhead_size", KSTAT_DATA_UINT64 },
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
{ "metadata_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
{ "anon_size", KSTAT_DATA_UINT64 },
{ "anon_evictable_data", KSTAT_DATA_UINT64 },
{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
{ "mru_size", KSTAT_DATA_UINT64 },
{ "mru_evictable_data", KSTAT_DATA_UINT64 },
{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
{ "mru_ghost_size", KSTAT_DATA_UINT64 },
{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
{ "mfu_size", KSTAT_DATA_UINT64 },
{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_rw_clash", KSTAT_DATA_UINT64 },
{ "l2_read_bytes", KSTAT_DATA_UINT64 },
{ "l2_write_bytes", KSTAT_DATA_UINT64 },
{ "l2_writes_sent", KSTAT_DATA_UINT64 },
{ "l2_writes_done", KSTAT_DATA_UINT64 },
{ "l2_writes_error", KSTAT_DATA_UINT64 },
{ "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
{ "l2_free_on_write", KSTAT_DATA_UINT64 },
{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
{ "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
{ "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
{ "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
{ "l2_write_in_l2", KSTAT_DATA_UINT64 },
{ "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
{ "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
{ "l2_write_full", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
{ "l2_write_pios", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 },
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
#define ARCSTAT_INCR(stat, val) \
atomic_add_64(&arc_stats.stat.value.ui64, (val))
#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
#define ARCSTAT_MAX(stat, val) { \
uint64_t m; \
while ((val) > (m = arc_stats.stat.value.ui64) && \
(m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
continue; \
}
#define ARCSTAT_MAXSTAT(stat) \
ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
/*
* We define a macro to allow ARC hits/misses to be easily broken down by
* two separate conditions, giving a total of four different subtypes for
* each of hits and misses (so eight statistics total).
*/
#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
if (cond1) { \
if (cond2) { \
ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
} else { \
ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
} \
} else { \
if (cond2) { \
ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
} else { \
ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
} \
}
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
static arc_state_t *arc_l2c_only;
/*
* There are several ARC variables that are critical to export as kstats --
* but we don't want to have to grovel around in the kstat whenever we wish to
* manipulate them. For these variables, we therefore define them to be in
* terms of the statistic variable. This assures that we are not introducing
* the possibility of inconsistency by having shadow copies of the variables,
* while still allowing the code to be readable.
*/
#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
/* compressed size of entire arc */
#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
/* uncompressed size of entire arc */
#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
/* number of bytes in the arc from arc_buf_t's */
#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
typedef struct arc_callback arc_callback_t;
struct arc_callback {
void *acb_private;
arc_done_func_t *acb_done;
arc_buf_t *acb_buf;
boolean_t acb_compressed;
zio_t *acb_zio_dummy;
arc_callback_t *acb_next;
};
typedef struct arc_write_callback arc_write_callback_t;
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
arc_done_func_t *awcb_children_ready;
arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
/*
* ARC buffers are separated into multiple structs as a memory saving measure:
* - Common fields struct, always defined, and embedded within it:
* - L2-only fields, always allocated but undefined when not in L2ARC
* - L1-only fields, only allocated when in L1ARC
*
* Buffer in L1 Buffer only in L2
* +------------------------+ +------------------------+
* | arc_buf_hdr_t | | arc_buf_hdr_t |
* | | | |
* | | | |
* | | | |
* +------------------------+ +------------------------+
* | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
* | (undefined if L1-only) | | |
* +------------------------+ +------------------------+
* | l1arc_buf_hdr_t |
* | |
* | |
* | |
* | |
* +------------------------+
*
* Because it's possible for the L2ARC to become extremely large, we can wind
* up eating a lot of memory in L2ARC buffer headers, so the size of a header
* is minimized by only allocating the fields necessary for an L1-cached buffer
* when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
* l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
* words in pointers. arc_hdr_realloc() is used to switch a header between
* these two allocation states.
*/
typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
#ifdef ZFS_DEBUG
/*
* Used for debugging with kmem_flags - by allocating and freeing
* b_thawed when the buffer is thawed, we get a record of the stack
* trace that thawed it.
*/
void *b_thawed;
#endif
arc_buf_t *b_buf;
uint32_t b_bufcnt;
/* for waiting on writes to complete */
kcondvar_t b_cv;
uint8_t b_byteswap;
/* protected by arc state mutex */
arc_state_t *b_state;
multilist_node_t b_arc_node;
/* updated atomically */
clock_t b_arc_access;
/* self protecting */
refcount_t b_refcnt;
arc_callback_t *b_acb;
abd_t *b_pabd;
} l1arc_buf_hdr_t;
typedef struct l2arc_dev l2arc_dev_t;
typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
uint64_t b_daddr; /* disk address, offset byte */
list_node_t b_l2node;
} l2arc_buf_hdr_t;
struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
arc_buf_contents_t b_type;
arc_buf_hdr_t *b_hash_next;
arc_flags_t b_flags;
/*
* This field stores the size of the data buffer after
* compression, and is set in the arc's zio completion handlers.
* It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
*
* While the block pointers can store up to 32MB in their psize
* field, we can only store up to 32MB minus 512B. This is due
* to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
* a field of zeros represents 512B in the bp). We can't use a
* bias of 1 since we need to reserve a psize of zero, here, to
* represent holes and embedded blocks.
*
* This isn't a problem in practice, since the maximum size of a
* buffer is limited to 16MB, so we never need to store 32MB in
* this field. Even in the upstream illumos code base, the
* maximum size of a buffer is limited to 16MB.
*/
uint16_t b_psize;
/*
* This field stores the size of the data buffer before
* compression, and cannot change once set. It is in units
* of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
*/
uint16_t b_lsize; /* immutable */
uint64_t b_spa; /* immutable */
/* L2ARC fields. Undefined when not in L2ARC. */
l2arc_buf_hdr_t b_l2hdr;
/* L1ARC fields. Undefined when in l2arc_only state */
l1arc_buf_hdr_t b_l1hdr;
};
#if defined(__FreeBSD__) && defined(_KERNEL)
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
uint64_t val;
int err;
val = arc_meta_limit;
err = sysctl_handle_64(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val <= 0 || val > arc_c_max)
return (EINVAL);
arc_meta_limit = val;
return (0);
}
static int
sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
{
uint32_t val;
int err;
val = arc_no_grow_shift;
err = sysctl_handle_32(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val >= arc_shrink_shift)
return (EINVAL);
arc_no_grow_shift = val;
return (0);
}
static int
sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
{
uint64_t val;
int err;
val = zfs_arc_max;
err = sysctl_handle_64(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (zfs_arc_max == 0) {
/* Loader tunable so blindly set */
zfs_arc_max = val;
return (0);
}
if (val < arc_abs_min || val > kmem_size())
return (EINVAL);
if (val < arc_c_min)
return (EINVAL);
if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
return (EINVAL);
arc_c_max = val;
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
if (zfs_arc_meta_limit == 0) {
/* limit meta-data to 1/4 of the arc capacity */
arc_meta_limit = arc_c_max / 4;
}
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
zfs_arc_max = arc_c;
return (0);
}
static int
sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
{
uint64_t val;
int err;
val = zfs_arc_min;
err = sysctl_handle_64(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (zfs_arc_min == 0) {
/* Loader tunable so blindly set */
zfs_arc_min = val;
return (0);
}
if (val < arc_abs_min || val > arc_c_max)
return (EINVAL);
arc_c_min = val;
if (zfs_arc_meta_min == 0)
arc_meta_min = arc_c_min / 2;
if (arc_c < arc_c_min)
arc_c = arc_c_min;
zfs_arc_min = arc_c_min;
return (0);
}
#endif
#define GHOST_STATE(state) \
((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
(state) == arc_l2c_only)
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_COMPRESSION_ENABLED(hdr) \
((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define HDR_L2_READING(hdr) \
(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
#define HDR_ISTYPE_METADATA(hdr) \
((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
/* For storing compression mode in b_flags */
#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
/*
* Other sizes
*/
#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
/*
* Hash table routines
*/
#define HT_LOCK_PAD CACHE_LINE_SIZE
struct ht_lock {
kmutex_t ht_lock;
#ifdef _KERNEL
unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};
#define BUF_LOCKS 256
typedef struct buf_hash_table {
uint64_t ht_mask;
arc_buf_hdr_t **ht_table;
struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;
static buf_hash_table_t buf_hash_table;
#define BUF_HASH_INDEX(spa, dva, birth) \
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define HDR_LOCK(hdr) \
(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
uint64_t zfs_crc64_table[256];
/*
* Level 2 ARC
*/
#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
#define L2ARC_HEADROOM 2 /* num of writes */
/*
* If we discover during ARC scan any buffers to be compressed, we boost
* our headroom for the next scanning cycle by this percentage multiple.
*/
#define L2ARC_HEADROOM_BOOST 200
#define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
&l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
&l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
&l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
&l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
&l2arc_feed_min_ms, 0, "min interval milliseconds");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
&l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
&l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
&l2arc_norw, 0, "no reads during writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
&ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
&ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
"size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
&ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
"size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
&ARC_mru.arcs_size.rc_count, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
&ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
"size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
&ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
"size of data in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
&ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
&ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
"size of metadata in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
&ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
"size of data in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
&ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
&ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
"size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
&ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
"size of data in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
&ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
&ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
"size of metadata in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
&ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
"size of data in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
&ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
/*
* L2ARC Internals
*/
struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
uint64_t l2ad_start; /* first addr on device */
uint64_t l2ad_end; /* last addr on device */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
kmutex_t l2ad_mtx; /* lock for buffer list */
list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
refcount_t l2ad_alloc; /* allocated bytes */
};
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
static list_t L2ARC_free_on_write; /* free after write buf list */
static list_t *l2arc_free_on_write; /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
static uint64_t l2arc_ndev; /* number of devices */
typedef struct l2arc_read_callback {
arc_buf_hdr_t *l2rcb_hdr; /* read header */
blkptr_t l2rcb_bp; /* original blkptr */
zbookmark_phys_t l2rcb_zb; /* original bookmark */
int l2rcb_flags; /* original flags */
abd_t *l2rcb_abd; /* temporary buffer */
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
abd_t *l2df_abd;
size_t l2df_size;
arc_buf_contents_t l2df_type;
list_node_t l2df_list_node;
} l2arc_data_free_t;
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
static void arc_hdr_free_pabd(arc_buf_hdr_t *);
static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
static void
l2arc_trim(const arc_buf_hdr_t *hdr)
{
l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
ASSERT(HDR_HAS_L2HDR(hdr));
ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
if (HDR_GET_PSIZE(hdr) != 0) {
trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
HDR_GET_PSIZE(hdr), 0);
}
}
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
int i;
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
for (i = 0; i < sizeof (dva_t); i++)
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
crc ^= (spa>>8) ^ birth;
return (crc);
}
#define HDR_EMPTY(hdr) \
((hdr)->b_dva.dva_word[0] == 0 && \
(hdr)->b_dva.dva_word[1] == 0)
#define HDR_EQUAL(spa, dva, birth, hdr) \
((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
hdr->b_dva.dva_word[0] = 0;
hdr->b_dva.dva_word[1] = 0;
hdr->b_birth = 0;
}
static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
const dva_t *dva = BP_IDENTITY(bp);
uint64_t birth = BP_PHYSICAL_BIRTH(bp);
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *hdr;
mutex_enter(hash_lock);
for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
hdr = hdr->b_hash_next) {
if (HDR_EQUAL(spa, dva, birth, hdr)) {
*lockp = hash_lock;
return (hdr);
}
}
mutex_exit(hash_lock);
*lockp = NULL;
return (NULL);
}
/*
* Insert an entry into the hash table. If there is already an element
* equal to elem in the hash table, then the already existing element
* will be returned and the new element will not be inserted.
* Otherwise returns NULL.
* If lockp == NULL, the caller is assumed to already hold the hash lock.
*/
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
{
uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *fhdr;
uint32_t i;
ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
ASSERT(hdr->b_birth != 0);
ASSERT(!HDR_IN_HASH_TABLE(hdr));
if (lockp != NULL) {
*lockp = hash_lock;
mutex_enter(hash_lock);
} else {
ASSERT(MUTEX_HELD(hash_lock));
}
for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
fhdr = fhdr->b_hash_next, i++) {
if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
return (fhdr);
}
hdr->b_hash_next = buf_hash_table.ht_table[idx];
buf_hash_table.ht_table[idx] = hdr;
arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
/* collect some hash table performance data */
if (i > 0) {
ARCSTAT_BUMP(arcstat_hash_collisions);
if (i == 1)
ARCSTAT_BUMP(arcstat_hash_chains);
ARCSTAT_MAX(arcstat_hash_chain_max, i);
}
ARCSTAT_BUMP(arcstat_hash_elements);
ARCSTAT_MAXSTAT(arcstat_hash_elements);
return (NULL);
}
static void
buf_hash_remove(arc_buf_hdr_t *hdr)
{
arc_buf_hdr_t *fhdr, **hdrp;
uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
ASSERT(HDR_IN_HASH_TABLE(hdr));
hdrp = &buf_hash_table.ht_table[idx];
while ((fhdr = *hdrp) != hdr) {
ASSERT3P(fhdr, !=, NULL);
hdrp = &fhdr->b_hash_next;
}
*hdrp = hdr->b_hash_next;
hdr->b_hash_next = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
/* collect some hash table performance data */
ARCSTAT_BUMPDOWN(arcstat_hash_elements);
if (buf_hash_table.ht_table[idx] &&
buf_hash_table.ht_table[idx]->b_hash_next == NULL)
ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}
/*
* Global data structures and functions for the buf kmem cache.
*/
static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
static void
buf_fini(void)
{
int i;
kmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
kmem_cache_destroy(hdr_full_cache);
kmem_cache_destroy(hdr_l2only_cache);
kmem_cache_destroy(buf_cache);
}
/*
* Constructor callback - called when the cache is empty
* and a new buf is requested.
*/
/* ARGSUSED */
static int
hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
arc_buf_hdr_t *hdr = vbuf;
bzero(hdr, HDR_FULL_SIZE);
cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
refcount_create(&hdr->b_l1hdr.b_refcnt);
mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
multilist_link_init(&hdr->b_l1hdr.b_arc_node);
arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
return (0);
}
/* ARGSUSED */
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
arc_buf_hdr_t *hdr = vbuf;
bzero(hdr, HDR_L2ONLY_SIZE);
arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
return (0);
}
/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
arc_buf_t *buf = vbuf;
bzero(buf, sizeof (arc_buf_t));
mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0);
}
/*
* Destructor callback - called when a cached buf is
* no longer required.
*/
/* ARGSUSED */
static void
hdr_full_dest(void *vbuf, void *unused)
{
arc_buf_hdr_t *hdr = vbuf;
ASSERT(HDR_EMPTY(hdr));
cv_destroy(&hdr->b_l1hdr.b_cv);
refcount_destroy(&hdr->b_l1hdr.b_refcnt);
mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}
/* ARGSUSED */
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
arc_buf_hdr_t *hdr = vbuf;
ASSERT(HDR_EMPTY(hdr));
arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}
/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
arc_buf_t *buf = vbuf;
mutex_destroy(&buf->b_evict_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
/*
* Reclaim callback -- invoked when memory is low.
*/
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
dprintf("hdr_recl called\n");
/*
* umem calls the reclaim func when we destroy the buf cache,
* which is after we do arc_fini().
*/
if (!arc_dead)
cv_signal(&arc_reclaim_thread_cv);
}
static void
buf_init(void)
{
uint64_t *ct;
uint64_t hsize = 1ULL << 12;
int i, j;
/*
* The hash table is big enough to fill all of physical memory
* with an average block size of zfs_arc_average_blocksize (default 8K).
* By default, the table will take up
* totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
*/
while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
hsize <<= 1;
retry:
buf_hash_table.ht_mask = hsize - 1;
buf_hash_table.ht_table =
kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
if (buf_hash_table.ht_table == NULL) {
ASSERT(hsize > (1ULL << 8));
hsize >>= 1;
goto retry;
}
hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < 256; i++)
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
for (i = 0; i < BUF_LOCKS; i++) {
mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
NULL, MUTEX_DEFAULT, NULL);
}
}
/*
* This is the size that the buf occupies in memory. If the buf is compressed,
* it will correspond to the compressed size. You should use this method of
* getting the buf size unless you explicitly need the logical size.
*/
int32_t
arc_buf_size(arc_buf_t *buf)
{
return (ARC_BUF_COMPRESSED(buf) ?
HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
}
int32_t
arc_buf_lsize(arc_buf_t *buf)
{
return (HDR_GET_LSIZE(buf->b_hdr));
}
enum zio_compress
arc_get_compression(arc_buf_t *buf)
{
return (ARC_BUF_COMPRESSED(buf) ?
HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
}
#define ARC_MINTIME (hz>>4) /* 62 ms */
static inline boolean_t
arc_buf_is_shared(arc_buf_t *buf)
{
boolean_t shared = (buf->b_data != NULL &&
buf->b_hdr->b_l1hdr.b_pabd != NULL &&
abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
IMPLY(shared, ARC_BUF_SHARED(buf));
IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
/*
* It would be nice to assert arc_can_share() too, but the "hdr isn't
* already being shared" requirement prevents us from doing that.
*/
return (shared);
}
/*
* Free the checksum associated with this header. If there is no checksum, this
* is a no-op.
*/
static inline void
arc_cksum_free(arc_buf_hdr_t *hdr)
{
ASSERT(HDR_HAS_L1HDR(hdr));
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_l1hdr.b_freeze_cksum = NULL;
}
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
/*
* Return true iff at least one of the bufs on hdr is not compressed.
*/
static boolean_t
arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
{
for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
if (!ARC_BUF_COMPRESSED(b)) {
return (B_TRUE);
}
}
return (B_FALSE);
}
/*
* If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
* matches the checksum that is stored in the hdr. If there is no checksum,
* or if the buf is compressed, this is a no-op.
*/
static void
arc_cksum_verify(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
zio_cksum_t zc;
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
if (ARC_BUF_COMPRESSED(buf)) {
ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
arc_hdr_has_uncompressed_buf(hdr));
return;
}
ASSERT(HDR_HAS_L1HDR(hdr));
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
panic("buffer modified while frozen!");
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
static boolean_t
arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
{
enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
boolean_t valid_cksum;
ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
/*
* We rely on the blkptr's checksum to determine if the block
* is valid or not. When compressed arc is enabled, the l2arc
* writes the block to the l2arc just as it appears in the pool.
* This allows us to use the blkptr's checksum to validate the
* data that we just read off of the l2arc without having to store
* a separate checksum in the arc_buf_hdr_t. However, if compressed
* arc is disabled, then the data written to the l2arc is always
* uncompressed and won't match the block as it exists in the main
* pool. When this is the case, we must first compress it if it is
* compressed on the main pool before we can validate the checksum.
*/
if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
uint64_t lsize = HDR_GET_LSIZE(hdr);
uint64_t csize;
abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
csize = zio_compress_data(compress, zio->io_abd,
abd_to_buf(cdata), lsize);
ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
if (csize < HDR_GET_PSIZE(hdr)) {
/*
* Compressed blocks are always a multiple of the
* smallest ashift in the pool. Ideally, we would
* like to round up the csize to the next
* spa_min_ashift but that value may have changed
* since the block was last written. Instead,
* we rely on the fact that the hdr's psize
* was set to the psize of the block when it was
* last written. We set the csize to that value
* and zero out any part that should not contain
* data.
*/
abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
csize = HDR_GET_PSIZE(hdr);
}
zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
}
/*
* Block pointers always store the checksum for the logical data.
* If the block pointer has the gang bit set, then the checksum
* it represents is for the reconstituted data and not for an
* individual gang member. The zio pipeline, however, must be able to
* determine the checksum of each of the gang constituents so it
* treats the checksum comparison differently than what we need
* for l2arc blocks. This prevents us from using the
* zio_checksum_error() interface directly. Instead we must call the
* zio_checksum_error_impl() so that we can ensure the checksum is
* generated using the correct checksum algorithm and accounts for the
* logical I/O size and not just a gang fragment.
*/
valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
zio->io_offset, NULL) == 0);
zio_pop_transforms(zio);
return (valid_cksum);
}
/*
* Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
* checksum and attaches it to the buf's hdr so that we can ensure that the buf
* isn't modified later on. If buf is compressed or there is already a checksum
* on the hdr, this is a no-op (we only checksum uncompressed bufs).
*/
static void
arc_cksum_compute(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
ASSERT(HDR_HAS_L1HDR(hdr));
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
ASSERT(arc_hdr_has_uncompressed_buf(hdr));
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
} else if (ARC_BUF_COMPRESSED(buf)) {
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
ASSERT(!ARC_BUF_COMPRESSED(buf));
hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
KM_SLEEP);
fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
hdr->b_l1hdr.b_freeze_cksum);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
arc_buf_watch(buf);
#endif
}
#ifdef illumos
#ifndef _KERNEL
typedef struct procctl {
long cmd;
prwatch_t prwatch;
} procctl_t;
#endif
/* ARGSUSED */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
#ifndef _KERNEL
if (arc_watch) {
int result;
procctl_t ctl;
ctl.cmd = PCWATCH;
ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
ctl.prwatch.pr_size = 0;
ctl.prwatch.pr_wflags = 0;
result = write(arc_procfd, &ctl, sizeof (ctl));
ASSERT3U(result, ==, sizeof (ctl));
}
#endif
}
/* ARGSUSED */
static void
arc_buf_watch(arc_buf_t *buf)
{
#ifndef _KERNEL
if (arc_watch) {
int result;
procctl_t ctl;
ctl.cmd = PCWATCH;
ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
ctl.prwatch.pr_size = arc_buf_size(buf);
ctl.prwatch.pr_wflags = WA_WRITE;
result = write(arc_procfd, &ctl, sizeof (ctl));
ASSERT3U(result, ==, sizeof (ctl));
}
#endif
}
#endif /* illumos */
static arc_buf_contents_t
arc_buf_type(arc_buf_hdr_t *hdr)
{
arc_buf_contents_t type;
if (HDR_ISTYPE_METADATA(hdr)) {
type = ARC_BUFC_METADATA;
} else {
type = ARC_BUFC_DATA;
}
VERIFY3U(hdr->b_type, ==, type);
return (type);
}
boolean_t
arc_is_metadata(arc_buf_t *buf)
{
return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
}
static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)
{
switch (type) {
case ARC_BUFC_DATA:
/* metadata field is 0 if buffer contains normal data */
return (0);
case ARC_BUFC_METADATA:
return (ARC_FLAG_BUFC_METADATA);
default:
break;
}
panic("undefined ARC buffer type!");
return ((uint32_t)-1);
}
void
arc_buf_thaw(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_cksum_verify(buf);
/*
* Compressed buffers do not manipulate the b_freeze_cksum or
* allocate b_thawed.
*/
if (ARC_BUF_COMPRESSED(buf)) {
ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
arc_hdr_has_uncompressed_buf(hdr));
return;
}
ASSERT(HDR_HAS_L1HDR(hdr));
arc_cksum_free(hdr);
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
#ifdef ZFS_DEBUG
if (zfs_flags & ZFS_DEBUG_MODIFY) {
if (hdr->b_l1hdr.b_thawed != NULL)
kmem_free(hdr->b_l1hdr.b_thawed, 1);
hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
}
#endif
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
arc_buf_unwatch(buf);
#endif
}
void
arc_buf_freeze(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
kmutex_t *hash_lock;
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
if (ARC_BUF_COMPRESSED(buf)) {
ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
arc_hdr_has_uncompressed_buf(hdr));
return;
}
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
hdr->b_l1hdr.b_state == arc_anon);
arc_cksum_compute(buf);
mutex_exit(hash_lock);
}
/*
* The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
* the following functions should be used to ensure that the flags are
* updated in a thread-safe way. When manipulating the flags either
* the hash_lock must be held or the hdr must be undiscoverable. This
* ensures that we're not racing with any other threads when updating
* the flags.
*/
static inline void
arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
hdr->b_flags |= flags;
}
static inline void
arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
hdr->b_flags &= ~flags;
}
/*
* Setting the compression bits in the arc_buf_hdr_t's b_flags is
* done in a special way since we have to clear and set bits
* at the same time. Consumers that wish to set the compression bits
* must use this function to ensure that the flags are updated in
* thread-safe manner.
*/
static void
arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
{
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
* Holes and embedded blocks will always have a psize = 0 so
* we ignore the compression of the blkptr and set the
* arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
* Holes and embedded blocks remain anonymous so we don't
* want to uncompress them. Mark them as uncompressed.
*/
if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
HDR_SET_COMPRESS(hdr, cmp);
ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
ASSERT(HDR_COMPRESSION_ENABLED(hdr));
}
}
/*
* Looks for another buf on the same hdr which has the data decompressed, copies
* from it, and returns true. If no such buf exists, returns false.
*/
static boolean_t
arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
boolean_t copied = B_FALSE;
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3P(buf->b_data, !=, NULL);
ASSERT(!ARC_BUF_COMPRESSED(buf));
for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
from = from->b_next) {
/* can't use our own data buffer */
if (from == buf) {
continue;
}
if (!ARC_BUF_COMPRESSED(from)) {
bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
copied = B_TRUE;
break;
}
}
/*
* There were no decompressed bufs, so there should not be a
* checksum on the hdr either.
*/
EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
return (copied);
}
/*
* Given a buf that has a data buffer attached to it, this function will
* efficiently fill the buf with data of the specified compression setting from
* the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
* are already sharing a data buf, no copy is performed.
*
* If the buf is marked as compressed but uncompressed data was requested, this
* will allocate a new data buffer for the buf, remove that flag, and fill the
* buf with uncompressed data. You can't request a compressed buf on a hdr with
* uncompressed data, and (since we haven't added support for it yet) if you
* want compressed data your buf must already be marked as compressed and have
* the correct-sized data buffer.
*/
static int
arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
ASSERT3P(buf->b_data, !=, NULL);
IMPLY(compressed, hdr_compressed);
IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
if (hdr_compressed == compressed) {
if (!arc_buf_is_shared(buf)) {
abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
arc_buf_size(buf));
}
} else {
ASSERT(hdr_compressed);
ASSERT(!compressed);
ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
/*
* If the buf is sharing its data with the hdr, unlink it and
* allocate a new data buffer for the buf.
*/
if (arc_buf_is_shared(buf)) {
ASSERT(ARC_BUF_COMPRESSED(buf));
/* We need to give the buf it's own b_data */
buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
buf->b_data =
arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
/* Previously overhead was 0; just add new overhead */
ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
} else if (ARC_BUF_COMPRESSED(buf)) {
/* We need to reallocate the buf's b_data */
arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
buf);
buf->b_data =
arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
/* We increased the size of b_data; update overhead */
ARCSTAT_INCR(arcstat_overhead_size,
HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
}
/*
* Regardless of the buf's previous compression settings, it
* should not be compressed at the end of this function.
*/
buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
/*
* Try copying the data from another buf which already has a
* decompressed version. If that's not possible, it's time to
* bite the bullet and decompress the data from the hdr.
*/
if (arc_buf_try_copy_decompressed_data(buf)) {
/* Skip byteswapping and checksumming (already done) */
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
return (0);
} else {
int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, buf->b_data,
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
/*
* Absent hardware errors or software bugs, this should
* be impossible, but log it anyway so we can debug it.
*/
if (error != 0) {
zfs_dbgmsg(
"hdr %p, compress %d, psize %d, lsize %d",
hdr, HDR_GET_COMPRESS(hdr),
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
return (SET_ERROR(EIO));
}
}
}
/* Byteswap the buf's data if necessary */
if (bswap != DMU_BSWAP_NUMFUNCS) {
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
}
/* Compute the hdr's checksum if necessary */
arc_cksum_compute(buf);
return (0);
}
int
arc_decompress(arc_buf_t *buf)
{
return (arc_buf_fill(buf, B_FALSE));
}
/*
* Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
*/
static uint64_t
arc_hdr_size(arc_buf_hdr_t *hdr)
{
uint64_t size;
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
HDR_GET_PSIZE(hdr) > 0) {
size = HDR_GET_PSIZE(hdr);
} else {
ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
size = HDR_GET_LSIZE(hdr);
}
return (size);
}
/*
* Increment the amount of evictable space in the arc_state_t's refcount.
* We account for the space used by the hdr and the arc buf individually
* so that we can add and remove them from the refcount individually.
*/
static void
arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
{
arc_buf_contents_t type = arc_buf_type(hdr);
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_add_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
}
ASSERT(!GHOST_STATE(state));
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
if (arc_buf_is_shared(buf))
continue;
(void) refcount_add_many(&state->arcs_esize[type],
arc_buf_size(buf), buf);
}
}
/*
* Decrement the amount of evictable space in the arc_state_t's refcount.
* We account for the space used by the hdr and the arc buf individually
* so that we can add and remove them from the refcount individually.
*/
static void
arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
{
arc_buf_contents_t type = arc_buf_type(hdr);
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_remove_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
}
ASSERT(!GHOST_STATE(state));
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_remove_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
if (arc_buf_is_shared(buf))
continue;
(void) refcount_remove_many(&state->arcs_esize[type],
arc_buf_size(buf), buf);
}
}
/*
* Add a reference to this hdr indicating that someone is actively
* referencing that memory. When the refcount transitions from 0 to 1,
* we remove it from the respective arc_state_t list to indicate that
* it is not evictable.
*/
static void
add_reference(arc_buf_hdr_t *hdr, void *tag)
{
ASSERT(HDR_HAS_L1HDR(hdr));
if (!MUTEX_HELD(HDR_LOCK(hdr))) {
ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
}
arc_state_t *state = hdr->b_l1hdr.b_state;
if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
(state != arc_anon)) {
/* We don't use the L2-only state list. */
if (state != arc_l2c_only) {
multilist_remove(state->arcs_list[arc_buf_type(hdr)],
hdr);
arc_evictable_space_decrement(hdr, state);
}
/* remove the prefetch flag if we get a reference */
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
}
}
/*
* Remove a reference from this hdr. When the reference transitions from
* 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
* list making it eligible for eviction.
*/
static int
remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
int cnt;
arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
ASSERT(!GHOST_STATE(state));
/*
* arc_l2c_only counts as a ghost state so we don't need to explicitly
* check to prevent usage of the arc_l2c_only list.
*/
if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
arc_evictable_space_increment(hdr, state);
}
return (cnt);
}
/*
* Move the supplied buffer to the indicated state. The hash lock
* for the buffer must be held by the caller.
*/
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
kmutex_t *hash_lock)
{
arc_state_t *old_state;
int64_t refcnt;
uint32_t bufcnt;
boolean_t update_old, update_new;
arc_buf_contents_t buftype = arc_buf_type(hdr);
/*
* We almost always have an L1 hdr here, since we call arc_hdr_realloc()
* in arc_read() when bringing a buffer out of the L2ARC. However, the
* L1 hdr doesn't always exist when we change state to arc_anon before
* destroying a header, in which case reallocating to add the L1 hdr is
* pointless.
*/
if (HDR_HAS_L1HDR(hdr)) {
old_state = hdr->b_l1hdr.b_state;
refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
bufcnt = hdr->b_l1hdr.b_bufcnt;
update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
} else {
old_state = arc_l2c_only;
refcnt = 0;
bufcnt = 0;
update_old = B_FALSE;
}
update_new = update_old;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT3P(new_state, !=, old_state);
ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
ASSERT(old_state != arc_anon || bufcnt <= 1);
/*
* If this buffer is evictable, transfer it from the
* old state list to the new state list.
*/
if (refcnt == 0) {
if (old_state != arc_anon && old_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
multilist_remove(old_state->arcs_list[buftype], hdr);
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
update_old = B_TRUE;
}
arc_evictable_space_decrement(hdr, old_state);
}
if (new_state != arc_anon && new_state != arc_l2c_only) {
/*
* An L1 header always exists here, since if we're
* moving to some L1-cached state (i.e. not l2c_only or
* anonymous), we realloc the header to add an L1hdr
* beforehand.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
multilist_insert(new_state->arcs_list[buftype], hdr);
if (GHOST_STATE(new_state)) {
ASSERT0(bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
update_new = B_TRUE;
}
arc_evictable_space_increment(hdr, new_state);
}
}
ASSERT(!HDR_EMPTY(hdr));
if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
/* adjust state sizes (ignore arc_l2c_only) */
if (update_new && new_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(new_state)) {
ASSERT0(bufcnt);
/*
* When moving a header to a ghost state, we first
* remove all arc buffers. Thus, we'll have a
* bufcnt of zero, and no arc buffer to use for
* the reference. As a result, we use the arc
* header pointer for the reference.
*/
(void) refcount_add_many(&new_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
} else {
uint32_t buffers = 0;
/*
* Each individual buffer holds a unique reference,
* thus we must remove each of these references one
* at a time.
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
ASSERT3U(bufcnt, !=, 0);
buffers++;
/*
* When the arc_buf_t is sharing the data
* block with the hdr, the owner of the
* reference belongs to the hdr. Only
* add to the refcount if the arc_buf_t is
* not shared.
*/
if (arc_buf_is_shared(buf))
continue;
(void) refcount_add_many(&new_state->arcs_size,
arc_buf_size(buf), buf);
}
ASSERT3U(bufcnt, ==, buffers);
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&new_state->arcs_size,
arc_hdr_size(hdr), hdr);
} else {
ASSERT(GHOST_STATE(old_state));
}
}
}
if (update_old && old_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
/*
* When moving a header off of a ghost state,
* the header will not contain any arc buffers.
* We use the arc header pointer for the reference
* which is exactly what we did when we put the
* header on the ghost state.
*/
(void) refcount_remove_many(&old_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr);
} else {
uint32_t buffers = 0;
/*
* Each individual buffer holds a unique reference,
* thus we must remove each of these references one
* at a time.
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
ASSERT3U(bufcnt, !=, 0);
buffers++;
/*
* When the arc_buf_t is sharing the data
* block with the hdr, the owner of the
* reference belongs to the hdr. Only
* add to the refcount if the arc_buf_t is
* not shared.
*/
if (arc_buf_is_shared(buf))
continue;
(void) refcount_remove_many(
&old_state->arcs_size, arc_buf_size(buf),
buf);
}
ASSERT3U(bufcnt, ==, buffers);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
(void) refcount_remove_many(
&old_state->arcs_size, arc_hdr_size(hdr), hdr);
}
}
if (HDR_HAS_L1HDR(hdr))
hdr->b_l1hdr.b_state = new_state;
/*
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated.
*/
ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
void
arc_space_consume(uint64_t space, arc_space_type_t type)
{
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, space);
break;
case ARC_SPACE_META:
ARCSTAT_INCR(arcstat_metadata_size, space);
break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, space);
break;
case ARC_SPACE_HDRS:
ARCSTAT_INCR(arcstat_hdr_size, space);
break;
case ARC_SPACE_L2HDRS:
ARCSTAT_INCR(arcstat_l2_hdr_size, space);
break;
}
if (type != ARC_SPACE_DATA)
ARCSTAT_INCR(arcstat_meta_used, space);
atomic_add_64(&arc_size, space);
}
void
arc_space_return(uint64_t space, arc_space_type_t type)
{
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, -space);
break;
case ARC_SPACE_META:
ARCSTAT_INCR(arcstat_metadata_size, -space);
break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, -space);
break;
case ARC_SPACE_HDRS:
ARCSTAT_INCR(arcstat_hdr_size, -space);
break;
case ARC_SPACE_L2HDRS:
ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
break;
}
if (type != ARC_SPACE_DATA) {
ASSERT(arc_meta_used >= space);
if (arc_meta_max < arc_meta_used)
arc_meta_max = arc_meta_used;
ARCSTAT_INCR(arcstat_meta_used, -space);
}
ASSERT(arc_size >= space);
atomic_add_64(&arc_size, -space);
}
/*
* Given a hdr and a buf, returns whether that buf can share its b_data buffer
* with the hdr's b_pabd.
*/
static boolean_t
arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
/*
* The criteria for sharing a hdr's data are:
* 1. the hdr's compression matches the buf's compression
* 2. the hdr doesn't need to be byteswapped
* 3. the hdr isn't already being shared
* 4. the buf is either compressed or it is the last buf in the hdr list
*
* Criterion #4 maintains the invariant that shared uncompressed
* bufs must be the final buf in the hdr's b_buf list. Reading this, you
* might ask, "if a compressed buf is allocated first, won't that be the
* last thing in the list?", but in that case it's impossible to create
* a shared uncompressed buf anyway (because the hdr must be compressed
* to have the compressed buf). You might also think that #3 is
* sufficient to make this guarantee, however it's possible
* (specifically in the rare L2ARC write race mentioned in
* arc_buf_alloc_impl()) there will be an existing uncompressed buf that
* is sharable, but wasn't at the time of its allocation. Rather than
* allow a new shared uncompressed buf to be created and then shuffle
* the list around to make it the last element, this simply disallows
* sharing if the new buf isn't the first to be added.
*/
ASSERT3P(buf->b_hdr, ==, hdr);
boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
return (buf_compressed == hdr_compressed &&
hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
!HDR_SHARED_DATA(hdr) &&
(ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
}
/*
* Allocate a buf for this hdr. If you care about the data that's in the hdr,
* or if you want a compressed buffer, pass those flags in. Returns 0 if the
* copy was made successfully, or an error code otherwise.
*/
static int
arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
boolean_t fill, arc_buf_t **ret)
{
arc_buf_t *buf;
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
VERIFY(hdr->b_type == ARC_BUFC_DATA ||
hdr->b_type == ARC_BUFC_METADATA);
ASSERT3P(ret, !=, NULL);
ASSERT3P(*ret, ==, NULL);
buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_next = hdr->b_l1hdr.b_buf;
buf->b_flags = 0;
add_reference(hdr, tag);
/*
* We're about to change the hdr's b_flags. We must either
* hold the hash_lock or be undiscoverable.
*/
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
* Only honor requests for compressed bufs if the hdr is actually
* compressed.
*/
if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
/*
* If the hdr's data can be shared then we share the data buffer and
* set the appropriate bit in the hdr's b_flags to indicate the hdr is
* sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new
* buffer to store the buf's data.
*
* There are two additional restrictions here because we're sharing
* hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
* actively involved in an L2ARC write, because if this buf is used by
* an arc_write() then the hdr's data buffer will be released when the
* write completes, even though the L2ARC write might still be using it.
* Second, the hdr's ABD must be linear so that the buf's user doesn't
* need to be ABD-aware.
*/
boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
abd_is_linear(hdr->b_l1hdr.b_pabd);
/* Set up b_data and sharing */
if (can_share) {
buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
buf->b_flags |= ARC_BUF_FLAG_SHARED;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
buf->b_data =
arc_get_data_buf(hdr, arc_buf_size(buf), buf);
ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
VERIFY3P(buf->b_data, !=, NULL);
hdr->b_l1hdr.b_buf = buf;
hdr->b_l1hdr.b_bufcnt += 1;
/*
* If the user wants the data from the hdr, we need to either copy or
* decompress the data.
*/
if (fill) {
return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
}
return (0);
}
static char *arc_onloan_tag = "onloan";
static inline void
arc_loaned_bytes_update(int64_t delta)
{
atomic_add_64(&arc_loaned_bytes, delta);
/* assert that it did not wrap around */
ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
/*
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
* flight data by arc_tempreserve_space() until they are "returned". Loaned
* buffers must be returned to the arc before they can be used by the DMU or
* freed.
*/
arc_buf_t *
arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
{
arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
arc_loaned_bytes_update(size);
return (buf);
}
arc_buf_t *
arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
enum zio_compress compression_type)
{
arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
psize, lsize, compression_type);
arc_loaned_bytes_update(psize);
return (buf);
}
/*
* Return a loaned arc buffer to the arc.
*/
void
arc_return_buf(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT3P(buf->b_data, !=, NULL);
ASSERT(HDR_HAS_L1HDR(hdr));
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
arc_loaned_bytes_update(-arc_buf_size(buf));
}
/* Detach an arc_buf from a dbuf (tag) */
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT3P(buf->b_data, !=, NULL);
ASSERT(HDR_HAS_L1HDR(hdr));
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
arc_loaned_bytes_update(arc_buf_size(buf));
}
static void
l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
{
l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
df->l2df_abd = abd;
df->l2df_size = size;
df->l2df_type = type;
mutex_enter(&l2arc_free_on_write_mtx);
list_insert_head(l2arc_free_on_write, df);
mutex_exit(&l2arc_free_on_write_mtx);
}
static void
arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
uint64_t size = arc_hdr_size(hdr);
/* protected by hash lock, if in the hash table */
if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT(state != arc_anon && state != arc_l2c_only);
(void) refcount_remove_many(&state->arcs_esize[type],
size, hdr);
}
(void) refcount_remove_many(&state->arcs_size, size, hdr);
if (type == ARC_BUFC_METADATA) {
arc_space_return(size, ARC_SPACE_META);
} else {
ASSERT(type == ARC_BUFC_DATA);
arc_space_return(size, ARC_SPACE_DATA);
}
l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
}
/*
* Share the arc_buf_t's data with the hdr. Whenever we are sharing the
* data buffer, we transfer the refcount ownership to the hdr and update
* the appropriate kstats.
*/
static void
arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT(arc_can_share(hdr, buf));
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
* Start sharing the data buffer. We transfer the
* refcount ownership to the hdr since it always owns
* the refcount whenever an arc_buf_t is shared.
*/
refcount_transfer_ownership(&state->arcs_size, buf, hdr);
hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
HDR_ISTYPE_METADATA(hdr));
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
buf->b_flags |= ARC_BUF_FLAG_SHARED;
/*
* Since we've transferred ownership to the hdr we need
* to increment its compressed and uncompressed kstats and
* decrement the overhead size.
*/
ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
}
static void
arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT(arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
* We are no longer sharing this buffer so we need
* to transfer its ownership to the rightful owner.
*/
refcount_transfer_ownership(&state->arcs_size, hdr, buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
abd_put(hdr->b_l1hdr.b_pabd);
hdr->b_l1hdr.b_pabd = NULL;
buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
/*
* Since the buffer is no longer shared between
* the arc buf and the hdr, count it as overhead.
*/
ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
/*
* Remove an arc_buf_t from the hdr's buf list and return the last
* arc_buf_t on the list. If no buffers remain on the list then return
* NULL.
*/
static arc_buf_t *
arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
arc_buf_t *lastbuf = NULL;
/*
* Remove the buf from the hdr list and locate the last
* remaining buffer on the list.
*/
while (*bufp != NULL) {
if (*bufp == buf)
*bufp = buf->b_next;
/*
* If we've removed a buffer in the middle of
* the list then update the lastbuf and update
* bufp.
*/
if (*bufp != NULL) {
lastbuf = *bufp;
bufp = &(*bufp)->b_next;
}
}
buf->b_next = NULL;
ASSERT3P(lastbuf, !=, buf);
IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
return (lastbuf);
}
/*
* Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's
* list and free it.
*/
static void
arc_buf_destroy_impl(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
/*
* Free up the data associated with the buf but only if we're not
* sharing this with the hdr. If we are sharing it with the hdr, the
* hdr is responsible for doing the free.
*/
if (buf->b_data != NULL) {
/*
* We're about to change the hdr's b_flags. We must either
* hold the hash_lock or be undiscoverable.
*/
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
arc_cksum_verify(buf);
#ifdef illumos
arc_buf_unwatch(buf);
#endif
if (arc_buf_is_shared(buf)) {
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
uint64_t size = arc_buf_size(buf);
arc_free_data_buf(hdr, buf->b_data, size, buf);
ARCSTAT_INCR(arcstat_overhead_size, -size);
}
buf->b_data = NULL;
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
hdr->b_l1hdr.b_bufcnt -= 1;
}
arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
/*
* If the current arc_buf_t is sharing its data buffer with the
* hdr, then reassign the hdr's b_pabd to share it with the new
* buffer at the end of the list. The shared buffer is always
* the last one on the hdr's buffer list.
*
* There is an equivalent case for compressed bufs, but since
* they aren't guaranteed to be the last buf in the list and
* that is an exceedingly rare case, we just allow that space be
* wasted temporarily.
*/
if (lastbuf != NULL) {
/* Only one buf can be shared at once */
VERIFY(!arc_buf_is_shared(lastbuf));
/* hdr is uncompressed so can't have compressed buf */
VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
arc_hdr_free_pabd(hdr);
/*
* We must setup a new shared block between the
* last buffer and the hdr. The data would have
* been allocated by the arc buf so we need to transfer
* ownership to the hdr since it's now being shared.
*/
arc_share_buf(hdr, lastbuf);
}
} else if (HDR_SHARED_DATA(hdr)) {
/*
* Uncompressed shared buffers are always at the end
* of the list. Compressed buffers don't have the
* same requirements. This makes it hard to
* simply assert that the lastbuf is shared so
* we rely on the hdr's compression flags to determine
* if we have a compressed, shared buffer.
*/
ASSERT3P(lastbuf, !=, NULL);
ASSERT(arc_buf_is_shared(lastbuf) ||
HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
}
/*
* Free the checksum if we're removing the last uncompressed buf from
* this hdr.
*/
if (!arc_hdr_has_uncompressed_buf(hdr)) {
arc_cksum_free(hdr);
}
/* clean up the buf */
buf->b_hdr = NULL;
kmem_cache_free(buf_cache, buf);
}
static void
arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
{
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
}
static void
arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
{
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
/*
* If the hdr is currently being written to the l2arc then
* we defer freeing the data by adding it to the l2arc_free_on_write
* list. The l2arc will free the data once it's finished
* writing it to the l2arc device.
*/
if (HDR_L2_WRITING(hdr)) {
arc_hdr_free_on_write(hdr);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else {
arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr);
}
hdr->b_l1hdr.b_pabd = NULL;
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
}
static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
enum zio_compress compression_type, arc_buf_contents_t type)
{
arc_buf_hdr_t *hdr;
VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
ASSERT(HDR_EMPTY(hdr));
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
HDR_SET_PSIZE(hdr, psize);
HDR_SET_LSIZE(hdr, lsize);
hdr->b_spa = spa;
hdr->b_type = type;
hdr->b_flags = 0;
arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
arc_hdr_set_compress(hdr, compression_type);
hdr->b_l1hdr.b_state = arc_anon;
hdr->b_l1hdr.b_arc_access = 0;
hdr->b_l1hdr.b_bufcnt = 0;
hdr->b_l1hdr.b_buf = NULL;
/*
* Allocate the hdr's buffer. This will contain either
* the compressed or uncompressed data depending on the block
* it references and compressed arc enablement.
*/
arc_hdr_alloc_pabd(hdr);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
return (hdr);
}
/*
* Transition between the two allocation states for the arc_buf_hdr struct.
* The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
* (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
* version is used when a cache buffer is only in the L2ARC in order to reduce
* memory usage.
*/
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
ASSERT(HDR_HAS_L2HDR(hdr));
arc_buf_hdr_t *nhdr;
l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
(old == hdr_l2only_cache && new == hdr_full_cache));
nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
buf_hash_remove(hdr);
bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
if (new == hdr_full_cache) {
arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
/*
* arc_access and arc_change_state need to be aware that a
* header has just come out of L2ARC, so we set its state to
* l2c_only even though it's about to change.
*/
nhdr->b_l1hdr.b_state = arc_l2c_only;
/* Verify previous threads set to NULL before freeing */
ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
} else {
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
/*
* If we've reached here, We must have been called from
* arc_evict_hdr(), as such we should have already been
* removed from any ghost list we were previously on
* (which protects us from racing with arc_evict_state),
* thus no locking is needed during this check.
*/
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
/*
* A buffer must not be moved into the arc_l2c_only
* state if it's not finished being written out to the
* l2arc device. Otherwise, the b_l1hdr.b_pabd field
* might try to be accessed, even though it was removed.
*/
VERIFY(!HDR_L2_WRITING(hdr));
VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
#ifdef ZFS_DEBUG
if (hdr->b_l1hdr.b_thawed != NULL) {
kmem_free(hdr->b_l1hdr.b_thawed, 1);
hdr->b_l1hdr.b_thawed = NULL;
}
#endif
arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
}
/*
* The header has been reallocated so we need to re-insert it into any
* lists it was on.
*/
(void) buf_hash_insert(nhdr, NULL);
ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
mutex_enter(&dev->l2ad_mtx);
/*
* We must place the realloc'ed header back into the list at
* the same spot. Otherwise, if it's placed earlier in the list,
* l2arc_write_buffers() could find it during the function's
* write phase, and try to write it out to the l2arc.
*/
list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
list_remove(&dev->l2ad_buflist, hdr);
mutex_exit(&dev->l2ad_mtx);
/*
* Since we're using the pointer address as the tag when
* incrementing and decrementing the l2ad_alloc refcount, we
* must remove the old pointer (that we're about to destroy) and
* add the new pointer to the refcount. Otherwise we'd remove
* the wrong pointer address when calling arc_hdr_destroy() later.
*/
(void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
(void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);
buf_discard_identity(hdr);
kmem_cache_free(old, hdr);
return (nhdr);
}
/*
* Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
* The buf is returned thawed since we expect the consumer to modify it.
*/
arc_buf_t *
arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
{
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
ZIO_COMPRESS_OFF, type);
ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
arc_buf_t *buf = NULL;
VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
arc_buf_thaw(buf);
return (buf);
}
/*
* Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
* for bufs containing metadata.
*/
arc_buf_t *
arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
enum zio_compress compression_type)
{
ASSERT3U(lsize, >, 0);
ASSERT3U(lsize, >=, psize);
ASSERT(compression_type > ZIO_COMPRESS_OFF);
ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
compression_type, ARC_BUFC_DATA);
ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
arc_buf_t *buf = NULL;
VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
arc_buf_thaw(buf);
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
if (!arc_buf_is_shared(buf)) {
/*
* To ensure that the hdr has the correct data in it if we call
* arc_decompress() on this buf before it's been written to
* disk, it's easiest if we just set up sharing between the
* buf and the hdr.
*/
ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
arc_hdr_free_pabd(hdr);
arc_share_buf(hdr, buf);
}
return (buf);
}
static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
l2arc_dev_t *dev = l2hdr->b_dev;
uint64_t psize = arc_hdr_size(hdr);
ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
ASSERT(HDR_HAS_L2HDR(hdr));
list_remove(&dev->l2ad_buflist, hdr);
ARCSTAT_INCR(arcstat_l2_psize, -psize);
ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);
(void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
}
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
if (HDR_HAS_L1HDR(hdr)) {
ASSERT(hdr->b_l1hdr.b_buf == NULL ||
hdr->b_l1hdr.b_bufcnt > 0);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
}
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(!HDR_IN_HASH_TABLE(hdr));
if (!HDR_EMPTY(hdr))
buf_discard_identity(hdr);
if (HDR_HAS_L2HDR(hdr)) {
l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
if (!buflist_held)
mutex_enter(&dev->l2ad_mtx);
/*
* Even though we checked this conditional above, we
* need to check this again now that we have the
* l2ad_mtx. This is because we could be racing with
* another thread calling l2arc_evict() which might have
* destroyed this header's L2 portion as we were waiting
* to acquire the l2ad_mtx. If that happens, we don't
* want to re-destroy the header's L2 portion.
*/
if (HDR_HAS_L2HDR(hdr)) {
l2arc_trim(hdr);
arc_hdr_l2hdr_destroy(hdr);
}
if (!buflist_held)
mutex_exit(&dev->l2ad_mtx);
}
if (HDR_HAS_L1HDR(hdr)) {
arc_cksum_free(hdr);
while (hdr->b_l1hdr.b_buf != NULL)
arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
#ifdef ZFS_DEBUG
if (hdr->b_l1hdr.b_thawed != NULL) {
kmem_free(hdr->b_l1hdr.b_thawed, 1);
hdr->b_l1hdr.b_thawed = NULL;
}
#endif
if (hdr->b_l1hdr.b_pabd != NULL) {
arc_hdr_free_pabd(hdr);
}
}
ASSERT3P(hdr->b_hash_next, ==, NULL);
if (HDR_HAS_L1HDR(hdr)) {
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
kmem_cache_free(hdr_full_cache, hdr);
} else {
kmem_cache_free(hdr_l2only_cache, hdr);
}
}
void
arc_buf_destroy(arc_buf_t *buf, void* tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
kmutex_t *hash_lock = HDR_LOCK(hdr);
if (hdr->b_l1hdr.b_state == arc_anon) {
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
VERIFY0(remove_reference(hdr, NULL, tag));
arc_hdr_destroy(hdr);
return;
}
mutex_enter(hash_lock);
ASSERT3P(hdr, ==, buf->b_hdr);
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
ASSERT3P(buf->b_data, !=, NULL);
(void) remove_reference(hdr, hash_lock, tag);
arc_buf_destroy_impl(buf);
mutex_exit(hash_lock);
}
/*
* Evict the arc_buf_hdr that is provided as a parameter. The resultant
* state of the header is dependent on it's state prior to entering this
* function. The following transitions are possible:
*
* - arc_mru -> arc_mru_ghost
* - arc_mfu -> arc_mfu_ghost
* - arc_mru_ghost -> arc_l2c_only
* - arc_mru_ghost -> deleted
* - arc_mfu_ghost -> arc_l2c_only
* - arc_mfu_ghost -> deleted
*/
static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(HDR_HAS_L1HDR(hdr));
state = hdr->b_l1hdr.b_state;
if (GHOST_STATE(state)) {
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
/*
* l2arc_write_buffers() relies on a header's L1 portion
* (i.e. its b_pabd field) during it's write phase.
* Thus, we cannot push a header onto the arc_l2c_only
* state (removing it's L1 piece) until the header is
* done being written to the l2arc.
*/
if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
ARCSTAT_BUMP(arcstat_evict_l2_skip);
return (bytes_evicted);
}
ARCSTAT_BUMP(arcstat_deleted);
bytes_evicted += HDR_GET_LSIZE(hdr);
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
if (HDR_HAS_L2HDR(hdr)) {
/*
* This buffer is cached on the 2nd Level ARC;
* don't destroy the header.
*/
arc_change_state(arc_l2c_only, hdr, hash_lock);
/*
* dropping from L1+L2 cached to L2-only,
* realloc to remove the L1 header.
*/
hdr = arc_hdr_realloc(hdr, hdr_full_cache,
hdr_l2only_cache);
} else {
arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr);
}
return (bytes_evicted);
}
ASSERT(state == arc_mru || state == arc_mfu);
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(hdr) ||
((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
arc_min_prefetch_lifespan)) {
ARCSTAT_BUMP(arcstat_evict_skip);
return (bytes_evicted);
}
ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
while (hdr->b_l1hdr.b_buf) {
arc_buf_t *buf = hdr->b_l1hdr.b_buf;
if (!mutex_tryenter(&buf->b_evict_lock)) {
ARCSTAT_BUMP(arcstat_mutex_miss);
break;
}
if (buf->b_data != NULL)
bytes_evicted += HDR_GET_LSIZE(hdr);
mutex_exit(&buf->b_evict_lock);
arc_buf_destroy_impl(buf);
}
if (HDR_HAS_L2HDR(hdr)) {
ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
} else {
if (l2arc_write_eligible(hdr->b_spa, hdr)) {
ARCSTAT_INCR(arcstat_evict_l2_eligible,
HDR_GET_LSIZE(hdr));
} else {
ARCSTAT_INCR(arcstat_evict_l2_ineligible,
HDR_GET_LSIZE(hdr));
}
}
if (hdr->b_l1hdr.b_bufcnt == 0) {
arc_cksum_free(hdr);
bytes_evicted += arc_hdr_size(hdr);
/*
* If this hdr is being evicted and has a compressed
* buffer then we discard it here before we change states.
* This ensures that the accounting is updated correctly
* in arc_free_data_impl().
*/
arc_hdr_free_pabd(hdr);
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
}
return (bytes_evicted);
}
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
uint64_t spa, int64_t bytes)
{
multilist_sublist_t *mls;
uint64_t bytes_evicted = 0;
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
int evict_count = 0;
ASSERT3P(marker, !=, NULL);
IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
mls = multilist_sublist_lock(ml, idx);
for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
hdr = multilist_sublist_prev(mls, marker)) {
if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
(evict_count >= zfs_arc_evict_batch_limit))
break;
/*
* To keep our iteration location, move the marker
* forward. Since we're not holding hdr's hash lock, we
* must be very careful and not remove 'hdr' from the
* sublist. Otherwise, other consumers might mistake the
* 'hdr' as not being on a sublist when they call the
* multilist_link_active() function (they all rely on
* the hash lock protecting concurrent insertions and
* removals). multilist_sublist_move_forward() was
* specifically implemented to ensure this is the case
* (only 'marker' will be removed and re-inserted).
*/
multilist_sublist_move_forward(mls, marker);
/*
* The only case where the b_spa field should ever be
* zero, is the marker headers inserted by
* arc_evict_state(). It's possible for multiple threads
* to be calling arc_evict_state() concurrently (e.g.
* dsl_pool_close() and zio_inject_fault()), so we must
* skip any markers we see from these other threads.
*/
if (hdr->b_spa == 0)
continue;
/* we're only interested in evicting buffers of a certain spa */
if (spa != 0 && hdr->b_spa != spa) {
ARCSTAT_BUMP(arcstat_evict_skip);
continue;
}
hash_lock = HDR_LOCK(hdr);
/*
* We aren't calling this function from any code path
* that would already be holding a hash lock, so we're
* asserting on this assumption to be defensive in case
* this ever changes. Without this check, it would be
* possible to incorrectly increment arcstat_mutex_miss
* below (e.g. if the code changed such that we called
* this function with a hash lock held).
*/
ASSERT(!MUTEX_HELD(hash_lock));
if (mutex_tryenter(hash_lock)) {
uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
mutex_exit(hash_lock);
bytes_evicted += evicted;
/*
* If evicted is zero, arc_evict_hdr() must have
* decided to skip this header, don't increment
* evict_count in this case.
*/
if (evicted != 0)
evict_count++;
/*
* If arc_size isn't overflowing, signal any
* threads that might happen to be waiting.
*
* For each header evicted, we wake up a single
* thread. If we used cv_broadcast, we could
* wake up "too many" threads causing arc_size
* to significantly overflow arc_c; since
* arc_get_data_impl() doesn't check for overflow
* when it's woken up (it doesn't because it's
* possible for the ARC to be overflowing while
* full of un-evictable buffers, and the
* function should proceed in this case).
*
* If threads are left sleeping, due to not
* using cv_broadcast, they will be woken up
* just before arc_reclaim_thread() sleeps.
*/
mutex_enter(&arc_reclaim_lock);
if (!arc_is_overflowing())
cv_signal(&arc_reclaim_waiters_cv);
mutex_exit(&arc_reclaim_lock);
} else {
ARCSTAT_BUMP(arcstat_mutex_miss);
}
}
multilist_sublist_unlock(mls);
return (bytes_evicted);
}
/*
* Evict buffers from the given arc state, until we've removed the
* specified number of bytes. Move the removed buffers to the
* appropriate evict state.
*
* This function makes a "best effort". It skips over any buffers
* it can't get a hash_lock on, and so, may not catch all candidates.
* It may also return without evicting as much space as requested.
*
* If bytes is specified using the special value ARC_EVICT_ALL, this
* will evict all available (i.e. unlocked and evictable) buffers from
* the given arc state; which is used by arc_flush().
*/
static uint64_t
arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
arc_buf_contents_t type)
{
uint64_t total_evicted = 0;
multilist_t *ml = state->arcs_list[type];
int num_sublists;
arc_buf_hdr_t **markers;
IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
num_sublists = multilist_get_num_sublists(ml);
/*
* If we've tried to evict from each sublist, made some
* progress, but still have not hit the target number of bytes
* to evict, we want to keep trying. The markers allow us to
* pick up where we left off for each individual sublist, rather
* than starting from the tail each time.
*/
markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
for (int i = 0; i < num_sublists; i++) {
markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
/*
* A b_spa of 0 is used to indicate that this header is
* a marker. This fact is used in arc_adjust_type() and
* arc_evict_state_impl().
*/
markers[i]->b_spa = 0;
multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
multilist_sublist_insert_tail(mls, markers[i]);
multilist_sublist_unlock(mls);
}
/*
* While we haven't hit our target number of bytes to evict, or
* we're evicting all available buffers.
*/
while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
/*
* Start eviction using a randomly selected sublist,
* this is to try and evenly balance eviction across all
* sublists. Always starting at the same sublist
* (e.g. index 0) would cause evictions to favor certain
* sublists over others.
*/
int sublist_idx = multilist_get_random_index(ml);
uint64_t scan_evicted = 0;
for (int i = 0; i < num_sublists; i++) {
uint64_t bytes_remaining;
uint64_t bytes_evicted;
if (bytes == ARC_EVICT_ALL)
bytes_remaining = ARC_EVICT_ALL;
else if (total_evicted < bytes)
bytes_remaining = bytes - total_evicted;
else
break;
bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
markers[sublist_idx], spa, bytes_remaining);
scan_evicted += bytes_evicted;
total_evicted += bytes_evicted;
/* we've reached the end, wrap to the beginning */
if (++sublist_idx >= num_sublists)
sublist_idx = 0;
}
/*
* If we didn't evict anything during this scan, we have
* no reason to believe we'll evict more during another
* scan, so break the loop.
*/
if (scan_evicted == 0) {
/* This isn't possible, let's make that obvious */
ASSERT3S(bytes, !=, 0);
/*
* When bytes is ARC_EVICT_ALL, the only way to
* break the loop is when scan_evicted is zero.
* In that case, we actually have evicted enough,
* so we don't want to increment the kstat.
*/
if (bytes != ARC_EVICT_ALL) {
ASSERT3S(total_evicted, <, bytes);
ARCSTAT_BUMP(arcstat_evict_not_enough);
}
break;
}
}
for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
multilist_sublist_remove(mls, markers[i]);
multilist_sublist_unlock(mls);
kmem_cache_free(hdr_full_cache, markers[i]);
}
kmem_free(markers, sizeof (*markers) * num_sublists);
return (total_evicted);
}
/*
* Flush all "evictable" data of the given type from the arc state
* specified. This will not evict any "active" buffers (i.e. referenced).
*
* When 'retry' is set to B_FALSE, the function will make a single pass
* over the state and evict any buffers that it can. Since it doesn't
* continually retry the eviction, it might end up leaving some buffers
* in the ARC due to lock misses.
*
* When 'retry' is set to B_TRUE, the function will continually retry the
* eviction until *all* evictable buffers have been removed from the
* state. As a result, if concurrent insertions into the state are
* allowed (e.g. if the ARC isn't shutting down), this function might
* wind up in an infinite loop, continually trying to evict buffers.
*/
static uint64_t
arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
boolean_t retry)
{
uint64_t evicted = 0;
while (refcount_count(&state->arcs_esize[type]) != 0) {
evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
if (!retry)
break;
}
return (evicted);
}
/*
* Evict the specified number of bytes from the state specified,
* restricting eviction to the spa and type given. This function
* prevents us from trying to evict more from a state's list than
* is "evictable", and to skip evicting altogether when passed a
* negative value for "bytes". In contrast, arc_evict_state() will
* evict everything it can, when passed a negative value for "bytes".
*/
static uint64_t
arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
arc_buf_contents_t type)
{
int64_t delta;
if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
return (arc_evict_state(state, spa, delta, type));
}
return (0);
}
/*
* Evict metadata buffers from the cache, such that arc_meta_used is
* capped by the arc_meta_limit tunable.
*/
static uint64_t
arc_adjust_meta(void)
{
uint64_t total_evicted = 0;
int64_t target;
/*
* If we're over the meta limit, we want to evict enough
* metadata to get back under the meta limit. We don't want to
* evict so much that we drop the MRU below arc_p, though. If
* we're over the meta limit more than we're over arc_p, we
* evict some from the MRU here, and some from the MFU below.
*/
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
(int64_t)(refcount_count(&arc_anon->arcs_size) +
refcount_count(&arc_mru->arcs_size) - arc_p));
total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
/*
* Similar to the above, we want to evict enough bytes to get us
* below the meta limit, but not so much as to drop us below the
* space allotted to the MFU (which is defined as arc_c - arc_p).
*/
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
(int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
return (total_evicted);
}
/*
* Return the type of the oldest buffer in the given arc state
*
* This function will select a random sublist of type ARC_BUFC_DATA and
* a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
* is compared, and the type which contains the "older" buffer will be
* returned.
*/
static arc_buf_contents_t
arc_adjust_type(arc_state_t *state)
{
multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
int data_idx = multilist_get_random_index(data_ml);
int meta_idx = multilist_get_random_index(meta_ml);
multilist_sublist_t *data_mls;
multilist_sublist_t *meta_mls;
arc_buf_contents_t type;
arc_buf_hdr_t *data_hdr;
arc_buf_hdr_t *meta_hdr;
/*
* We keep the sublist lock until we're finished, to prevent
* the headers from being destroyed via arc_evict_state().
*/
data_mls = multilist_sublist_lock(data_ml, data_idx);
meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
/*
* These two loops are to ensure we skip any markers that
* might be at the tail of the lists due to arc_evict_state().
*/
for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
if (data_hdr->b_spa != 0)
break;
}
for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
if (meta_hdr->b_spa != 0)
break;
}
if (data_hdr == NULL && meta_hdr == NULL) {
type = ARC_BUFC_DATA;
} else if (data_hdr == NULL) {
ASSERT3P(meta_hdr, !=, NULL);
type = ARC_BUFC_METADATA;
} else if (meta_hdr == NULL) {
ASSERT3P(data_hdr, !=, NULL);
type = ARC_BUFC_DATA;
} else {
ASSERT3P(data_hdr, !=, NULL);
ASSERT3P(meta_hdr, !=, NULL);
/* The headers can't be on the sublist without an L1 header */
ASSERT(HDR_HAS_L1HDR(data_hdr));
ASSERT(HDR_HAS_L1HDR(meta_hdr));
if (data_hdr->b_l1hdr.b_arc_access <
meta_hdr->b_l1hdr.b_arc_access) {
type = ARC_BUFC_DATA;
} else {
type = ARC_BUFC_METADATA;
}
}
multilist_sublist_unlock(meta_mls);
multilist_sublist_unlock(data_mls);
return (type);
}
/*
* Evict buffers from the cache, such that arc_size is capped by arc_c.
*/
static uint64_t
arc_adjust(void)
{
uint64_t total_evicted = 0;
uint64_t bytes;
int64_t target;
/*
* If we're over arc_meta_limit, we want to correct that before
* potentially evicting data buffers below.
*/
total_evicted += arc_adjust_meta();
/*
* Adjust MRU size
*
* If we're over the target cache size, we want to evict enough
* from the list to get back to our target size. We don't want
* to evict too much from the MRU, such that it drops below
* arc_p. So, if we're over our target cache size more than
* the MRU is over arc_p, we'll evict enough to get back to
* arc_p here, and then evict more from the MFU below.
*/
target = MIN((int64_t)(arc_size - arc_c),
(int64_t)(refcount_count(&arc_anon->arcs_size) +
refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
/*
* If we're below arc_meta_min, always prefer to evict data.
* Otherwise, try to satisfy the requested number of bytes to
* evict from the type which contains older buffers; in an
* effort to keep newer buffers in the cache regardless of their
* type. If we cannot satisfy the number of bytes from this
* type, spill over into the next type.
*/
if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
arc_meta_used > arc_meta_min) {
bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
total_evicted += bytes;
/*
* If we couldn't evict our target number of bytes from
* metadata, we try to get the rest from data.
*/
target -= bytes;
total_evicted +=
arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
} else {
bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
total_evicted += bytes;
/*
* If we couldn't evict our target number of bytes from
* data, we try to get the rest from metadata.
*/
target -= bytes;
total_evicted +=
arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
}
/*
* Adjust MFU size
*
* Now that we've tried to evict enough from the MRU to get its
* size back to arc_p, if we're still above the target cache
* size, we evict the rest from the MFU.
*/
target = arc_size - arc_c;
if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
arc_meta_used > arc_meta_min) {
bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
total_evicted += bytes;
/*
* If we couldn't evict our target number of bytes from
* metadata, we try to get the rest from data.
*/
target -= bytes;
total_evicted +=
arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
} else {
bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
total_evicted += bytes;
/*
* If we couldn't evict our target number of bytes from
* data, we try to get the rest from data.
*/
target -= bytes;
total_evicted +=
arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
}
/*
* Adjust ghost lists
*
* In addition to the above, the ARC also defines target values
* for the ghost lists. The sum of the mru list and mru ghost
* list should never exceed the target size of the cache, and
* the sum of the mru list, mfu list, mru ghost list, and mfu
* ghost list should never exceed twice the target size of the
* cache. The following logic enforces these limits on the ghost
* caches, and evicts from them as needed.
*/
target = refcount_count(&arc_mru->arcs_size) +
refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
total_evicted += bytes;
target -= bytes;
total_evicted +=
arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
/*
* We assume the sum of the mru list and mfu list is less than
* or equal to arc_c (we enforced this above), which means we
* can use the simpler of the two equations below:
*
* mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
* mru ghost + mfu ghost <= arc_c
*/
target = refcount_count(&arc_mru_ghost->arcs_size) +
refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
total_evicted += bytes;
target -= bytes;
total_evicted +=
arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
return (total_evicted);
}
void
arc_flush(spa_t *spa, boolean_t retry)
{
uint64_t guid = 0;
/*
* If retry is B_TRUE, a spa must not be specified since we have
* no good way to determine if all of a spa's buffers have been
* evicted from an arc state.
*/
ASSERT(!retry || spa == 0);
if (spa != NULL)
guid = spa_load_guid(spa);
(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
}
void
arc_shrink(int64_t to_free)
{
if (arc_c > arc_c_min) {
DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
arc_c_min, uint64_t, arc_p, uint64_t, to_free);
if (arc_c > arc_c_min + to_free)
atomic_add_64(&arc_c, -to_free);
else
arc_c = arc_c_min;
atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
if (arc_c > arc_size)
arc_c = MAX(arc_size, arc_c_min);
if (arc_p > arc_c)
arc_p = (arc_c >> 1);
DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
arc_p);
ASSERT(arc_c >= arc_c_min);
ASSERT((int64_t)arc_p >= 0);
}
if (arc_size > arc_c) {
DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
uint64_t, arc_c);
(void) arc_adjust();
}
}
typedef enum free_memory_reason_t {
FMR_UNKNOWN,
FMR_NEEDFREE,
FMR_LOTSFREE,
FMR_SWAPFS_MINFREE,
FMR_PAGES_PP_MAXIMUM,
FMR_HEAP_ARENA,
FMR_ZIO_ARENA,
FMR_ZIO_FRAG,
} free_memory_reason_t;
int64_t last_free_memory;
free_memory_reason_t last_free_reason;
/*
* Additional reserve of pages for pp_reserve.
*/
int64_t arc_pages_pp_reserve = 64;
/*
* Additional reserve of pages for swapfs.
*/
int64_t arc_swapfs_reserve = 64;
/*
* Return the amount of memory that can be consumed before reclaim will be
* needed. Positive if there is sufficient free memory, negative indicates
* the amount of memory that needs to be freed up.
*/
static int64_t
arc_available_memory(void)
{
int64_t lowest = INT64_MAX;
int64_t n;
free_memory_reason_t r = FMR_UNKNOWN;
#ifdef _KERNEL
#ifdef __FreeBSD__
/*
* Cooperate with pagedaemon when it's time for it to scan
* and reclaim some pages.
*/
n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
if (n < lowest) {
lowest = n;
r = FMR_LOTSFREE;
}
#else
if (needfree > 0) {
n = PAGESIZE * (-needfree);
if (n < lowest) {
lowest = n;
r = FMR_NEEDFREE;
}
}
/*
* check that we're out of range of the pageout scanner. It starts to
* schedule paging if freemem is less than lotsfree and needfree.
* lotsfree is the high-water mark for pageout, and needfree is the
* number of needed free pages. We add extra pages here to make sure
* the scanner doesn't start up while we're freeing memory.
*/
n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
if (n < lowest) {
lowest = n;
r = FMR_LOTSFREE;
}
/*
* check to make sure that swapfs has enough space so that anon
* reservations can still succeed. anon_resvmem() checks that the
* availrmem is greater than swapfs_minfree, and the number of reserved
* swap pages. We also add a bit of extra here just to prevent
* circumstances from getting really dire.
*/
n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
desfree - arc_swapfs_reserve);
if (n < lowest) {
lowest = n;
r = FMR_SWAPFS_MINFREE;
}
/*
* Check that we have enough availrmem that memory locking (e.g., via
* mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
* stores the number of pages that cannot be locked; when availrmem
* drops below pages_pp_maximum, page locking mechanisms such as
* page_pp_lock() will fail.)
*/
n = PAGESIZE * (availrmem - pages_pp_maximum -
arc_pages_pp_reserve);
if (n < lowest) {
lowest = n;
r = FMR_PAGES_PP_MAXIMUM;
}
#endif /* __FreeBSD__ */
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
/*
* If we're on an i386 platform, it's possible that we'll exhaust the
* kernel heap space before we ever run out of available physical
* memory. Most checks of the size of the heap_area compare against
* tune.t_minarmem, which is the minimum available real memory that we
* can have in the system. However, this is generally fixed at 25 pages
* which is so low that it's useless. In this comparison, we seek to
* calculate the total heap-size, and reclaim if more than 3/4ths of the
* heap is allocated. (Or, in the calculation, if less than 1/4th is
* free)
*/
n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
if (n < lowest) {
lowest = n;
r = FMR_HEAP_ARENA;
}
#define zio_arena NULL
#else
#define zio_arena heap_arena
#endif
/*
* If zio data pages are being allocated out of a separate heap segment,
* then enforce that the size of available vmem for this arena remains
* above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
*
* Note that reducing the arc_zio_arena_free_shift keeps more virtual
* memory (in the zio_arena) free, which can avoid memory
* fragmentation issues.
*/
if (zio_arena != NULL) {
n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
(vmem_size(zio_arena, VMEM_ALLOC) >>
arc_zio_arena_free_shift);
if (n < lowest) {
lowest = n;
r = FMR_ZIO_ARENA;
}
}
/*
* Above limits know nothing about real level of KVA fragmentation.
* Start aggressive reclamation if too little sequential KVA left.
*/
if (lowest > 0) {
n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ?
-((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) :
INT64_MAX;
if (n < lowest) {
lowest = n;
r = FMR_ZIO_FRAG;
}
}
#else /* _KERNEL */
/* Every 100 calls, free a small amount */
if (spa_get_random(100) == 0)
lowest = -1024;
#endif /* _KERNEL */
last_free_memory = lowest;
last_free_reason = r;
DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
return (lowest);
}
/*
* Determine if the system is under memory pressure and is asking
* to reclaim memory. A return value of B_TRUE indicates that the system
* is under memory pressure and that the arc should adjust accordingly.
*/
static boolean_t
arc_reclaim_needed(void)
{
return (arc_available_memory() < 0);
}
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
extern kmem_cache_t *range_seg_cache;
extern kmem_cache_t *abd_chunk_cache;
static __noinline void
arc_kmem_reap_now(void)
{
size_t i;
kmem_cache_t *prev_cache = NULL;
kmem_cache_t *prev_data_cache = NULL;
DTRACE_PROBE(arc__kmem_reap_start);
#ifdef _KERNEL
if (arc_meta_used >= arc_meta_limit) {
/*
* We are exceeding our meta-data cache limit.
* Purge some DNLC entries to release holds on meta-data.
*/
dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
}
#if defined(__i386)
/*
* Reclaim unused memory from all kmem caches.
*/
kmem_reap();
#endif
#endif
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
if (zio_buf_cache[i] != prev_cache) {
prev_cache = zio_buf_cache[i];
kmem_cache_reap_now(zio_buf_cache[i]);
}
if (zio_data_buf_cache[i] != prev_data_cache) {
prev_data_cache = zio_data_buf_cache[i];
kmem_cache_reap_now(zio_data_buf_cache[i]);
}
}
kmem_cache_reap_now(abd_chunk_cache);
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_full_cache);
kmem_cache_reap_now(hdr_l2only_cache);
kmem_cache_reap_now(range_seg_cache);
#ifdef illumos
if (zio_arena != NULL) {
/*
* Ask the vmem arena to reclaim unused memory from its
* quantum caches.
*/
vmem_qcache_reap(zio_arena);
}
#endif
DTRACE_PROBE(arc__kmem_reap_end);
}
/*
* Threads can block in arc_get_data_impl() waiting for this thread to evict
* enough data and signal them to proceed. When this happens, the threads in
* arc_get_data_impl() are sleeping while holding the hash lock for their
* particular arc header. Thus, we must be careful to never sleep on a
* hash lock in this thread. This is to prevent the following deadlock:
*
* - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
* waiting for the reclaim thread to signal it.
*
* - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
* fails, and goes to sleep forever.
*
* This possible deadlock is avoided by always acquiring a hash lock
* using mutex_tryenter() from arc_reclaim_thread().
*/
/* ARGSUSED */
static void
arc_reclaim_thread(void *unused __unused)
{
hrtime_t growtime = 0;
callb_cpr_t cpr;
CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
mutex_enter(&arc_reclaim_lock);
while (!arc_reclaim_thread_exit) {
uint64_t evicted = 0;
/*
* This is necessary in order for the mdb ::arc dcmd to
* show up to date information. Since the ::arc command
* does not call the kstat's update function, without
* this call, the command may show stale stats for the
* anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
* with this change, the data might be up to 1 second
* out of date; but that should suffice. The arc_state_t
* structures can be queried directly if more accurate
* information is needed.
*/
if (arc_ksp != NULL)
arc_ksp->ks_update(arc_ksp, KSTAT_READ);
mutex_exit(&arc_reclaim_lock);
/*
* We call arc_adjust() before (possibly) calling
* arc_kmem_reap_now(), so that we can wake up
* arc_get_data_impl() sooner.
*/
evicted = arc_adjust();
int64_t free_memory = arc_available_memory();
if (free_memory < 0) {
arc_no_grow = B_TRUE;
arc_warm = B_TRUE;
/*
* Wait at least zfs_grow_retry (default 60) seconds
* before considering growing.
*/
growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
arc_kmem_reap_now();
/*
* If we are still low on memory, shrink the ARC
* so that we have arc_shrink_min free space.
*/
free_memory = arc_available_memory();
int64_t to_free =
(arc_c >> arc_shrink_shift) - free_memory;
if (to_free > 0) {
#ifdef _KERNEL
#ifdef illumos
to_free = MAX(to_free, ptob(needfree));
#endif
#endif
arc_shrink(to_free);
}
} else if (free_memory < arc_c >> arc_no_grow_shift) {
arc_no_grow = B_TRUE;
} else if (gethrtime() >= growtime) {
arc_no_grow = B_FALSE;
}
mutex_enter(&arc_reclaim_lock);
/*
* If evicted is zero, we couldn't evict anything via
* arc_adjust(). This could be due to hash lock
* collisions, but more likely due to the majority of
* arc buffers being unevictable. Therefore, even if
* arc_size is above arc_c, another pass is unlikely to
* be helpful and could potentially cause us to enter an
* infinite loop.
*/
if (arc_size <= arc_c || evicted == 0) {
/*
* We're either no longer overflowing, or we
* can't evict anything more, so we should wake
* up any threads before we go to sleep.
*/
cv_broadcast(&arc_reclaim_waiters_cv);
/*
* Block until signaled, or after one second (we
* might need to perform arc_kmem_reap_now()
* even if we aren't being signalled)
*/
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
&arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
}
}
arc_reclaim_thread_exit = B_FALSE;
cv_broadcast(&arc_reclaim_thread_cv);
CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
thread_exit();
}
static u_int arc_dnlc_evicts_arg;
extern struct vfsops zfs_vfsops;
static void
arc_dnlc_evicts_thread(void *dummy __unused)
{
callb_cpr_t cpr;
u_int percent;
CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
mutex_enter(&arc_dnlc_evicts_lock);
while (!arc_dnlc_evicts_thread_exit) {
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
if (arc_dnlc_evicts_arg != 0) {
percent = arc_dnlc_evicts_arg;
mutex_exit(&arc_dnlc_evicts_lock);
#ifdef _KERNEL
vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
#endif
mutex_enter(&arc_dnlc_evicts_lock);
/*
* Clear our token only after vnlru_free()
* pass is done, to avoid false queueing of
* the requests.
*/
arc_dnlc_evicts_arg = 0;
}
}
arc_dnlc_evicts_thread_exit = FALSE;
cv_broadcast(&arc_dnlc_evicts_cv);
CALLB_CPR_EXIT(&cpr);
thread_exit();
}
void
dnlc_reduce_cache(void *arg)
{
u_int percent;
percent = (u_int)(uintptr_t)arg;
mutex_enter(&arc_dnlc_evicts_lock);
if (arc_dnlc_evicts_arg == 0) {
arc_dnlc_evicts_arg = percent;
cv_broadcast(&arc_dnlc_evicts_cv);
}
mutex_exit(&arc_dnlc_evicts_lock);
}
/*
* Adapt arc info given the number of bytes we are trying to add and
* the state that we are comming from. This function is only called
* when we are adding new content to the cache.
*/
static void
arc_adapt(int bytes, arc_state_t *state)
{
int mult;
uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
if (state == arc_l2c_only)
return;
ASSERT(bytes > 0);
/*
* Adapt the target size of the MRU list:
* - if we just hit in the MRU ghost list, then increase
* the target size of the MRU list.
* - if we just hit in the MFU ghost list, then increase
* the target size of the MFU list by decreasing the
* target size of the MRU list.
*/
if (state == arc_mru_ghost) {
mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
} else if (state == arc_mfu_ghost) {
uint64_t delta;
mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
mult = MIN(mult, 10);
delta = MIN(bytes * mult, arc_p);
arc_p = MAX(arc_p_min, arc_p - delta);
}
ASSERT((int64_t)arc_p >= 0);
if (arc_reclaim_needed()) {
cv_signal(&arc_reclaim_thread_cv);
return;
}
if (arc_no_grow)
return;
if (arc_c >= arc_c_max)
return;
/*
* If we're within (2 * maxblocksize) bytes of the target
* cache size, increment the target cache size
*/
if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
DTRACE_PROBE1(arc__inc_adapt, int, bytes);
atomic_add_64(&arc_c, (int64_t)bytes);
if (arc_c > arc_c_max)
arc_c = arc_c_max;
else if (state == arc_anon)
atomic_add_64(&arc_p, (int64_t)bytes);
if (arc_p > arc_c)
arc_p = arc_c;
}
ASSERT((int64_t)arc_p >= 0);
}
/*
* Check if arc_size has grown past our upper threshold, determined by
* zfs_arc_overflow_shift.
*/
static boolean_t
arc_is_overflowing(void)
{
/* Always allow at least one block of overflow */
uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
arc_c >> zfs_arc_overflow_shift);
return (arc_size >= arc_c + overflow);
}
static abd_t *
arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
arc_get_data_impl(hdr, size, tag);
if (type == ARC_BUFC_METADATA) {
return (abd_alloc(size, B_TRUE));
} else {
ASSERT(type == ARC_BUFC_DATA);
return (abd_alloc(size, B_FALSE));
}
}
static void *
arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
arc_get_data_impl(hdr, size, tag);
if (type == ARC_BUFC_METADATA) {
return (zio_buf_alloc(size));
} else {
ASSERT(type == ARC_BUFC_DATA);
return (zio_data_buf_alloc(size));
}
}
/*
* Allocate a block and return it to the caller. If we are hitting the
* hard limit for the cache size, we must sleep, waiting for the eviction
* thread to catch up. If we're past the target size but below the hard
* limit, we'll only signal the reclaim thread and continue on.
*/
static void
arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
arc_adapt(size, state);
/*
* If arc_size is currently overflowing, and has grown past our
* upper limit, we must be adding data faster than the evict
* thread can evict. Thus, to ensure we don't compound the
* problem by adding more data and forcing arc_size to grow even
* further past it's target size, we halt and wait for the
* eviction thread to catch up.
*
* It's also possible that the reclaim thread is unable to evict
* enough buffers to get arc_size below the overflow limit (e.g.
* due to buffers being un-evictable, or hash lock collisions).
* In this case, we want to proceed regardless if we're
* overflowing; thus we don't use a while loop here.
*/
if (arc_is_overflowing()) {
mutex_enter(&arc_reclaim_lock);
/*
* Now that we've acquired the lock, we may no longer be
* over the overflow limit, lets check.
*
* We're ignoring the case of spurious wake ups. If that
* were to happen, it'd let this thread consume an ARC
* buffer before it should have (i.e. before we're under
* the overflow limit and were signalled by the reclaim
* thread). As long as that is a rare occurrence, it
* shouldn't cause any harm.
*/
if (arc_is_overflowing()) {
cv_signal(&arc_reclaim_thread_cv);
cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
}
mutex_exit(&arc_reclaim_lock);
}
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
arc_space_consume(size, ARC_SPACE_META);
} else {
arc_space_consume(size, ARC_SPACE_DATA);
}
/*
* Update the state size. Note that ghost states have a
* "ghost size" and so don't need to be updated.
*/
if (!GHOST_STATE(state)) {
(void) refcount_add_many(&state->arcs_size, size, tag);
/*
* If this is reached via arc_read, the link is
* protected by the hash lock. If reached via
* arc_buf_alloc, the header should not be accessed by
* any other thread. And, if reached via arc_read_done,
* the hash lock will protect it if it's found in the
* hash table; otherwise no other thread should be
* trying to [add|remove]_reference it.
*/
if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
(void) refcount_add_many(&state->arcs_esize[type],
size, tag);
}
/*
* If we are growing the cache, and we are adding anonymous
* data, and we have outgrown arc_p, update arc_p
*/
if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
(refcount_count(&arc_anon->arcs_size) +
refcount_count(&arc_mru->arcs_size) > arc_p))
arc_p = MIN(arc_c, arc_p + size);
}
ARCSTAT_BUMP(arcstat_allocated);
}
static void
arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
{
arc_free_data_impl(hdr, size, tag);
abd_free(abd);
}
static void
arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
arc_free_data_impl(hdr, size, tag);
if (type == ARC_BUFC_METADATA) {
zio_buf_free(buf, size);
} else {
ASSERT(type == ARC_BUFC_DATA);
zio_data_buf_free(buf, size);
}
}
/*
* Free the arc data buffer.
*/
static void
arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
/* protected by hash lock, if in the hash table */
if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT(state != arc_anon && state != arc_l2c_only);
(void) refcount_remove_many(&state->arcs_esize[type],
size, tag);
}
(void) refcount_remove_many(&state->arcs_size, size, tag);
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
arc_space_return(size, ARC_SPACE_META);
} else {
ASSERT(type == ARC_BUFC_DATA);
arc_space_return(size, ARC_SPACE_DATA);
}
}
/*
* This routine is called whenever a buffer is accessed.
* NOTE: the hash lock is dropped in this function.
*/
static void
arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
clock_t now;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(HDR_HAS_L1HDR(hdr));
if (hdr->b_l1hdr.b_state == arc_anon) {
/*
* This buffer is not in the cache, and does not
* appear in our "ghost" list. Add the new buffer
* to the MRU state.
*/
ASSERT0(hdr->b_l1hdr.b_arc_access);
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mru, hdr, hash_lock);
} else if (hdr->b_l1hdr.b_state == arc_mru) {
now = ddi_get_lbolt();
/*
* If this buffer is here because of a prefetch, then either:
* - clear the flag if this is a "referencing" read
* (any subsequent access will bump this into the MFU state).
* or
* - move the buffer to the head of the list if this is
* another prefetch (to make it less likely to be evicted).
*/
if (HDR_PREFETCH(hdr)) {
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
/* link protected by hash lock */
ASSERT(multilist_link_active(
&hdr->b_l1hdr.b_arc_node));
} else {
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
ARCSTAT_BUMP(arcstat_mru_hits);
}
hdr->b_l1hdr.b_arc_access = now;
return;
}
/*
* This buffer has been "accessed" only once so far,
* but it is still in the cache. Move it to the MFU
* state.
*/
if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
/*
* More than 125ms have passed since we
* instantiated this buffer. Move it to the
* most frequently used state.
*/
hdr->b_l1hdr.b_arc_access = now;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mfu, hdr, hash_lock);
}
ARCSTAT_BUMP(arcstat_mru_hits);
} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
arc_state_t *new_state;
/*
* This buffer has been "accessed" recently, but
* was evicted from the cache. Move it to the
* MFU state.
*/
if (HDR_PREFETCH(hdr)) {
new_state = arc_mru;
if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
new_state = arc_mfu;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
}
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, hdr, hash_lock);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
} else if (hdr->b_l1hdr.b_state == arc_mfu) {
/*
* This buffer has been accessed more than once and is
* still in the cache. Keep it in the MFU state.
*
* NOTE: an add_reference() that occurred when we did
* the arc_read() will have kicked this off the list.
* If it was a prefetch, we will explicitly move it to
* the head of the list now.
*/
if ((HDR_PREFETCH(hdr)) != 0) {
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
/* link protected by hash_lock */
ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
arc_state_t *new_state = arc_mfu;
/*
* This buffer has been accessed more than once but has
* been evicted from the cache. Move it back to the
* MFU state.
*/
if (HDR_PREFETCH(hdr)) {
/*
* This is a prefetch access...
* move this block back to the MRU state.
*/
ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
new_state = arc_mru;
}
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(new_state, hdr, hash_lock);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
/*
* This buffer is on the 2nd Level ARC.
*/
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mfu, hdr, hash_lock);
} else {
ASSERT(!"invalid arc state");
}
}
/* a generic arc_done_func_t which you can use */
/* ARGSUSED */
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
if (zio == NULL || zio->io_error == 0)
bcopy(buf->b_data, arg, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
/* a generic arc_done_func_t */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
arc_buf_t **bufp = arg;
if (zio && zio->io_error) {
arc_buf_destroy(buf, arg);
*bufp = NULL;
} else {
*bufp = buf;
ASSERT(buf->b_data);
}
}
static void
arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
{
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
} else {
if (HDR_COMPRESSION_ENABLED(hdr)) {
ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
BP_GET_COMPRESS(bp));
}
ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
}
}
static void
arc_read_done(zio_t *zio)
{
arc_buf_hdr_t *hdr = zio->io_private;
kmutex_t *hash_lock = NULL;
arc_callback_t *callback_list;
arc_callback_t *acb;
boolean_t freeable = B_FALSE;
boolean_t no_zio_error = (zio->io_error == 0);
/*
* The hdr was inserted into hash-table and removed from lists
* prior to starting I/O. We should find this header, since
* it's in the hash table, and it should be legit since it's
* not possible to evict it during the I/O. The only possible
* reason for it not to be found is if we were freed during the
* read.
*/
if (HDR_IN_HASH_TABLE(hdr)) {
ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
ASSERT3U(hdr->b_dva.dva_word[0], ==,
BP_IDENTITY(zio->io_bp)->dva_word[0]);
ASSERT3U(hdr->b_dva.dva_word[1], ==,
BP_IDENTITY(zio->io_bp)->dva_word[1]);
arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
&hash_lock);
ASSERT((found == hdr &&
DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
(found == hdr && HDR_L2_READING(hdr)));
ASSERT3P(hash_lock, !=, NULL);
}
if (no_zio_error) {
/* byteswap if necessary */
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
if (BP_GET_LEVEL(zio->io_bp) > 0) {
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
} else {
hdr->b_l1hdr.b_byteswap =
DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
}
} else {
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
}
}
arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
if (l2arc_noprefetch && HDR_PREFETCH(hdr))
arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
* called arc_access (to prevent any simultaneous readers from
* getting confused).
*/
arc_access(hdr, hash_lock);
}
/*
* If a read request has a callback (i.e. acb_done is not NULL), then we
* make a buf containing the data according to the parameters which were
* passed in. The implementation of arc_buf_alloc_impl() ensures that we
* aren't needlessly decompressing the data multiple times.
*/
int callback_cnt = 0;
for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
if (!acb->acb_done)
continue;
/* This is a demand read since prefetches don't use callbacks */
callback_cnt++;
int error = arc_buf_alloc_impl(hdr, acb->acb_private,
acb->acb_compressed, no_zio_error, &acb->acb_buf);
if (no_zio_error) {
zio->io_error = error;
}
}
hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
if (callback_cnt == 0) {
ASSERT(HDR_PREFETCH(hdr));
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
}
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
callback_list != NULL);
if (no_zio_error) {
arc_hdr_verify(hdr, zio->io_bp);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
if (hdr->b_l1hdr.b_state != arc_anon)
arc_change_state(arc_anon, hdr, hash_lock);
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
}
/*
* Broadcast before we drop the hash_lock to avoid the possibility
* that the hdr (and hence the cv) might be freed before we get to
* the cv_broadcast().
*/
cv_broadcast(&hdr->b_l1hdr.b_cv);
if (hash_lock != NULL) {
mutex_exit(hash_lock);
} else {
/*
* This block was freed while we waited for the read to
* complete. It has been removed from the hash table and
* moved to the anonymous state (so that it won't show up
* in the cache).
*/
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
}
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
if (acb->acb_done)
acb->acb_done(zio, acb->acb_buf, acb->acb_private);
if (acb->acb_zio_dummy != NULL) {
acb->acb_zio_dummy->io_error = zio->io_error;
zio_nowait(acb->acb_zio_dummy);
}
callback_list = acb->acb_next;
kmem_free(acb, sizeof (arc_callback_t));
}
if (freeable)
arc_hdr_destroy(hdr);
}
/*
* "Read" the block at the specified DVA (in bp) via the
* cache. If the block is found in the cache, invoke the provided
* callback immediately and return. Note that the `zio' parameter
* in the callback will be NULL in this case, since no IO was
* required. If the block is not in the cache pass the read request
* on to the spa with a substitute callback function, so that the
* requested block will be added to the cache.
*
* If a read request arrives for a block that has a read in-progress,
* either wait for the in-progress read to complete (and return the
* results); or, if this is a read with a "done" func, add a record
* to the read to invoke the "done" func when the read completes,
* and return; or just return.
*
* arc_read_done() will invoke all the requested "done" functions
* for readers of this block.
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags,
arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = NULL;
kmutex_t *hash_lock = NULL;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
ASSERT(!BP_IS_EMBEDDED(bp) ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
top:
if (!BP_IS_EMBEDDED(bp)) {
/*
* Embedded BP's have no DVA and require no I/O to "read".
* Create an anonymous arc buf to back it.
*/
hdr = buf_hash_find(guid, bp, &hash_lock);
}
if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
arc_buf_t *buf = NULL;
*arc_flags |= ARC_FLAG_CACHED;
if (HDR_IO_IN_PROGRESS(hdr)) {
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
/*
* This sync read must wait for an
* in-progress async read (e.g. a predictive
* prefetch). Async reads are queued
* separately at the vdev_queue layer, so
* this is a form of priority inversion.
* Ideally, we would "inherit" the demand
* i/o's priority by moving the i/o from
* the async queue to the synchronous queue,
* but there is currently no mechanism to do
* so. Track this so that we can evaluate
* the magnitude of this potential performance
* problem.
*
* Note that if the prefetch i/o is already
* active (has been issued to the device),
* the prefetch improved performance, because
* we issued it sooner than we would have
* without the prefetch.
*/
DTRACE_PROBE1(arc__sync__wait__for__async,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_sync_wait_for_async);
}
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREDICTIVE_PREFETCH);
}
if (*arc_flags & ARC_FLAG_WAIT) {
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
mutex_exit(hash_lock);
goto top;
}
ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
if (done) {
arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t),
KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
acb->acb_compressed = compressed_read;
if (pio != NULL)
acb->acb_zio_dummy = zio_null(pio,
spa, NULL, NULL, NULL, zio_flags);
ASSERT3P(acb->acb_done, !=, NULL);
acb->acb_next = hdr->b_l1hdr.b_acb;
hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
return (0);
}
mutex_exit(hash_lock);
return (0);
}
ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
hdr->b_l1hdr.b_state == arc_mfu);
if (done) {
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
/*
* This is a demand read which does not have to
* wait for i/o because we did a predictive
* prefetch i/o for it, which has completed.
*/
DTRACE_PROBE1(
arc__demand__hit__predictive__prefetch,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(
arcstat_demand_hit_predictive_prefetch);
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREDICTIVE_PREFETCH);
}
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
/* Get a buf with the desired data in it. */
VERIFY0(arc_buf_alloc_impl(hdr, private,
compressed_read, B_TRUE, &buf));
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, hits);
if (done)
done(NULL, buf, private);
} else {
uint64_t lsize = BP_GET_LSIZE(bp);
uint64_t psize = BP_GET_PSIZE(bp);
arc_callback_t *acb;
vdev_t *vd = NULL;
uint64_t addr = 0;
boolean_t devw = B_FALSE;
uint64_t size;
if (hdr == NULL) {
/* this block is not in the cache */
arc_buf_hdr_t *exists = NULL;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
BP_GET_COMPRESS(bp), type);
if (!BP_IS_EMBEDDED(bp)) {
hdr->b_dva = *BP_IDENTITY(bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
exists = buf_hash_insert(hdr, &hash_lock);
}
if (exists != NULL) {
/* somebody beat us to the hash insert */
mutex_exit(hash_lock);
buf_discard_identity(hdr);
arc_hdr_destroy(hdr);
goto top; /* restart the IO request */
}
} else {
/*
* This block is in the ghost cache. If it was L2-only
* (and thus didn't have an L1 hdr), we realloc the
* header to add an L1 hdr.
*/
if (!HDR_HAS_L1HDR(hdr)) {
hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
hdr_full_cache);
}
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
/*
* This is a delicate dance that we play here.
* This hdr is in the ghost list so we access it
* to move it out of the ghost list before we
* initiate the read. If it's a prefetch then
* it won't have a callback so we'll remove the
* reference that arc_buf_alloc_impl() created. We
* do this after we've called arc_access() to
* avoid hitting an assert in remove_reference().
*/
arc_access(hdr, hash_lock);
arc_hdr_alloc_pabd(hdr);
}
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
size = arc_hdr_size(hdr);
/*
* If compression is enabled on the hdr, then will do
* RAW I/O and will store the compressed data in the hdr's
* data block. Otherwise, the hdr's data block will contain
* the uncompressed data.
*/
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
zio_flags |= ZIO_FLAG_RAW;
}
if (*arc_flags & ARC_FLAG_PREFETCH)
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
if (BP_GET_LEVEL(bp) > 0)
arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
acb->acb_compressed = compressed_read;
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
hdr->b_l1hdr.b_acb = acb;
arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
if (HDR_HAS_L2HDR(hdr) &&
(vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
devw = hdr->b_l2hdr.b_dev->l2ad_writing;
addr = hdr->b_l2hdr.b_daddr;
/*
- * Lock out device removal.
+ * Lock out L2ARC device removal.
*/
if (vdev_is_dead(vd) ||
!spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
vd = NULL;
}
if (priority == ZIO_PRIORITY_ASYNC_READ)
arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
else
arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
if (hash_lock != NULL)
mutex_exit(hash_lock);
/*
* At this point, we have a level 1 cache miss. Try again in
* L2ARC if possible.
*/
ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
uint64_t, lsize, zbookmark_phys_t *, zb);
ARCSTAT_BUMP(arcstat_misses);
ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);
#ifdef _KERNEL
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(curproc);
racct_add_force(curproc, RACCT_READBPS, size);
racct_add_force(curproc, RACCT_READIOPS, 1);
PROC_UNLOCK(curproc);
}
#endif /* RACCT */
curthread->td_ru.ru_inblock++;
#endif
if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
/*
* Read from the L2ARC if the following are true:
* 1. The L2ARC vdev was previously cached.
* 2. This buffer still has L2ARC metadata.
* 3. This buffer isn't currently writing to the L2ARC.
* 4. The L2ARC entry wasn't evicted, which may
* also have invalidated the vdev.
* 5. This isn't prefetch and l2arc_noprefetch is set.
*/
if (HDR_HAS_L2HDR(hdr) &&
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
!(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
l2arc_read_callback_t *cb;
abd_t *abd;
uint64_t asize;
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_hits);
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
KM_SLEEP);
cb->l2rcb_hdr = hdr;
cb->l2rcb_bp = *bp;
cb->l2rcb_zb = *zb;
cb->l2rcb_flags = zio_flags;
asize = vdev_psize_to_asize(vd, size);
if (asize != size) {
abd = abd_alloc_for_io(asize,
HDR_ISTYPE_METADATA(hdr));
cb->l2rcb_abd = abd;
} else {
abd = hdr->b_l1hdr.b_pabd;
}
ASSERT(addr >= VDEV_LABEL_START_SIZE &&
addr + asize <= vd->vdev_psize -
VDEV_LABEL_END_SIZE);
/*
* l2arc read. The SCL_L2ARC lock will be
* released by l2arc_read_done().
* Issue a null zio if the underlying buffer
* was squashed to zero size by compression.
*/
ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
ZIO_COMPRESS_EMPTY);
rzio = zio_read_phys(pio, vd, addr,
asize, abd,
ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority,
zio_flags | ZIO_FLAG_DONT_CACHE |
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
ARCSTAT_INCR(arcstat_l2_read_bytes, size);
if (*arc_flags & ARC_FLAG_NOWAIT) {
zio_nowait(rzio);
return (0);
}
ASSERT(*arc_flags & ARC_FLAG_WAIT);
if (zio_wait(rzio) == 0)
return (0);
/* l2arc read error; goto zio_read() */
} else {
DTRACE_PROBE1(l2arc__miss,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_misses);
if (HDR_L2_WRITING(hdr))
ARCSTAT_BUMP(arcstat_l2_rw_clash);
spa_config_exit(spa, SCL_L2ARC, vd);
}
} else {
if (vd != NULL)
spa_config_exit(spa, SCL_L2ARC, vd);
if (l2arc_ndev != 0) {
DTRACE_PROBE1(l2arc__miss,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_misses);
}
}
rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
arc_read_done, hdr, priority, zio_flags, zb);
if (*arc_flags & ARC_FLAG_WAIT)
return (zio_wait(rzio));
ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
zio_nowait(rzio);
}
return (0);
}
/*
* Notify the arc that a block was freed, and thus will never be used again.
*/
void
arc_freed(spa_t *spa, const blkptr_t *bp)
{
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
uint64_t guid = spa_load_guid(spa);
ASSERT(!BP_IS_EMBEDDED(bp));
hdr = buf_hash_find(guid, bp, &hash_lock);
if (hdr == NULL)
return;
/*
* We might be trying to free a block that is still doing I/O
* (i.e. prefetch) or has a reference (i.e. a dedup-ed,
* dmu_sync-ed block). If this block is being prefetched, then it
* would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
* until the I/O completes. A block may also have a reference if it is
* part of a dedup-ed, dmu_synced write. The dmu_sync() function would
* have written the new block to its final resting place on disk but
* without the dedup flag set. This would have left the hdr in the MRU
* state and discoverable. When the txg finally syncs it detects that
* the block was overridden in open context and issues an override I/O.
* Since this is a dedup block, the override I/O will determine if the
* block is already in the DDT. If so, then it will replace the io_bp
* with the bp from the DDT and allow the I/O to finish. When the I/O
* reaches the done callback, dbuf_write_override_done, it will
* check to see if the io_bp and io_bp_override are identical.
* If they are not, then it indicates that the bp was replaced with
* the bp in the DDT and the override bp is freed. This allows
* us to arrive here with a reference on a block that is being
* freed. So if we have an I/O in progress, or a reference to
* this hdr, then we don't destroy the hdr.
*/
if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr);
mutex_exit(hash_lock);
} else {
mutex_exit(hash_lock);
}
}
/*
* Release this buffer from the cache, making it an anonymous buffer. This
* must be done after a read and prior to modifying the buffer contents.
* If the buffer has more than one reference, we must make
* a new hdr for the buffer.
*/
void
arc_release(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
/*
* It would be nice to assert that if it's DMU metadata (level >
* 0 || it's the dnode file), then it must be syncing context.
* But we don't know that information at this level.
*/
mutex_enter(&buf->b_evict_lock);
ASSERT(HDR_HAS_L1HDR(hdr));
/*
* We don't grab the hash lock prior to this check, because if
* the buffer's header is in the arc_anon state, it won't be
* linked into the hash table.
*/
if (hdr->b_l1hdr.b_state == arc_anon) {
mutex_exit(&buf->b_evict_lock);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(!HDR_IN_HASH_TABLE(hdr));
ASSERT(!HDR_HAS_L2HDR(hdr));
ASSERT(HDR_EMPTY(hdr));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
hdr->b_l1hdr.b_arc_access = 0;
/*
* If the buf is being overridden then it may already
* have a hdr that is not empty.
*/
buf_discard_identity(hdr);
arc_buf_thaw(buf);
return;
}
kmutex_t *hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
/*
* This assignment is only valid as long as the hash_lock is
* held, we must be careful not to reference state or the
* b_state field after dropping the lock.
*/
arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT3P(state, !=, arc_anon);
/* this buffer is not on any list */
ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
if (HDR_HAS_L2HDR(hdr)) {
mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
/*
* We have to recheck this conditional again now that
* we're holding the l2ad_mtx to prevent a race with
* another thread which might be concurrently calling
* l2arc_evict(). In that case, l2arc_evict() might have
* destroyed the header's L2 portion as we were waiting
* to acquire the l2ad_mtx.
*/
if (HDR_HAS_L2HDR(hdr)) {
l2arc_trim(hdr);
arc_hdr_l2hdr_destroy(hdr);
}
mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
}
/*
* Do we have more than one buf?
*/
if (hdr->b_l1hdr.b_bufcnt > 1) {
arc_buf_hdr_t *nhdr;
uint64_t spa = hdr->b_spa;
uint64_t psize = HDR_GET_PSIZE(hdr);
uint64_t lsize = HDR_GET_LSIZE(hdr);
enum zio_compress compress = HDR_GET_COMPRESS(hdr);
arc_buf_contents_t type = arc_buf_type(hdr);
VERIFY3U(hdr->b_type, ==, type);
ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
(void) remove_reference(hdr, hash_lock, tag);
if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
ASSERT(ARC_BUF_LAST(buf));
}
/*
* Pull the data off of this hdr and attach it to
* a new anonymous hdr. Also find the last buffer
* in the hdr's buffer list.
*/
arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
ASSERT3P(lastbuf, !=, NULL);
/*
* If the current arc_buf_t and the hdr are sharing their data
* buffer, then we must stop sharing that block.
*/
if (arc_buf_is_shared(buf)) {
VERIFY(!arc_buf_is_shared(lastbuf));
/*
* First, sever the block sharing relationship between
* buf and the arc_buf_hdr_t.
*/
arc_unshare_buf(hdr, buf);
/*
* Now we need to recreate the hdr's b_pabd. Since we
* have lastbuf handy, we try to share with it, but if
* we can't then we allocate a new b_pabd and copy the
* data from buf into it.
*/
if (arc_can_share(hdr, lastbuf)) {
arc_share_buf(hdr, lastbuf);
} else {
arc_hdr_alloc_pabd(hdr);
abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
buf->b_data, psize);
}
VERIFY3P(lastbuf->b_data, !=, NULL);
} else if (HDR_SHARED_DATA(hdr)) {
/*
* Uncompressed shared buffers are always at the end
* of the list. Compressed buffers don't have the
* same requirements. This makes it hard to
* simply assert that the lastbuf is shared so
* we rely on the hdr's compression flags to determine
* if we have a compressed, shared buffer.
*/
ASSERT(arc_buf_is_shared(lastbuf) ||
HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
ASSERT(!ARC_BUF_SHARED(buf));
}
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_size,
arc_buf_size(buf), buf);
if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_esize[type],
arc_buf_size(buf), buf);
}
hdr->b_l1hdr.b_bufcnt -= 1;
arc_cksum_verify(buf);
#ifdef illumos
arc_buf_unwatch(buf);
#endif
mutex_exit(hash_lock);
/*
* Allocate a new hdr. The new hdr will contain a b_pabd
* buffer which will be freed in arc_write().
*/
nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(nhdr->b_l1hdr.b_bufcnt);
ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
VERIFY3U(nhdr->b_type, ==, type);
ASSERT(!HDR_SHARED_DATA(nhdr));
nhdr->b_l1hdr.b_buf = buf;
nhdr->b_l1hdr.b_bufcnt = 1;
(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
mutex_exit(&buf->b_evict_lock);
(void) refcount_add_many(&arc_anon->arcs_size,
arc_buf_size(buf), buf);
} else {
mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
/* protected by hash lock, or hdr is on arc_anon */
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_l1hdr.b_arc_access = 0;
mutex_exit(hash_lock);
buf_discard_identity(hdr);
arc_buf_thaw(buf);
}
}
int
arc_released(arc_buf_t *buf)
{
int released;
mutex_enter(&buf->b_evict_lock);
released = (buf->b_data != NULL &&
buf->b_hdr->b_l1hdr.b_state == arc_anon);
mutex_exit(&buf->b_evict_lock);
return (released);
}
#ifdef ZFS_DEBUG
int
arc_referenced(arc_buf_t *buf)
{
int referenced;
mutex_enter(&buf->b_evict_lock);
referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
mutex_exit(&buf->b_evict_lock);
return (referenced);
}
#endif
static void
arc_write_ready(zio_t *zio)
{
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
/*
* If we're reexecuting this zio because the pool suspended, then
* cleanup any state that was previously set the first time the
* callback was invoked.
*/
if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
arc_cksum_free(hdr);
#ifdef illumos
arc_buf_unwatch(buf);
#endif
if (hdr->b_l1hdr.b_pabd != NULL) {
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
arc_hdr_free_pabd(hdr);
}
}
}
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT(!arc_buf_is_shared(buf));
callback->awcb_ready(zio, buf, callback->awcb_private);
if (HDR_IO_IN_PROGRESS(hdr))
ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
arc_cksum_compute(buf);
arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
enum zio_compress compress;
if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
compress = ZIO_COMPRESS_OFF;
} else {
ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
compress = BP_GET_COMPRESS(zio->io_bp);
}
HDR_SET_PSIZE(hdr, psize);
arc_hdr_set_compress(hdr, compress);
/*
* Fill the hdr with data. If the hdr is compressed, the data we want
* is available from the zio, otherwise we can take it from the buf.
*
* We might be able to share the buf's data with the hdr here. However,
* doing so would cause the ARC to be full of linear ABDs if we write a
* lot of shareable data. As a compromise, we check whether scattered
* ABDs are allowed, and assume that if they are then the user wants
* the ARC to be primarily filled with them regardless of the data being
* written. Therefore, if they're allowed then we allocate one and copy
* the data into it; otherwise, we share the data directly if we can.
*/
if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
arc_hdr_alloc_pabd(hdr);
/*
* Ideally, we would always copy the io_abd into b_pabd, but the
* user may have disabled compressed ARC, thus we must check the
* hdr's compression setting rather than the io_bp's.
*/
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
ZIO_COMPRESS_OFF);
ASSERT3U(psize, >, 0);
abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
} else {
ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
arc_buf_size(buf));
}
} else {
ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
arc_share_buf(hdr, buf);
}
arc_hdr_verify(hdr, zio->io_bp);
}
static void
arc_write_children_ready(zio_t *zio)
{
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
callback->awcb_children_ready(zio, buf, callback->awcb_private);
}
/*
* The SPA calls this callback for each physical write that happens on behalf
* of a logical write. See the comment in dbuf_write_physdone() for details.
*/
static void
arc_write_physdone(zio_t *zio)
{
arc_write_callback_t *cb = zio->io_private;
if (cb->awcb_physdone != NULL)
cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
}
static void
arc_write_done(zio_t *zio)
{
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
if (zio->io_error == 0) {
arc_hdr_verify(hdr, zio->io_bp);
if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
buf_discard_identity(hdr);
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
}
} else {
ASSERT(HDR_EMPTY(hdr));
}
/*
* If the block to be written was all-zero or compressed enough to be
* embedded in the BP, no write was performed so there will be no
* dva/birth/checksum. The buffer must therefore remain anonymous
* (and uncached).
*/
if (!HDR_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
ASSERT3U(zio->io_error, ==, 0);
arc_cksum_verify(buf);
exists = buf_hash_insert(hdr, &hash_lock);
if (exists != NULL) {
/*
* This can only happen if we overwrite for
* sync-to-convergence, because we remove
* buffers from the hash table when we arc_free().
*/
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
panic("bad overwrite, hdr=%p exists=%p",
(void *)hdr, (void *)exists);
ASSERT(refcount_is_zero(
&exists->b_l1hdr.b_refcnt));
arc_change_state(arc_anon, exists, hash_lock);
mutex_exit(hash_lock);
arc_hdr_destroy(exists);
exists = buf_hash_insert(hdr, &hash_lock);
ASSERT3P(exists, ==, NULL);
} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
/* nopwrite */
ASSERT(zio->io_prop.zp_nopwrite);
if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
panic("bad nopwrite, hdr=%p exists=%p",
(void *)hdr, (void *)exists);
} else {
/* Dedup */
ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(BP_GET_DEDUP(zio->io_bp));
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
}
}
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
/* if it's not anon, we are doing a scrub */
if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else {
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
}
ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private);
abd_put(zio->io_abd);
kmem_free(callback, sizeof (arc_write_callback_t));
}
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
arc_done_func_t *children_ready, arc_done_func_t *physdone,
arc_done_func_t *done, void *private, zio_priority_t priority,
int zio_flags, const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
zio_t *zio;
zio_prop_t localprop = *zp;
ASSERT3P(ready, !=, NULL);
ASSERT3P(done, !=, NULL);
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
if (l2arc)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
if (ARC_BUF_COMPRESSED(buf)) {
/*
* We're writing a pre-compressed buffer. Make the
* compression algorithm requested by the zio_prop_t match
* the pre-compressed buffer's compression algorithm.
*/
localprop.zp_compress = HDR_GET_COMPRESS(hdr);
ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
zio_flags |= ZIO_FLAG_RAW;
}
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_children_ready = children_ready;
callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
/*
* The hdr's b_pabd is now stale, free it now. A new data block
* will be allocated when the zio pipeline calls arc_write_ready().
*/
if (hdr->b_l1hdr.b_pabd != NULL) {
/*
* If the buf is currently sharing the data block with
* the hdr then we need to break that relationship here.
* The hdr will remain with a NULL data pointer and the
* buf will take sole ownership of the block.
*/
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
arc_hdr_free_pabd(hdr);
}
VERIFY3P(buf->b_data, !=, NULL);
arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
}
ASSERT(!arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
zio = zio_write(pio, spa, txg, bp,
abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
arc_write_physdone, arc_write_done, callback,
priority, zio_flags, zb);
return (zio);
}
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
uint64_t available_memory = ptob(freemem);
static uint64_t page_load = 0;
static uint64_t last_txg = 0;
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
available_memory =
MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
#endif
if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
return (0);
if (txg > last_txg) {
last_txg = txg;
page_load = 0;
}
/*
* If we are in pageout, we know that memory is already tight,
* the arc is already going to be evicting, so we just want to
* continue to let page writes occur as quickly as possible.
*/
if (curproc == pageproc) {
if (page_load > MAX(ptob(minfree), available_memory) / 4)
return (SET_ERROR(ERESTART));
/* Note: reserve is inflated, so we deflate */
page_load += reserve / 8;
return (0);
} else if (page_load > 0 && arc_reclaim_needed()) {
/* memory is low, delay before restarting */
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
return (SET_ERROR(EAGAIN));
}
page_load = 0;
#endif
return (0);
}
void
arc_tempreserve_clear(uint64_t reserve)
{
atomic_add_64(&arc_tempreserve, -reserve);
ASSERT((int64_t)arc_tempreserve >= 0);
}
int
arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
int error;
uint64_t anon_size;
if (reserve > arc_c/4 && !arc_no_grow) {
arc_c = MIN(arc_c_max, reserve * 4);
DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
}
if (reserve > arc_c)
return (SET_ERROR(ENOMEM));
/*
* Don't count loaned bufs as in flight dirty data to prevent long
* network delays from blocking transactions that are ready to be
* assigned to a txg.
*/
/* assert that it has not wrapped around */
ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
arc_loaned_bytes), 0);
/*
* Writes will, almost always, require additional memory allocations
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
error = arc_memory_throttle(reserve, txg);
if (error != 0)
return (error);
/*
* Throttle writes when the amount of dirty data in the cache
* gets too large. We try to keep the cache less than half full
* of dirty blocks so that our sync times don't grow too large.
* Note: if two requests come in concurrently, we might let them
* both succeed, when one of them should fail. Not a huge deal.
*/
if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
anon_size > arc_c / 4) {
uint64_t meta_esize =
refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
uint64_t data_esize =
refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
"anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
arc_tempreserve >> 10, meta_esize >> 10,
data_esize >> 10, reserve >> 10, arc_c >> 10);
return (SET_ERROR(ERESTART));
}
atomic_add_64(&arc_tempreserve, reserve);
return (0);
}
static void
arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
kstat_named_t *evict_data, kstat_named_t *evict_metadata)
{
size->value.ui64 = refcount_count(&state->arcs_size);
evict_data->value.ui64 =
refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
evict_metadata->value.ui64 =
refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
}
static int
arc_kstat_update(kstat_t *ksp, int rw)
{
arc_stats_t *as = ksp->ks_data;
if (rw == KSTAT_WRITE) {
return (EACCES);
} else {
arc_kstat_update_state(arc_anon,
&as->arcstat_anon_size,
&as->arcstat_anon_evictable_data,
&as->arcstat_anon_evictable_metadata);
arc_kstat_update_state(arc_mru,
&as->arcstat_mru_size,
&as->arcstat_mru_evictable_data,
&as->arcstat_mru_evictable_metadata);
arc_kstat_update_state(arc_mru_ghost,
&as->arcstat_mru_ghost_size,
&as->arcstat_mru_ghost_evictable_data,
&as->arcstat_mru_ghost_evictable_metadata);
arc_kstat_update_state(arc_mfu,
&as->arcstat_mfu_size,
&as->arcstat_mfu_evictable_data,
&as->arcstat_mfu_evictable_metadata);
arc_kstat_update_state(arc_mfu_ghost,
&as->arcstat_mfu_ghost_size,
&as->arcstat_mfu_ghost_evictable_data,
&as->arcstat_mfu_ghost_evictable_metadata);
}
return (0);
}
/*
* This function *must* return indices evenly distributed between all
* sublists of the multilist. This is needed due to how the ARC eviction
* code is laid out; arc_evict_state() assumes ARC buffers are evenly
* distributed between all sublists and uses this assumption when
* deciding which sublist to evict from and how much to evict from it.
*/
unsigned int
arc_state_multilist_index_func(multilist_t *ml, void *obj)
{
arc_buf_hdr_t *hdr = obj;
/*
* We rely on b_dva to generate evenly distributed index
* numbers using buf_hash below. So, as an added precaution,
* let's make sure we never add empty buffers to the arc lists.
*/
ASSERT(!HDR_EMPTY(hdr));
/*
* The assumption here, is the hash value for a given
* arc_buf_hdr_t will remain constant throughout it's lifetime
* (i.e. it's b_spa, b_dva, and b_birth fields don't change).
* Thus, we don't need to store the header's sublist index
* on insertion, as this index can be recalculated on removal.
*
* Also, the low order bits of the hash value are thought to be
* distributed evenly. Otherwise, in the case that the multilist
* has a power of two number of sublists, each sublists' usage
* would not be evenly distributed.
*/
return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
multilist_get_num_sublists(ml));
}
#ifdef _KERNEL
static eventhandler_tag arc_event_lowmem = NULL;
static void
arc_lowmem(void *arg __unused, int howto __unused)
{
mutex_enter(&arc_reclaim_lock);
DTRACE_PROBE1(arc__needfree, int64_t, ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE);
cv_signal(&arc_reclaim_thread_cv);
/*
* It is unsafe to block here in arbitrary threads, because we can come
* here from ARC itself and may hold ARC locks and thus risk a deadlock
* with ARC reclaim thread.
*/
if (curproc == pageproc)
(void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
mutex_exit(&arc_reclaim_lock);
}
#endif
static void
arc_state_init(void)
{
arc_anon = &ARC_anon;
arc_mru = &ARC_mru;
arc_mru_ghost = &ARC_mru_ghost;
arc_mfu = &ARC_mfu;
arc_mfu_ghost = &ARC_mfu_ghost;
arc_l2c_only = &ARC_l2c_only;
arc_mru->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_mru->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_mfu->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_mfu->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
refcount_create(&arc_anon->arcs_size);
refcount_create(&arc_mru->arcs_size);
refcount_create(&arc_mru_ghost->arcs_size);
refcount_create(&arc_mfu->arcs_size);
refcount_create(&arc_mfu_ghost->arcs_size);
refcount_create(&arc_l2c_only->arcs_size);
}
static void
arc_state_fini(void)
{
refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
refcount_destroy(&arc_anon->arcs_size);
refcount_destroy(&arc_mru->arcs_size);
refcount_destroy(&arc_mru_ghost->arcs_size);
refcount_destroy(&arc_mfu->arcs_size);
refcount_destroy(&arc_mfu_ghost->arcs_size);
refcount_destroy(&arc_l2c_only->arcs_size);
multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
}
uint64_t
arc_max_bytes(void)
{
return (arc_c_max);
}
void
arc_init(void)
{
int i, prefetch_tunable_set = 0;
/*
* allmem is "all memory that we could possibly use".
*/
#ifdef illumos
#ifdef _KERNEL
uint64_t allmem = ptob(physmem - swapfs_minfree);
#else
uint64_t allmem = (physmem * PAGESIZE) / 2;
#endif
#else
uint64_t allmem = kmem_size();
#endif
mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
arc_c_min = MAX(allmem / 32, arc_abs_min);
/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
if (allmem >= 1 << 30)
arc_c_max = allmem - (1 << 30);
else
arc_c_max = arc_c_min;
arc_c_max = MAX(allmem * 5 / 8, arc_c_max);
/*
* In userland, there's only the memory pressure that we artificially
* create (see arc_available_memory()). Don't let arc_c get too
* small, because it can cause transactions to be larger than
* arc_c, causing arc_tempreserve_space() to fail.
*/
#ifndef _KERNEL
arc_c_min = arc_c_max / 2;
#endif
#ifdef _KERNEL
/*
* Allow the tunables to override our calculations if they are
* reasonable.
*/
if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) {
arc_c_max = zfs_arc_max;
arc_c_min = MIN(arc_c_min, arc_c_max);
}
if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
arc_c_min = zfs_arc_min;
#endif
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
arc_size = 0;
/* limit meta-data to 1/4 of the arc capacity */
arc_meta_limit = arc_c_max / 4;
#ifdef _KERNEL
/*
* Metadata is stored in the kernel's heap. Don't let us
* use more than half the heap for the ARC.
*/
arc_meta_limit = MIN(arc_meta_limit,
vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
#endif
/* Allow the tunable to override if it is reasonable */
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
arc_meta_limit = zfs_arc_meta_limit;
if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
arc_c_min = arc_meta_limit / 2;
if (zfs_arc_meta_min > 0) {
arc_meta_min = zfs_arc_meta_min;
} else {
arc_meta_min = arc_c_min / 2;
}
if (zfs_arc_grow_retry > 0)
arc_grow_retry = zfs_arc_grow_retry;
if (zfs_arc_shrink_shift > 0)
arc_shrink_shift = zfs_arc_shrink_shift;
if (zfs_arc_no_grow_shift > 0)
arc_no_grow_shift = zfs_arc_no_grow_shift;
/*
* Ensure that arc_no_grow_shift is less than arc_shrink_shift.
*/
if (arc_no_grow_shift >= arc_shrink_shift)
arc_no_grow_shift = arc_shrink_shift - 1;
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
if (arc_c < arc_c_min)
arc_c = arc_c_min;
zfs_arc_min = arc_c_min;
zfs_arc_max = arc_c_max;
arc_state_init();
buf_init();
arc_reclaim_thread_exit = B_FALSE;
arc_dnlc_evicts_thread_exit = FALSE;
arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
if (arc_ksp != NULL) {
arc_ksp->ks_data = &arc_stats;
arc_ksp->ks_update = arc_kstat_update;
kstat_install(arc_ksp);
}
(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
#ifdef _KERNEL
arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
EVENTHANDLER_PRI_FIRST);
#endif
(void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
arc_dead = B_FALSE;
arc_warm = B_FALSE;
/*
* Calculate maximum amount of dirty data per pool.
*
* If it has been set by /etc/system, take that.
* Otherwise, use a percentage of physical memory defined by
* zfs_dirty_data_max_percent (default 10%) with a cap at
* zfs_dirty_data_max_max (default 4GB).
*/
if (zfs_dirty_data_max == 0) {
zfs_dirty_data_max = ptob(physmem) *
zfs_dirty_data_max_percent / 100;
zfs_dirty_data_max = MIN(zfs_dirty_data_max,
zfs_dirty_data_max_max);
}
#ifdef _KERNEL
if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
prefetch_tunable_set = 1;
#ifdef __i386__
if (prefetch_tunable_set == 0) {
printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
"-- to enable,\n");
printf(" add \"vfs.zfs.prefetch_disable=0\" "
"to /boot/loader.conf.\n");
zfs_prefetch_disable = 1;
}
#else
if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
prefetch_tunable_set == 0) {
printf("ZFS NOTICE: Prefetch is disabled by default if less "
"than 4GB of RAM is present;\n"
" to enable, add \"vfs.zfs.prefetch_disable=0\" "
"to /boot/loader.conf.\n");
zfs_prefetch_disable = 1;
}
#endif
/* Warn about ZFS memory and address space requirements. */
if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
"expect unstable behavior.\n");
}
if (allmem < 512 * (1 << 20)) {
printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
"expect unstable behavior.\n");
printf(" Consider tuning vm.kmem_size and "
"vm.kmem_size_max\n");
printf(" in /boot/loader.conf.\n");
}
#endif
}
void
arc_fini(void)
{
#ifdef _KERNEL
if (arc_event_lowmem != NULL)
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
#endif
mutex_enter(&arc_reclaim_lock);
arc_reclaim_thread_exit = B_TRUE;
/*
* The reclaim thread will set arc_reclaim_thread_exit back to
* B_FALSE when it is finished exiting; we're waiting for that.
*/
while (arc_reclaim_thread_exit) {
cv_signal(&arc_reclaim_thread_cv);
cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
}
mutex_exit(&arc_reclaim_lock);
/* Use B_TRUE to ensure *all* buffers are evicted */
arc_flush(NULL, B_TRUE);
mutex_enter(&arc_dnlc_evicts_lock);
arc_dnlc_evicts_thread_exit = TRUE;
/*
* The user evicts thread will set arc_user_evicts_thread_exit
* to FALSE when it is finished exiting; we're waiting for that.
*/
while (arc_dnlc_evicts_thread_exit) {
cv_signal(&arc_dnlc_evicts_cv);
cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
}
mutex_exit(&arc_dnlc_evicts_lock);
arc_dead = B_TRUE;
if (arc_ksp != NULL) {
kstat_delete(arc_ksp);
arc_ksp = NULL;
}
mutex_destroy(&arc_reclaim_lock);
cv_destroy(&arc_reclaim_thread_cv);
cv_destroy(&arc_reclaim_waiters_cv);
mutex_destroy(&arc_dnlc_evicts_lock);
cv_destroy(&arc_dnlc_evicts_cv);
arc_state_fini();
buf_fini();
ASSERT0(arc_loaned_bytes);
}
/*
* Level 2 ARC
*
* The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
* It uses dedicated storage devices to hold cached data, which are populated
* using large infrequent writes. The main role of this cache is to boost
* the performance of random read workloads. The intended L2ARC devices
* include short-stroked disks, solid state disks, and other media with
* substantially faster read latency than disk.
*
* +-----------------------+
* | ARC |
* +-----------------------+
* | ^ ^
* | | |
* l2arc_feed_thread() arc_read()
* | | |
* | l2arc read |
* V | |
* +---------------+ |
* | L2ARC | |
* +---------------+ |
* | ^ |
* l2arc_write() | |
* | | |
* V | |
* +-------+ +-------+
* | vdev | | vdev |
* | cache | | cache |
* +-------+ +-------+
* +=========+ .-----.
* : L2ARC : |-_____-|
* : devices : | Disks |
* +=========+ `-_____-'
*
* Read requests are satisfied from the following sources, in order:
*
* 1) ARC
* 2) vdev cache of L2ARC devices
* 3) L2ARC devices
* 4) vdev cache of disks
* 5) disks
*
* Some L2ARC device types exhibit extremely slow write performance.
* To accommodate for this there are some significant differences between
* the L2ARC and traditional cache design:
*
* 1. There is no eviction path from the ARC to the L2ARC. Evictions from
* the ARC behave as usual, freeing buffers and placing headers on ghost
* lists. The ARC does not send buffers to the L2ARC during eviction as
* this would add inflated write latencies for all ARC memory pressure.
*
* 2. The L2ARC attempts to cache data from the ARC before it is evicted.
* It does this by periodically scanning buffers from the eviction-end of
* the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
* not already there. It scans until a headroom of buffers is satisfied,
* which itself is a buffer for ARC eviction. If a compressible buffer is
* found during scanning and selected for writing to an L2ARC device, we
* temporarily boost scanning headroom during the next scan cycle to make
* sure we adapt to compression effects (which might significantly reduce
* the data volume we write to L2ARC). The thread that does this is
* l2arc_feed_thread(), illustrated below; example sizes are included to
* provide a better sense of ratio than this diagram:
*
* head --> tail
* +---------------------+----------+
* ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
* +---------------------+----------+ | o L2ARC eligible
* ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
* +---------------------+----------+ |
* 15.9 Gbytes ^ 32 Mbytes |
* headroom |
* l2arc_feed_thread()
* |
* l2arc write hand <--[oooo]--'
* | 8 Mbyte
* | write max
* V
* +==============================+
* L2ARC dev |####|#|###|###| |####| ... |
* +==============================+
* 32 Gbytes
*
* 3. If an ARC buffer is copied to the L2ARC but then hit instead of
* evicted, then the L2ARC has cached a buffer much sooner than it probably
* needed to, potentially wasting L2ARC device bandwidth and storage. It is
* safe to say that this is an uncommon case, since buffers at the end of
* the ARC lists have moved there due to inactivity.
*
* 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
* then the L2ARC simply misses copying some buffers. This serves as a
* pressure valve to prevent heavy read workloads from both stalling the ARC
* with waits and clogging the L2ARC with writes. This also helps prevent
* the potential for the L2ARC to churn if it attempts to cache content too
* quickly, such as during backups of the entire pool.
*
* 5. After system boot and before the ARC has filled main memory, there are
* no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
* lists can remain mostly static. Instead of searching from tail of these
* lists as pictured, the l2arc_feed_thread() will search from the list heads
* for eligible buffers, greatly increasing its chance of finding them.
*
* The L2ARC device write speed is also boosted during this time so that
* the L2ARC warms up faster. Since there have been no ARC evictions yet,
* there are no L2ARC reads, and no fear of degrading read performance
* through increased writes.
*
* 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
* the vdev queue can aggregate them into larger and fewer writes. Each
* device is written to in a rotor fashion, sweeping writes through
* available space then repeating.
*
* 7. The L2ARC does not store dirty content. It never needs to flush
* write buffers back to disk based storage.
*
* 8. If an ARC buffer is written (and dirtied) which also exists in the
* L2ARC, the now stale L2ARC buffer is immediately dropped.
*
* The performance of the L2ARC can be tweaked by a number of tunables, which
* may be necessary for different workloads:
*
* l2arc_write_max max write bytes per interval
* l2arc_write_boost extra write bytes during device warmup
* l2arc_noprefetch skip caching prefetched buffers
* l2arc_headroom number of max device writes to precache
* l2arc_headroom_boost when we find compressed buffers during ARC
* scanning, we multiply headroom by this
* percentage factor for the next scan cycle,
* since more compressed buffers are likely to
* be present
* l2arc_feed_secs seconds between L2ARC writing
*
* Tunables may be removed or added as future performance improvements are
* integrated, and also may become zpool properties.
*
* There are three key functions that control how the L2ARC warms up:
*
* l2arc_write_eligible() check if a buffer is eligible to cache
* l2arc_write_size() calculate how much to write
* l2arc_write_interval() calculate sleep delay between writes
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
*/
static boolean_t
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
{
/*
* A buffer is *not* eligible for the L2ARC if it:
* 1. belongs to a different spa.
* 2. is already cached on the L2ARC.
* 3. has an I/O in progress (it may be an incomplete read).
* 4. is flagged not eligible (zfs property).
*/
if (hdr->b_spa != spa_guid) {
ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
return (B_FALSE);
}
if (HDR_HAS_L2HDR(hdr)) {
ARCSTAT_BUMP(arcstat_l2_write_in_l2);
return (B_FALSE);
}
if (HDR_IO_IN_PROGRESS(hdr)) {
ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
return (B_FALSE);
}
if (!HDR_L2CACHE(hdr)) {
ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
return (B_FALSE);
}
return (B_TRUE);
}
static uint64_t
l2arc_write_size(void)
{
uint64_t size;
/*
* Make sure our globals have meaningful values in case the user
* altered them.
*/
size = l2arc_write_max;
if (size == 0) {
cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
"be greater than zero, resetting it to the default (%d)",
L2ARC_WRITE_SIZE);
size = l2arc_write_max = L2ARC_WRITE_SIZE;
}
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
return (size);
}
static clock_t
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
{
clock_t interval, next, now;
/*
* If the ARC lists are busy, increase our write rate; if the
* lists are stale, idle back. This is achieved by checking
* how much we previously wrote - if it was more than half of
* what we wanted, schedule the next write much sooner.
*/
if (l2arc_feed_again && wrote > (wanted / 2))
interval = (hz * l2arc_feed_min_ms) / 1000;
else
interval = hz * l2arc_feed_secs;
now = ddi_get_lbolt();
next = MAX(now, MIN(now + interval, began + interval));
return (next);
}
/*
* Cycle through L2ARC devices. This is how L2ARC load balances.
* If a device is returned, this also returns holding the spa config lock.
*/
static l2arc_dev_t *
l2arc_dev_get_next(void)
{
l2arc_dev_t *first, *next = NULL;
/*
* Lock out the removal of spas (spa_namespace_lock), then removal
* of cache devices (l2arc_dev_mtx). Once a device has been selected,
* both locks will be dropped and a spa config lock held instead.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter(&l2arc_dev_mtx);
/* if there are no vdevs, there is nothing to do */
if (l2arc_ndev == 0)
goto out;
first = NULL;
next = l2arc_dev_last;
do {
/* loop around the list looking for a non-faulted vdev */
if (next == NULL) {
next = list_head(l2arc_dev_list);
} else {
next = list_next(l2arc_dev_list, next);
if (next == NULL)
next = list_head(l2arc_dev_list);
}
/* if we have come back to the start, bail out */
if (first == NULL)
first = next;
else if (next == first)
break;
} while (vdev_is_dead(next->l2ad_vdev));
/* if we were unable to find any usable vdevs, return NULL */
if (vdev_is_dead(next->l2ad_vdev))
next = NULL;
l2arc_dev_last = next;
out:
mutex_exit(&l2arc_dev_mtx);
/*
* Grab the config lock to prevent the 'next' device from being
* removed while we are writing to it.
*/
if (next != NULL)
spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
mutex_exit(&spa_namespace_lock);
return (next);
}
/*
* Free buffers that were tagged for destruction.
*/
static void
l2arc_do_free_on_write()
{
list_t *buflist;
l2arc_data_free_t *df, *df_prev;
mutex_enter(&l2arc_free_on_write_mtx);
buflist = l2arc_free_on_write;
for (df = list_tail(buflist); df; df = df_prev) {
df_prev = list_prev(buflist, df);
ASSERT3P(df->l2df_abd, !=, NULL);
abd_free(df->l2df_abd);
list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t));
}
mutex_exit(&l2arc_free_on_write_mtx);
}
/*
* A write to a cache device has completed. Update all headers to allow
* reads from these buffers to begin.
*/
static void
l2arc_write_done(zio_t *zio)
{
l2arc_write_callback_t *cb;
l2arc_dev_t *dev;
list_t *buflist;
arc_buf_hdr_t *head, *hdr, *hdr_prev;
kmutex_t *hash_lock;
int64_t bytes_dropped = 0;
cb = zio->io_private;
ASSERT3P(cb, !=, NULL);
dev = cb->l2wcb_dev;
ASSERT3P(dev, !=, NULL);
head = cb->l2wcb_head;
ASSERT3P(head, !=, NULL);
buflist = &dev->l2ad_buflist;
ASSERT3P(buflist, !=, NULL);
DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
l2arc_write_callback_t *, cb);
if (zio->io_error != 0)
ARCSTAT_BUMP(arcstat_l2_writes_error);
/*
* All writes completed, or an error was hit.
*/
top:
mutex_enter(&dev->l2ad_mtx);
for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
hash_lock = HDR_LOCK(hdr);
/*
* We cannot use mutex_enter or else we can deadlock
* with l2arc_write_buffers (due to swapping the order
* the hash lock and l2ad_mtx are taken).
*/
if (!mutex_tryenter(hash_lock)) {
/*
* Missed the hash lock. We must retry so we
* don't leave the ARC_FLAG_L2_WRITING bit set.
*/
ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
/*
* We don't want to rescan the headers we've
* already marked as having been written out, so
* we reinsert the head node so we can pick up
* where we left off.
*/
list_remove(buflist, head);
list_insert_after(buflist, hdr, head);
mutex_exit(&dev->l2ad_mtx);
/*
* We wait for the hash lock to become available
* to try and prevent busy waiting, and increase
* the chance we'll be able to acquire the lock
* the next time around.
*/
mutex_enter(hash_lock);
mutex_exit(hash_lock);
goto top;
}
/*
* We could not have been moved into the arc_l2c_only
* state while in-flight due to our ARC_FLAG_L2_WRITING
* bit being set. Let's just ensure that's being enforced.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
list_remove(buflist, hdr);
l2arc_trim(hdr);
arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
bytes_dropped += arc_hdr_size(hdr);
(void) refcount_remove_many(&dev->l2ad_alloc,
arc_hdr_size(hdr), hdr);
}
/*
* Allow ARC to begin reads and ghost list evictions to
* this L2ARC entry.
*/
arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
mutex_exit(hash_lock);
}
atomic_inc_64(&l2arc_writes_done);
list_remove(buflist, head);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
mutex_exit(&dev->l2ad_mtx);
vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
l2arc_do_free_on_write();
kmem_free(cb, sizeof (l2arc_write_callback_t));
}
/*
* A read to a cache device completed. Validate buffer contents before
* handing over to the regular ARC routines.
*/
static void
l2arc_read_done(zio_t *zio)
{
l2arc_read_callback_t *cb;
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
boolean_t valid_cksum;
ASSERT3P(zio->io_vd, !=, NULL);
ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
cb = zio->io_private;
ASSERT3P(cb, !=, NULL);
hdr = cb->l2rcb_hdr;
ASSERT3P(hdr, !=, NULL);
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
/*
* If the data was read into a temporary buffer,
* move it and free the buffer.
*/
if (cb->l2rcb_abd != NULL) {
ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
if (zio->io_error == 0) {
abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd,
arc_hdr_size(hdr));
}
/*
* The following must be done regardless of whether
* there was an error:
* - free the temporary buffer
* - point zio to the real ARC buffer
* - set zio size accordingly
* These are required because zio is either re-used for
* an I/O of the block in the case of the error
* or the zio is passed to arc_read_done() and it
* needs real data.
*/
abd_free(cb->l2rcb_abd);
zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
}
ASSERT3P(zio->io_abd, !=, NULL);
/*
* Check this survived the L2ARC journey.
*/
ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
valid_cksum = arc_cksum_is_equal(hdr, zio);
if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
mutex_exit(hash_lock);
zio->io_private = hdr;
arc_read_done(zio);
} else {
mutex_exit(hash_lock);
/*
* Buffer didn't survive caching. Increment stats and
* reissue to the original storage device.
*/
if (zio->io_error != 0) {
ARCSTAT_BUMP(arcstat_l2_io_error);
} else {
zio->io_error = SET_ERROR(EIO);
}
if (!valid_cksum)
ARCSTAT_BUMP(arcstat_l2_cksum_bad);
/*
* If there's no waiter, issue an async i/o to the primary
* storage now. If there *is* a waiter, the caller must
* issue the i/o in a context where it's OK to block.
*/
if (zio->io_waiter == NULL) {
zio_t *pio = zio_unique_parent(zio);
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
hdr, zio->io_priority, cb->l2rcb_flags,
&cb->l2rcb_zb));
}
}
kmem_free(cb, sizeof (l2arc_read_callback_t));
}
/*
* This is the list priority from which the L2ARC will search for pages to
* cache. This is used within loops (0..3) to cycle through lists in the
* desired order. This order can have a significant effect on cache
* performance.
*
* Currently the metadata lists are hit first, MFU then MRU, followed by
* the data lists. This function returns a locked list, and also returns
* the lock pointer.
*/
static multilist_sublist_t *
l2arc_sublist_lock(int list_num)
{
multilist_t *ml = NULL;
unsigned int idx;
ASSERT(list_num >= 0 && list_num <= 3);
switch (list_num) {
case 0:
ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
break;
case 1:
ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
break;
case 2:
ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
break;
case 3:
ml = arc_mru->arcs_list[ARC_BUFC_DATA];
break;
}
/*
* Return a randomly-selected sublist. This is acceptable
* because the caller feeds only a little bit of data for each
* call (8MB). Subsequent calls will result in different
* sublists being selected.
*/
idx = multilist_get_random_index(ml);
return (multilist_sublist_lock(ml, idx));
}
/*
* Evict buffers from the device write hand to the distance specified in
* bytes. This distance may span populated buffers, it may span nothing.
* This is clearing a region on the L2ARC device ready for writing.
* If the 'all' boolean is set, every buffer is evicted.
*/
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
list_t *buflist;
arc_buf_hdr_t *hdr, *hdr_prev;
kmutex_t *hash_lock;
uint64_t taddr;
buflist = &dev->l2ad_buflist;
if (!all && dev->l2ad_first) {
/*
* This is the first sweep through the device. There is
* nothing to evict.
*/
return;
}
if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
/*
* When nearing the end of the device, evict to the end
* before the device write hand jumps to the start.
*/
taddr = dev->l2ad_end;
} else {
taddr = dev->l2ad_hand + distance;
}
DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
uint64_t, taddr, boolean_t, all);
top:
mutex_enter(&dev->l2ad_mtx);
for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
hash_lock = HDR_LOCK(hdr);
/*
* We cannot use mutex_enter or else we can deadlock
* with l2arc_write_buffers (due to swapping the order
* the hash lock and l2ad_mtx are taken).
*/
if (!mutex_tryenter(hash_lock)) {
/*
* Missed the hash lock. Retry.
*/
ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
mutex_exit(&dev->l2ad_mtx);
mutex_enter(hash_lock);
mutex_exit(hash_lock);
goto top;
}
/*
* A header can't be on this list if it doesn't have L2 header.
*/
ASSERT(HDR_HAS_L2HDR(hdr));
/* Ensure this header has finished being written. */
ASSERT(!HDR_L2_WRITING(hdr));
ASSERT(!HDR_L2_WRITE_HEAD(hdr));
if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
/*
* We've evicted to the target address,
* or the end of the device.
*/
mutex_exit(hash_lock);
break;
}
if (!HDR_HAS_L1HDR(hdr)) {
ASSERT(!HDR_L2_READING(hdr));
/*
* This doesn't exist in the ARC. Destroy.
* arc_hdr_destroy() will call list_remove()
* and decrement arcstat_l2_lsize.
*/
arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr);
} else {
ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
/*
* Invalidate issued or about to be issued
* reads, since we may be about to write
* over this location.
*/
if (HDR_L2_READING(hdr)) {
ARCSTAT_BUMP(arcstat_l2_evict_reading);
arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
}
arc_hdr_l2hdr_destroy(hdr);
}
mutex_exit(hash_lock);
}
mutex_exit(&dev->l2ad_mtx);
}
/*
* Find and write ARC buffers to the L2ARC device.
*
* An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
* for reading until they have completed writing.
* The headroom_boost is an in-out parameter used to maintain headroom boost
* state between calls to this function.
*
* Returns the number of bytes actually written (which may be smaller than
* the delta by which the device hand has changed due to alignment).
*/
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
arc_buf_hdr_t *hdr, *hdr_prev, *head;
uint64_t write_asize, write_psize, write_lsize, headroom;
boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
int try;
ASSERT3P(dev->l2ad_vdev, !=, NULL);
pio = NULL;
write_lsize = write_asize = write_psize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
/*
* Copy buffers for L2ARC writing.
*/
for (try = 0; try <= 3; try++) {
multilist_sublist_t *mls = l2arc_sublist_lock(try);
uint64_t passed_sz = 0;
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
/*
* L2ARC fast warmup.
*
* Until the ARC is warm and starts to evict, read from the
* head of the ARC lists rather than the tail.
*/
if (arc_warm == B_FALSE)
hdr = multilist_sublist_head(mls);
else
hdr = multilist_sublist_tail(mls);
if (hdr == NULL)
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
headroom = target_sz * l2arc_headroom;
if (zfs_compressed_arc_enabled)
headroom = (headroom * l2arc_headroom_boost) / 100;
for (; hdr; hdr = hdr_prev) {
kmutex_t *hash_lock;
if (arc_warm == B_FALSE)
hdr_prev = multilist_sublist_next(mls, hdr);
else
hdr_prev = multilist_sublist_prev(mls, hdr);
ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
HDR_GET_LSIZE(hdr));
hash_lock = HDR_LOCK(hdr);
if (!mutex_tryenter(hash_lock)) {
ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
/*
* Skip this buffer rather than waiting.
*/
continue;
}
passed_sz += HDR_GET_LSIZE(hdr);
if (passed_sz > headroom) {
/*
* Searched too far.
*/
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
break;
}
if (!l2arc_write_eligible(guid, hdr)) {
mutex_exit(hash_lock);
continue;
}
/*
* We rely on the L1 portion of the header below, so
* it's invalid for this header to have been evicted out
* of the ghost cache, prior to being written out. The
* ARC_FLAG_L2_WRITING bit ensures this won't happen.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3U(arc_hdr_size(hdr), >, 0);
uint64_t psize = arc_hdr_size(hdr);
uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
psize);
if ((write_asize + asize) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_full);
break;
}
if (pio == NULL) {
/*
* Insert a dummy header on the buflist so
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, head);
mutex_exit(&dev->l2ad_mtx);
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
ARCSTAT_BUMP(arcstat_l2_write_pios);
}
hdr->b_l2hdr.b_dev = dev;
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
arc_hdr_set_flags(hdr,
ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, hdr);
mutex_exit(&dev->l2ad_mtx);
(void) refcount_add_many(&dev->l2ad_alloc, psize, hdr);
/*
* Normally the L2ARC can use the hdr's data, but if
* we're sharing data between the hdr and one of its
* bufs, L2ARC needs its own copy of the data so that
* the ZIO below can't race with the buf consumer.
* Another case where we need to create a copy of the
* data is when the buffer size is not device-aligned
* and we need to pad the block to make it such.
* That also keeps the clock hand suitably aligned.
*
* To ensure that the copy will be available for the
* lifetime of the ZIO and be cleaned up afterwards, we
* add it to the l2arc_free_on_write queue.
*/
abd_t *to_write;
if (!HDR_SHARED_DATA(hdr) && psize == asize) {
to_write = hdr->b_l1hdr.b_pabd;
} else {
to_write = abd_alloc_for_io(asize,
HDR_ISTYPE_METADATA(hdr));
abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
if (asize != psize) {
abd_zero_off(to_write, psize,
asize - psize);
}
l2arc_free_abd_on_write(to_write, asize,
arc_buf_type(hdr));
}
wzio = zio_write_phys(pio, dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, asize, to_write,
ZIO_CHECKSUM_OFF, NULL, hdr,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_CANFAIL, B_FALSE);
write_lsize += HDR_GET_LSIZE(hdr);
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
zio_t *, wzio);
write_psize += psize;
write_asize += asize;
dev->l2ad_hand += asize;
mutex_exit(hash_lock);
(void) zio_nowait(wzio);
}
multilist_sublist_unlock(mls);
if (full == B_TRUE)
break;
}
/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_lsize);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
return (0);
}
ASSERT3U(write_psize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
ARCSTAT_INCR(arcstat_l2_psize, write_psize);
vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
}
dev->l2ad_writing = B_TRUE;
(void) zio_wait(pio);
dev->l2ad_writing = B_FALSE;
return (write_asize);
}
/*
* This thread feeds the L2ARC at regular intervals. This is the beating
* heart of the L2ARC.
*/
/* ARGSUSED */
static void
l2arc_feed_thread(void *unused __unused)
{
callb_cpr_t cpr;
l2arc_dev_t *dev;
spa_t *spa;
uint64_t size, wrote;
clock_t begin, next = ddi_get_lbolt();
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
mutex_enter(&l2arc_feed_thr_lock);
while (l2arc_thread_exit == 0) {
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
next - ddi_get_lbolt());
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
next = ddi_get_lbolt() + hz;
/*
* Quick check for L2ARC devices.
*/
mutex_enter(&l2arc_dev_mtx);
if (l2arc_ndev == 0) {
mutex_exit(&l2arc_dev_mtx);
continue;
}
mutex_exit(&l2arc_dev_mtx);
begin = ddi_get_lbolt();
/*
* This selects the next l2arc device to write to, and in
* doing so the next spa to feed from: dev->l2ad_spa. This
* will return NULL if there are now no l2arc devices or if
* they are all faulted.
*
* If a device is returned, its spa's config lock is also
* held to prevent device removal. l2arc_dev_get_next()
* will grab and release l2arc_dev_mtx.
*/
if ((dev = l2arc_dev_get_next()) == NULL)
continue;
spa = dev->l2ad_spa;
ASSERT3P(spa, !=, NULL);
/*
* If the pool is read-only then force the feed thread to
* sleep a little longer.
*/
if (!spa_writeable(spa)) {
next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
spa_config_exit(spa, SCL_L2ARC, dev);
continue;
}
/*
* Avoid contributing to memory pressure.
*/
if (arc_reclaim_needed()) {
ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
spa_config_exit(spa, SCL_L2ARC, dev);
continue;
}
ARCSTAT_BUMP(arcstat_l2_feeds);
size = l2arc_write_size();
/*
* Evict L2ARC buffers that will be overwritten.
*/
l2arc_evict(dev, size, B_FALSE);
/*
* Write ARC buffers.
*/
wrote = l2arc_write_buffers(spa, dev, size);
/*
* Calculate interval between writes.
*/
next = l2arc_write_interval(begin, size, wrote);
spa_config_exit(spa, SCL_L2ARC, dev);
}
l2arc_thread_exit = 0;
cv_broadcast(&l2arc_feed_thr_cv);
CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
thread_exit();
}
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
l2arc_dev_t *dev;
mutex_enter(&l2arc_dev_mtx);
for (dev = list_head(l2arc_dev_list); dev != NULL;
dev = list_next(l2arc_dev_list, dev)) {
if (dev->l2ad_vdev == vd)
break;
}
mutex_exit(&l2arc_dev_mtx);
return (dev != NULL);
}
/*
* Add a vdev for use by the L2ARC. By this point the spa has already
* validated the vdev and opened it.
*/
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
l2arc_dev_t *adddev;
ASSERT(!l2arc_vdev_present(vd));
vdev_ashift_optimize(vd);
/*
* Create a new l2arc device entry.
*/
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
adddev->l2ad_start = VDEV_LABEL_START_SIZE;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
/*
* This is a list of all ARC buffers that are still valid on the
* device.
*/
list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
refcount_create(&adddev->l2ad_alloc);
/*
* Add device to global list
*/
mutex_enter(&l2arc_dev_mtx);
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
mutex_exit(&l2arc_dev_mtx);
}
/*
* Remove a vdev from the L2ARC.
*/
void
l2arc_remove_vdev(vdev_t *vd)
{
l2arc_dev_t *dev, *nextdev, *remdev = NULL;
/*
* Find the device by vdev
*/
mutex_enter(&l2arc_dev_mtx);
for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
nextdev = list_next(l2arc_dev_list, dev);
if (vd == dev->l2ad_vdev) {
remdev = dev;
break;
}
}
ASSERT3P(remdev, !=, NULL);
/*
* Remove device from global list
*/
list_remove(l2arc_dev_list, remdev);
l2arc_dev_last = NULL; /* may have been invalidated */
atomic_dec_64(&l2arc_ndev);
mutex_exit(&l2arc_dev_mtx);
/*
* Clear all buflists and ARC references. L2ARC device flush.
*/
l2arc_evict(remdev, 0, B_TRUE);
list_destroy(&remdev->l2ad_buflist);
mutex_destroy(&remdev->l2ad_mtx);
refcount_destroy(&remdev->l2ad_alloc);
kmem_free(remdev, sizeof (l2arc_dev_t));
}
void
l2arc_init(void)
{
l2arc_thread_exit = 0;
l2arc_ndev = 0;
l2arc_writes_sent = 0;
l2arc_writes_done = 0;
mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
l2arc_dev_list = &L2ARC_dev_list;
l2arc_free_on_write = &L2ARC_free_on_write;
list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
offsetof(l2arc_dev_t, l2ad_node));
list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
offsetof(l2arc_data_free_t, l2df_list_node));
}
void
l2arc_fini(void)
{
/*
* This is called from dmu_fini(), which is called from spa_fini();
* Because of this, we can assume that all l2arc devices have
* already been removed when the pools themselves were removed.
*/
l2arc_do_free_on_write();
mutex_destroy(&l2arc_feed_thr_lock);
cv_destroy(&l2arc_feed_thr_cv);
mutex_destroy(&l2arc_dev_mtx);
mutex_destroy(&l2arc_free_on_write_mtx);
list_destroy(l2arc_dev_list);
list_destroy(l2arc_free_on_write);
}
void
l2arc_start(void)
{
if (!(spa_mode_global & FWRITE))
return;
(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
}
void
l2arc_stop(void)
{
if (!(spa_mode_global & FWRITE))
return;
mutex_enter(&l2arc_feed_thr_lock);
cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
l2arc_thread_exit = 1;
while (l2arc_thread_exit != 0)
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c (revision 332525)
@@ -1,596 +1,606 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2017 Datto Inc.
*/
#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>
/*
* Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
*/
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
spa_t *spa = dmu_objset_spa(os);
dsl_pool_t *dp = dmu_objset_pool(os);
if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
ASSERT0(dp->dp_empty_bpobj);
dp->dp_empty_bpobj =
bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(os,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
&dp->dp_empty_bpobj, tx) == 0);
}
spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
ASSERT(dp->dp_empty_bpobj != 0);
return (dp->dp_empty_bpobj);
} else {
return (bpobj_alloc(os, blocksize, tx));
}
}
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
dsl_pool_t *dp = dmu_objset_pool(os);
spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
if (!spa_feature_is_active(dmu_objset_spa(os),
SPA_FEATURE_EMPTY_BPOBJ)) {
VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, tx));
VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
dp->dp_empty_bpobj = 0;
}
}
uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
int size;
if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
size = BPOBJ_SIZE_V0;
else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
size = BPOBJ_SIZE_V1;
else
size = sizeof (bpobj_phys_t);
return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
DMU_OT_BPOBJ_HDR, size, tx));
}
void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
int64_t i;
bpobj_t bpo;
dmu_object_info_t doi;
int epb;
dmu_buf_t *dbuf = NULL;
ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
mutex_enter(&bpo.bpo_lock);
if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
goto out;
VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
epb = doi.doi_data_block_size / sizeof (uint64_t);
for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
uint64_t *objarray;
uint64_t offset, blkoff;
offset = i * sizeof (uint64_t);
blkoff = P2PHASE(i, epb);
if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
VERIFY3U(0, ==, dmu_buf_hold(os,
bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
}
ASSERT3U(offset, >=, dbuf->db_offset);
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
objarray = dbuf->db_data;
bpobj_free(os, objarray[blkoff], tx);
}
if (dbuf) {
dmu_buf_rele(dbuf, FTAG);
dbuf = NULL;
}
VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
out:
mutex_exit(&bpo.bpo_lock);
bpobj_close(&bpo);
VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}
int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
dmu_object_info_t doi;
int err;
err = dmu_object_info(os, object, &doi);
if (err)
return (err);
bzero(bpo, sizeof (*bpo));
mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
ASSERT(bpo->bpo_dbuf == NULL);
ASSERT(bpo->bpo_phys == NULL);
ASSERT(object != 0);
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
if (err)
return (err);
bpo->bpo_os = os;
bpo->bpo_object = object;
bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
bpo->bpo_phys = bpo->bpo_dbuf->db_data;
return (0);
}
+boolean_t
+bpobj_is_open(const bpobj_t *bpo)
+{
+ return (bpo->bpo_object != 0);
+}
+
void
bpobj_close(bpobj_t *bpo)
{
/* Lame workaround for closing a bpobj that was never opened. */
if (bpo->bpo_object == 0)
return;
dmu_buf_rele(bpo->bpo_dbuf, bpo);
if (bpo->bpo_cached_dbuf != NULL)
dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
bpo->bpo_dbuf = NULL;
bpo->bpo_phys = NULL;
bpo->bpo_cached_dbuf = NULL;
bpo->bpo_object = 0;
mutex_destroy(&bpo->bpo_lock);
}
-static boolean_t
-bpobj_hasentries(bpobj_t *bpo)
+boolean_t
+bpobj_is_empty(bpobj_t *bpo)
{
- return (bpo->bpo_phys->bpo_num_blkptrs != 0 ||
- (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
+ return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
+ (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}
static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
boolean_t free)
{
dmu_object_info_t doi;
int epb;
int64_t i;
int err = 0;
dmu_buf_t *dbuf = NULL;
+ ASSERT(bpobj_is_open(bpo));
mutex_enter(&bpo->bpo_lock);
- if (!bpobj_hasentries(bpo))
- goto out;
-
if (free)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
blkptr_t *bparray;
blkptr_t *bp;
uint64_t offset, blkoff;
offset = i * sizeof (blkptr_t);
blkoff = P2PHASE(i, bpo->bpo_epb);
if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
FTAG, &dbuf, 0);
if (err)
break;
}
ASSERT3U(offset, >=, dbuf->db_offset);
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
bparray = dbuf->db_data;
bp = &bparray[blkoff];
err = func(arg, bp, tx);
if (err)
break;
if (free) {
bpo->bpo_phys->bpo_bytes -=
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
if (bpo->bpo_havecomp) {
bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
}
bpo->bpo_phys->bpo_num_blkptrs--;
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
}
}
if (dbuf) {
dmu_buf_rele(dbuf, FTAG);
dbuf = NULL;
}
if (free) {
VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
(i + 1) * sizeof (blkptr_t), -1ULL, tx));
}
if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
goto out;
ASSERT(bpo->bpo_havecomp);
err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
if (err) {
mutex_exit(&bpo->bpo_lock);
return (err);
}
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
epb = doi.doi_data_block_size / sizeof (uint64_t);
for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
uint64_t *objarray;
uint64_t offset, blkoff;
bpobj_t sublist;
uint64_t used_before, comp_before, uncomp_before;
uint64_t used_after, comp_after, uncomp_after;
offset = i * sizeof (uint64_t);
blkoff = P2PHASE(i, epb);
if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
err = dmu_buf_hold(bpo->bpo_os,
bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
if (err)
break;
}
ASSERT3U(offset, >=, dbuf->db_offset);
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
objarray = dbuf->db_data;
err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
if (err)
break;
if (free) {
err = bpobj_space(&sublist,
&used_before, &comp_before, &uncomp_before);
if (err != 0) {
bpobj_close(&sublist);
break;
}
}
err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
if (free) {
VERIFY3U(0, ==, bpobj_space(&sublist,
&used_after, &comp_after, &uncomp_after));
bpo->bpo_phys->bpo_bytes -= used_before - used_after;
ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
bpo->bpo_phys->bpo_uncomp -=
uncomp_before - uncomp_after;
}
bpobj_close(&sublist);
if (err)
break;
if (free) {
err = dmu_object_free(bpo->bpo_os,
objarray[blkoff], tx);
if (err)
break;
bpo->bpo_phys->bpo_num_subobjs--;
ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
}
}
if (dbuf) {
dmu_buf_rele(dbuf, FTAG);
dbuf = NULL;
}
if (free) {
VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
bpo->bpo_phys->bpo_subobjs,
(i + 1) * sizeof (uint64_t), -1ULL, tx));
}
out:
/* If there are no entries, there should be no bytes. */
- if (!bpobj_hasentries(bpo)) {
+ if (bpobj_is_empty(bpo)) {
ASSERT0(bpo->bpo_phys->bpo_bytes);
ASSERT0(bpo->bpo_phys->bpo_comp);
ASSERT0(bpo->bpo_phys->bpo_uncomp);
}
mutex_exit(&bpo->bpo_lock);
return (err);
}
/*
* Iterate and remove the entries. If func returns nonzero, iteration
* will stop and that entry will not be removed.
*/
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
}
/*
* Iterate the entries. If func returns nonzero, iteration will stop.
*/
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
}
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
bpobj_t subbpo;
uint64_t used, comp, uncomp, subsubobjs;
+ ASSERT(bpobj_is_open(bpo));
+ ASSERT(subobj != 0);
ASSERT(bpo->bpo_havesubobj);
ASSERT(bpo->bpo_havecomp);
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
bpobj_decr_empty(bpo->bpo_os, tx);
return;
}
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
- if (!bpobj_hasentries(&subbpo)) {
+ if (bpobj_is_empty(&subbpo)) {
/* No point in having an empty subobj. */
bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx);
return;
}
mutex_enter(&bpo->bpo_lock);
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpo->bpo_phys->bpo_subobjs == 0) {
bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
DMU_OT_NONE, 0, tx);
}
dmu_object_info_t doi;
ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
sizeof (subobj), &subobj, tx);
bpo->bpo_phys->bpo_num_subobjs++;
/*
* If subobj has only one block of subobjs, then move subobj's
* subobjs to bpo's subobj list directly. This reduces
* recursion in bpobj_iterate due to nested subobjs.
*/
subsubobjs = subbpo.bpo_phys->bpo_subobjs;
if (subsubobjs != 0) {
dmu_object_info_t doi;
VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
if (doi.doi_max_offset == doi.doi_data_block_size) {
dmu_buf_t *subdb;
uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
0, FTAG, &subdb, 0));
/*
* Make sure that we are not asking dmu_write()
* to write more data than we have in our buffer.
*/
VERIFY3U(subdb->db_size, >=,
numsubsub * sizeof (subobj));
dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
numsubsub * sizeof (subobj), subdb->db_data, tx);
dmu_buf_rele(subdb, FTAG);
bpo->bpo_phys->bpo_num_subobjs += numsubsub;
dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
subbpo.bpo_phys->bpo_subobjs = 0;
VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
subsubobjs, tx));
}
}
bpo->bpo_phys->bpo_bytes += used;
bpo->bpo_phys->bpo_comp += comp;
bpo->bpo_phys->bpo_uncomp += uncomp;
mutex_exit(&bpo->bpo_lock);
bpobj_close(&subbpo);
}
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
{
blkptr_t stored_bp = *bp;
uint64_t offset;
int blkoff;
blkptr_t *bparray;
+ ASSERT(bpobj_is_open(bpo));
ASSERT(!BP_IS_HOLE(bp));
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
if (BP_IS_EMBEDDED(bp)) {
/*
* The bpobj will compress better without the payload.
*
* Note that we store EMBEDDED bp's because they have an
* uncompressed size, which must be accounted for. An
* alternative would be to add their size to bpo_uncomp
* without storing the bp, but that would create additional
* complications: bpo_uncomp would be inconsistent with the
* set of BP's stored, and bpobj_iterate() wouldn't visit
* all the space accounted for in the bpobj.
*/
bzero(&stored_bp, sizeof (stored_bp));
stored_bp.blk_prop = bp->blk_prop;
stored_bp.blk_birth = bp->blk_birth;
} else if (!BP_GET_DEDUP(bp)) {
/* The bpobj will compress better without the checksum */
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
}
/* We never need the fill count. */
stored_bp.blk_fill = 0;
mutex_enter(&bpo->bpo_lock);
offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
if (bpo->bpo_cached_dbuf == NULL ||
offset < bpo->bpo_cached_dbuf->db_offset ||
offset >= bpo->bpo_cached_dbuf->db_offset +
bpo->bpo_cached_dbuf->db_size) {
if (bpo->bpo_cached_dbuf)
dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
offset, bpo, &bpo->bpo_cached_dbuf, 0));
}
dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
bparray = bpo->bpo_cached_dbuf->db_data;
bparray[blkoff] = stored_bp;
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
bpo->bpo_phys->bpo_num_blkptrs++;
bpo->bpo_phys->bpo_bytes +=
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
if (bpo->bpo_havecomp) {
bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
}
mutex_exit(&bpo->bpo_lock);
}
struct space_range_arg {
spa_t *spa;
uint64_t mintxg;
uint64_t maxtxg;
uint64_t used;
uint64_t comp;
uint64_t uncomp;
};
/* ARGSUSED */
static int
space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
struct space_range_arg *sra = arg;
if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
sra->used += bp_get_dsize_sync(sra->spa, bp);
else
sra->used += bp_get_dsize(sra->spa, bp);
sra->comp += BP_GET_PSIZE(bp);
sra->uncomp += BP_GET_UCSIZE(bp);
}
return (0);
}
int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
+ ASSERT(bpobj_is_open(bpo));
mutex_enter(&bpo->bpo_lock);
*usedp = bpo->bpo_phys->bpo_bytes;
if (bpo->bpo_havecomp) {
*compp = bpo->bpo_phys->bpo_comp;
*uncompp = bpo->bpo_phys->bpo_uncomp;
mutex_exit(&bpo->bpo_lock);
return (0);
} else {
mutex_exit(&bpo->bpo_lock);
return (bpobj_space_range(bpo, 0, UINT64_MAX,
usedp, compp, uncompp));
}
}
/*
* Return the amount of space in the bpobj which is:
* mintxg < blk_birth <= maxtxg
*/
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
struct space_range_arg sra = { 0 };
int err;
+
+ ASSERT(bpobj_is_open(bpo));
/*
* As an optimization, if they want the whole txg range, just
* get bpo_bytes rather than iterating over the bps.
*/
if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
return (bpobj_space(bpo, usedp, compp, uncompp));
sra.spa = dmu_objset_spa(bpo->bpo_os);
sra.mintxg = mintxg;
sra.maxtxg = maxtxg;
err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
*usedp = sra.used;
*compp = sra.comp;
*uncompp = sra.uncomp;
return (err);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 332525)
@@ -1,3623 +1,3761 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>
#include <sys/callb.h>
#include <sys/abd.h>
+#include <sys/vdev.h>
uint_t zfs_dbuf_evict_key;
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
#ifndef __lint
extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
dmu_buf_evict_func_t *evict_func_sync,
dmu_buf_evict_func_t *evict_func_async,
dmu_buf_t **clear_on_evict_dbufp);
#endif /* ! __lint */
/*
* Global data structures and functions for the dbuf cache.
*/
static kmem_cache_t *dbuf_kmem_cache;
static taskq_t *dbu_evict_taskq;
static kthread_t *dbuf_cache_evict_thread;
static kmutex_t dbuf_evict_lock;
static kcondvar_t dbuf_evict_cv;
static boolean_t dbuf_evict_thread_exit;
/*
* LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
* are not currently held but have been recently released. These dbufs
* are not eligible for arc eviction until they are aged out of the cache.
* Dbufs are added to the dbuf cache once the last hold is released. If a
* dbuf is later accessed and still exists in the dbuf cache, then it will
* be removed from the cache and later re-added to the head of the cache.
* Dbufs that are aged out of the cache will be immediately destroyed and
* become eligible for arc eviction.
*/
static multilist_t *dbuf_cache;
static refcount_t dbuf_cache_size;
uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;
/* Cap the size of the dbuf cache to log2 fraction of arc size. */
int dbuf_cache_max_shift = 5;
/*
* The dbuf cache uses a three-stage eviction policy:
* - A low water marker designates when the dbuf eviction thread
* should stop evicting from the dbuf cache.
* - When we reach the maximum size (aka mid water mark), we
* signal the eviction thread to run.
* - The high water mark indicates when the eviction thread
* is unable to keep up with the incoming load and eviction must
* happen in the context of the calling thread.
*
* The dbuf cache:
* (max size)
* low water mid water hi water
* +----------------------------------------+----------+----------+
* | | | |
* | | | |
* | | | |
* | | | |
* +----------------------------------------+----------+----------+
* stop signal evict
* evicting eviction directly
* thread
*
* The high and low water marks indicate the operating range for the eviction
* thread. The low water mark is, by default, 90% of the total size of the
* cache and the high water mark is at 110% (both of these percentages can be
* changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
* respectively). The eviction thread will try to ensure that the cache remains
* within this range by waking up every second and checking if the cache is
* above the low water mark. The thread can also be woken up by callers adding
* elements into the cache if the cache is larger than the mid water (i.e max
* cache size). Once the eviction thread is woken up and eviction is required,
* it will continue evicting buffers until it's able to reduce the cache size
* to the low water mark. If the cache size continues to grow and hits the high
* water mark, then callers adding elments to the cache will begin to evict
* directly from the cache until the cache is no longer above the high water
* mark.
*/
/*
* The percentage above and below the maximum cache size.
*/
uint_t dbuf_cache_hiwater_pct = 10;
uint_t dbuf_cache_lowater_pct = 10;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN,
&dbuf_cache_max_bytes, 0, "dbuf cache size in bytes");
SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_max_shift, CTLFLAG_RDTUN,
&dbuf_cache_max_shift, 0, "dbuf size as log2 fraction of ARC");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN,
&dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN,
&dbuf_cache_lowater_pct, 0, "max percents below the dbuf cache size");
/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
dmu_buf_impl_t *db = vdb;
bzero(db, sizeof (dmu_buf_impl_t));
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
multilist_link_init(&db->db_cache_link);
refcount_create(&db->db_holds);
return (0);
}
/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
dmu_buf_impl_t *db = vdb;
mutex_destroy(&db->db_mtx);
cv_destroy(&db->db_changed);
ASSERT(!multilist_link_active(&db->db_cache_link));
refcount_destroy(&db->db_holds);
}
/*
* dbuf hash table routines
*/
static dbuf_hash_table_t dbuf_hash_table;
static uint64_t dbuf_hash_count;
static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
uintptr_t osv = (uintptr_t)os;
uint64_t crc = -1ULL;
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
return (crc);
}
#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
((dbuf)->db.db_object == (obj) && \
(dbuf)->db_objset == (os) && \
(dbuf)->db_level == (level) && \
(dbuf)->db_blkid == (blkid))
dmu_buf_impl_t *
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
uint64_t hv = dbuf_hash(os, obj, level, blkid);
uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *db;
mutex_enter(DBUF_HASH_MUTEX(h, idx));
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
mutex_enter(&db->db_mtx);
if (db->db_state != DB_EVICTING) {
mutex_exit(DBUF_HASH_MUTEX(h, idx));
return (db);
}
mutex_exit(&db->db_mtx);
}
}
mutex_exit(DBUF_HASH_MUTEX(h, idx));
return (NULL);
}
static dmu_buf_impl_t *
dbuf_find_bonus(objset_t *os, uint64_t object)
{
dnode_t *dn;
dmu_buf_impl_t *db = NULL;
if (dnode_hold(os, object, FTAG, &dn) == 0) {
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus != NULL) {
db = dn->dn_bonus;
mutex_enter(&db->db_mtx);
}
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
}
return (db);
}
/*
* Insert an entry into the hash table. If there is already an element
* equal to elem in the hash table, then the already existing element
* will be returned and the new element will not be inserted.
* Otherwise returns NULL.
*/
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
objset_t *os = db->db_objset;
uint64_t obj = db->db.db_object;
int level = db->db_level;
uint64_t blkid = db->db_blkid;
uint64_t hv = dbuf_hash(os, obj, level, blkid);
uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *dbf;
mutex_enter(DBUF_HASH_MUTEX(h, idx));
for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
mutex_enter(&dbf->db_mtx);
if (dbf->db_state != DB_EVICTING) {
mutex_exit(DBUF_HASH_MUTEX(h, idx));
return (dbf);
}
mutex_exit(&dbf->db_mtx);
}
}
mutex_enter(&db->db_mtx);
db->db_hash_next = h->hash_table[idx];
h->hash_table[idx] = db;
mutex_exit(DBUF_HASH_MUTEX(h, idx));
atomic_inc_64(&dbuf_hash_count);
return (NULL);
}
/*
* Remove an entry from the hash table. It must be in the EVICTING state.
*/
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
db->db_level, db->db_blkid);
uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *dbf, **dbp;
/*
* We musn't hold db_mtx to maintain lock ordering:
* DBUF_HASH_MUTEX > db_mtx.
*/
ASSERT(refcount_is_zero(&db->db_holds));
ASSERT(db->db_state == DB_EVICTING);
ASSERT(!MUTEX_HELD(&db->db_mtx));
mutex_enter(DBUF_HASH_MUTEX(h, idx));
dbp = &h->hash_table[idx];
while ((dbf = *dbp) != db) {
dbp = &dbf->db_hash_next;
ASSERT(dbf != NULL);
}
*dbp = db->db_hash_next;
db->db_hash_next = NULL;
mutex_exit(DBUF_HASH_MUTEX(h, idx));
atomic_dec_64(&dbuf_hash_count);
}
typedef enum {
DBVU_EVICTING,
DBVU_NOT_EVICTING
} dbvu_verify_type_t;
static void
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
{
#ifdef ZFS_DEBUG
int64_t holds;
if (db->db_user == NULL)
return;
/* Only data blocks support the attachment of user data. */
ASSERT(db->db_level == 0);
/* Clients must resolve a dbuf before attaching user data. */
ASSERT(db->db.db_data != NULL);
ASSERT3U(db->db_state, ==, DB_CACHED);
holds = refcount_count(&db->db_holds);
if (verify_type == DBVU_EVICTING) {
/*
* Immediate eviction occurs when holds == dirtycnt.
* For normal eviction buffers, holds is zero on
* eviction, except when dbuf_fix_old_data() calls
* dbuf_clear_data(). However, the hold count can grow
* during eviction even though db_mtx is held (see
* dmu_bonus_hold() for an example), so we can only
* test the generic invariant that holds >= dirtycnt.
*/
ASSERT3U(holds, >=, db->db_dirtycnt);
} else {
if (db->db_user_immediate_evict == TRUE)
ASSERT3U(holds, >=, db->db_dirtycnt);
else
ASSERT3U(holds, >, 0);
}
#endif
}
static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
dmu_buf_user_t *dbu = db->db_user;
ASSERT(MUTEX_HELD(&db->db_mtx));
if (dbu == NULL)
return;
dbuf_verify_user(db, DBVU_EVICTING);
db->db_user = NULL;
#ifdef ZFS_DEBUG
if (dbu->dbu_clear_on_evict_dbufp != NULL)
*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif
/*
* There are two eviction callbacks - one that we call synchronously
* and one that we invoke via a taskq. The async one is useful for
* avoiding lock order reversals and limiting stack depth.
*
* Note that if we have a sync callback but no async callback,
* it's likely that the sync callback will free the structure
* containing the dbu. In that case we need to take care to not
* dereference dbu after calling the sync evict func.
*/
boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
if (dbu->dbu_evict_func_sync != NULL)
dbu->dbu_evict_func_sync(dbu);
if (has_async) {
taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
dbu, 0, &dbu->dbu_tqent);
}
}
boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
if (db->db_level > 0) {
return (B_TRUE);
} else {
boolean_t is_metadata;
DB_DNODE_ENTER(db);
is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
DB_DNODE_EXIT(db);
return (is_metadata);
}
}
/*
* This function *must* return indices evenly distributed between all
* sublists of the multilist. This is needed due to how the dbuf eviction
* code is laid out; dbuf_evict_thread() assumes dbufs are evenly
* distributed between all sublists and uses this assumption when
* deciding which sublist to evict from and how much to evict from it.
*/
unsigned int
dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
{
dmu_buf_impl_t *db = obj;
/*
* The assumption here, is the hash value for a given
* dmu_buf_impl_t will remain constant throughout it's lifetime
* (i.e. it's objset, object, level and blkid fields don't change).
* Thus, we don't need to store the dbuf's sublist index
* on insertion, as this index can be recalculated on removal.
*
* Also, the low order bits of the hash value are thought to be
* distributed evenly. Otherwise, in the case that the multilist
* has a power of two number of sublists, each sublists' usage
* would not be evenly distributed.
*/
return (dbuf_hash(db->db_objset, db->db.db_object,
db->db_level, db->db_blkid) %
multilist_get_num_sublists(ml));
}
static inline boolean_t
dbuf_cache_above_hiwater(void)
{
uint64_t dbuf_cache_hiwater_bytes =
(dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
return (refcount_count(&dbuf_cache_size) >
dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
}
static inline boolean_t
dbuf_cache_above_lowater(void)
{
uint64_t dbuf_cache_lowater_bytes =
(dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
return (refcount_count(&dbuf_cache_size) >
dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
}
/*
* Evict the oldest eligible dbuf from the dbuf cache.
*/
static void
dbuf_evict_one(void)
{
int idx = multilist_get_random_index(dbuf_cache);
multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
/*
* Set the thread's tsd to indicate that it's processing evictions.
* Once a thread stops evicting from the dbuf cache it will
* reset its tsd to NULL.
*/
ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
dmu_buf_impl_t *db = multilist_sublist_tail(mls);
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
db = multilist_sublist_prev(mls, db);
}
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
multilist_sublist_t *, mls);
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
(void) refcount_remove_many(&dbuf_cache_size,
db->db.db_size, db);
dbuf_destroy(db);
} else {
multilist_sublist_unlock(mls);
}
(void) tsd_set(zfs_dbuf_evict_key, NULL);
}
/*
* The dbuf evict thread is responsible for aging out dbufs from the
* cache. Once the cache has reached it's maximum size, dbufs are removed
* and destroyed. The eviction thread will continue running until the size
* of the dbuf cache is at or below the maximum size. Once the dbuf is aged
* out of the cache it is destroyed and becomes eligible for arc eviction.
*/
/* ARGSUSED */
static void
dbuf_evict_thread(void *unused __unused)
{
callb_cpr_t cpr;
CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
mutex_enter(&dbuf_evict_lock);
while (!dbuf_evict_thread_exit) {
while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait_hires(&dbuf_evict_cv,
&dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
}
mutex_exit(&dbuf_evict_lock);
/*
* Keep evicting as long as we're above the low water mark
* for the cache. We do this without holding the locks to
* minimize lock contention.
*/
while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
dbuf_evict_one();
}
mutex_enter(&dbuf_evict_lock);
}
dbuf_evict_thread_exit = B_FALSE;
cv_broadcast(&dbuf_evict_cv);
CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
thread_exit();
}
/*
* Wake up the dbuf eviction thread if the dbuf cache is at its max size.
* If the dbuf cache is at its high water mark, then evict a dbuf from the
* dbuf cache using the callers context.
*/
static void
dbuf_evict_notify(void)
{
/*
* We use thread specific data to track when a thread has
* started processing evictions. This allows us to avoid deeply
* nested stacks that would have a call flow similar to this:
*
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
* ^ |
* | |
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
*
* The dbuf_eviction_thread will always have its tsd set until
* that thread exits. All other threads will only set their tsd
* if they are participating in the eviction process. This only
* happens if the eviction thread is unable to process evictions
* fast enough. To keep the dbuf cache size in check, other threads
* can evict from the dbuf cache directly. Those threads will set
* their tsd values so that we ensure that they only evict one dbuf
* from the dbuf cache.
*/
if (tsd_get(zfs_dbuf_evict_key) != NULL)
return;
/*
* We check if we should evict without holding the dbuf_evict_lock,
* because it's OK to occasionally make the wrong decision here,
* and grabbing the lock results in massive lock contention.
*/
if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
if (dbuf_cache_above_hiwater())
dbuf_evict_one();
cv_signal(&dbuf_evict_cv);
}
}
void
dbuf_init(void)
{
uint64_t hsize = 1ULL << 16;
dbuf_hash_table_t *h = &dbuf_hash_table;
int i;
/*
* The hash table is big enough to fill all of physical memory
* with an average 4K block size. The table will take up
* totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
*/
while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
hsize <<= 1;
retry:
h->hash_table_mask = hsize - 1;
h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
if (h->hash_table == NULL) {
/* XXX - we should really return an error instead of assert */
ASSERT(hsize > (1ULL << 10));
hsize >>= 1;
goto retry;
}
dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
/*
* Setup the parameters for the dbuf cache. We cap the size of the
* dbuf cache to 1/32nd (default) of the size of the ARC.
*/
dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
arc_max_bytes() >> dbuf_cache_max_shift);
/*
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
* configuration is not required.
*/
dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_cache_link),
dbuf_cache_multilist_index_func);
refcount_create(&dbuf_cache_size);
tsd_create(&zfs_dbuf_evict_key, NULL);
dbuf_evict_thread_exit = B_FALSE;
mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
NULL, 0, &p0, TS_RUN, minclsyspri);
}
void
dbuf_fini(void)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
int i;
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_destroy(&h->hash_mutexes[i]);
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
kmem_cache_destroy(dbuf_kmem_cache);
taskq_destroy(dbu_evict_taskq);
mutex_enter(&dbuf_evict_lock);
dbuf_evict_thread_exit = B_TRUE;
while (dbuf_evict_thread_exit) {
cv_signal(&dbuf_evict_cv);
cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
}
mutex_exit(&dbuf_evict_lock);
tsd_destroy(&zfs_dbuf_evict_key);
mutex_destroy(&dbuf_evict_lock);
cv_destroy(&dbuf_evict_cv);
refcount_destroy(&dbuf_cache_size);
multilist_destroy(dbuf_cache);
}
/*
* Other stuff.
*/
#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
dnode_t *dn;
dbuf_dirty_record_t *dr;
ASSERT(MUTEX_HELD(&db->db_mtx));
if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
return;
ASSERT(db->db_objset != NULL);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if (dn == NULL) {
ASSERT(db->db_parent == NULL);
ASSERT(db->db_blkptr == NULL);
} else {
ASSERT3U(db->db.db_object, ==, dn->dn_object);
ASSERT3P(db->db_objset, ==, dn->dn_objset);
ASSERT3U(db->db_level, <, dn->dn_nlevels);
ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
db->db_blkid == DMU_SPILL_BLKID ||
!avl_is_empty(&dn->dn_dbufs));
}
if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(dn != NULL);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
} else if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn != NULL);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT0(db->db.db_offset);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
}
for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
ASSERT(dr->dr_dbuf == db);
for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
ASSERT(dr->dr_dbuf == db);
/*
* We can't assert that db_size matches dn_datablksz because it
* can be momentarily different when another thread is doing
* dnode_set_blksz().
*/
if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
dr = db->db_data_pending;
/*
* It should only be modified in syncing context, so
* make sure we only have one copy of the data.
*/
ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
}
/* verify db->db_blkptr */
if (db->db_blkptr) {
if (db->db_parent == dn->dn_dbuf) {
/* db is pointed to by the dnode */
/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
ASSERT(db->db_parent == NULL);
else
ASSERT(db->db_parent != NULL);
if (db->db_blkid != DMU_SPILL_BLKID)
ASSERT3P(db->db_blkptr, ==,
&dn->dn_phys->dn_blkptr[db->db_blkid]);
} else {
/* db is pointed to by an indirect block */
int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
ASSERT3U(db->db_parent->db.db_object, ==,
db->db.db_object);
/*
* dnode_grow_indblksz() can make this fail if we don't
* have the struct_rwlock. XXX indblksz no longer
* grows. safe to do this now?
*/
if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
ASSERT3P(db->db_blkptr, ==,
((blkptr_t *)db->db_parent->db.db_data +
db->db_blkid % epb));
}
}
}
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
(db->db_buf == NULL || db->db_buf->b_data) &&
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
db->db_state != DB_FILL && !dn->dn_free_txg) {
/*
* If the blkptr isn't set but they have nonzero data,
* it had better be dirty, otherwise we'll lose that
* data when we evict this buffer.
*
* There is an exception to this rule for indirect blocks; in
* this case, if the indirect block is a hole, we fill in a few
* fields on each of the child blocks (importantly, birth time)
* to prevent hole birth times from being lost when you
* partially fill in a hole.
*/
if (db->db_dirtycnt == 0) {
if (db->db_level == 0) {
uint64_t *buf = db->db.db_data;
int i;
for (i = 0; i < db->db.db_size >> 3; i++) {
ASSERT(buf[i] == 0);
}
} else {
blkptr_t *bps = db->db.db_data;
ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
db->db.db_size);
/*
* We want to verify that all the blkptrs in the
* indirect block are holes, but we may have
* automatically set up a few fields for them.
* We iterate through each blkptr and verify
* they only have those fields set.
*/
for (int i = 0;
i < db->db.db_size / sizeof (blkptr_t);
i++) {
blkptr_t *bp = &bps[i];
ASSERT(ZIO_CHECKSUM_IS_ZERO(
&bp->blk_cksum));
ASSERT(
DVA_IS_EMPTY(&bp->blk_dva[0]) &&
DVA_IS_EMPTY(&bp->blk_dva[1]) &&
DVA_IS_EMPTY(&bp->blk_dva[2]));
ASSERT0(bp->blk_fill);
ASSERT0(bp->blk_pad[0]);
ASSERT0(bp->blk_pad[1]);
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(BP_IS_HOLE(bp));
ASSERT0(bp->blk_phys_birth);
}
}
}
}
DB_DNODE_EXIT(db);
}
#endif
static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
dbuf_evict_user(db);
ASSERT3P(db->db_buf, ==, NULL);
db->db.db_data = NULL;
if (db->db_state != DB_NOFILL)
db->db_state = DB_UNCACHED;
}
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(buf != NULL);
db->db_buf = buf;
ASSERT(buf->b_data != NULL);
db->db.db_data = buf->b_data;
}
/*
* Loan out an arc_buf for read. Return the loaned arc_buf.
*/
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
arc_buf_t *abuf;
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
int blksz = db->db.db_size;
spa_t *spa = db->db_objset->os_spa;
mutex_exit(&db->db_mtx);
abuf = arc_loan_buf(spa, B_FALSE, blksz);
bcopy(db->db.db_data, abuf->b_data, blksz);
} else {
abuf = db->db_buf;
arc_loan_inuse_buf(abuf, db);
db->db_buf = NULL;
dbuf_clear_data(db);
mutex_exit(&db->db_mtx);
}
return (abuf);
}
/*
* Calculate which level n block references the data at the level 0 offset
* provided.
*/
uint64_t
dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{
if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
/*
* The level n blkid is equal to the level 0 blkid divided by
* the number of level 0s in a level n block.
*
* The level 0 blkid is offset >> datablkshift =
* offset / 2^datablkshift.
*
* The number of level 0s in a level n is the number of block
* pointers in an indirect block, raised to the power of level.
* This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
* 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
*
* Thus, the level n blkid is: offset /
* ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
* = offset / 2^(datablkshift + level *
* (indblkshift - SPA_BLKPTRSHIFT))
* = offset >> (datablkshift + level *
* (indblkshift - SPA_BLKPTRSHIFT))
*/
return (offset >> (dn->dn_datablkshift + level *
(dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
} else {
ASSERT3U(offset, <, dn->dn_datablksz);
return (0);
}
}
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
mutex_enter(&db->db_mtx);
ASSERT3U(db->db_state, ==, DB_READ);
/*
* All reads are synchronous, so we must have a hold on the dbuf
*/
ASSERT(refcount_count(&db->db_holds) > 0);
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
if (db->db_level == 0 && db->db_freed_in_flight) {
/* we were freed in flight; disregard any error */
arc_release(buf, db);
bzero(buf->b_data, db->db.db_size);
arc_buf_freeze(buf);
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else if (zio == NULL || zio->io_error == 0) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
arc_buf_destroy(buf, db);
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
dbuf_rele_and_unlock(db, NULL);
}
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
dnode_t *dn;
zbookmark_phys_t zb;
arc_flags_t aflags = ARC_FLAG_NOWAIT;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db_state == DB_UNCACHED);
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DMU_BONUS_BLKID) {
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
DB_DNODE_EXIT(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
return;
}
/*
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
* processes the delete record and clears the bp while we are waiting
* for the dn_mtx (resulting in a "no" from block_freed).
*/
if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
(db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
db->db.db_size));
bzero(db->db.db_data, db->db.db_size);
if (db->db_blkptr != NULL && db->db_level > 0 &&
BP_IS_HOLE(db->db_blkptr) &&
db->db_blkptr->blk_birth != 0) {
blkptr_t *bps = db->db.db_data;
for (int i = 0; i < ((1 <<
DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
i++) {
blkptr_t *bp = &bps[i];
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
1 << dn->dn_indblkshift);
BP_SET_LSIZE(bp,
BP_GET_LEVEL(db->db_blkptr) == 1 ?
dn->dn_datablksz :
BP_GET_LSIZE(db->db_blkptr));
BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
BP_SET_LEVEL(bp,
BP_GET_LEVEL(db->db_blkptr) - 1);
BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
}
}
DB_DNODE_EXIT(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
return;
}
DB_DNODE_EXIT(db);
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
if (DBUF_IS_L2CACHEABLE(db))
aflags |= ARC_FLAG_L2CACHE;
SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
db->db.db_object, db->db_level, db->db_blkid);
dbuf_add_ref(db, NULL);
(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
}
/*
* This is our just-in-time copy function. It makes a copy of buffers that
* have been modified in a previous transaction group before we access them in
* the current active group.
*
* This function is used in three places: when we are dirtying a buffer for the
* first time in a txg, when we are freeing a range in a dnode that includes
* this buffer, and when we are accessing a buffer which was received compressed
* and later referenced in a WRITE_BYREF record.
*
* Note that when we are called from dbuf_free_range() we do not put a hold on
* the buffer, we just traverse the active dbuf list for the dnode.
*/
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
dbuf_dirty_record_t *dr = db->db_last_dirty;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db.db_data != NULL);
ASSERT(db->db_level == 0);
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
if (dr == NULL ||
(dr->dt.dl.dr_data !=
((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
return;
/*
* If the last dirty record for this dbuf has not yet synced
* and its referencing the dbuf data, either:
* reset the reference to point to a new copy,
* or (if there a no active holders)
* just null out the current db_data pointer.
*/
ASSERT(dr->dr_txg >= txg - 2);
if (db->db_blkid == DMU_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = arc_buf_size(db->db_buf);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa = db->db_objset->os_spa;
enum zio_compress compress_type =
arc_get_compression(db->db_buf);
if (compress_type == ZIO_COMPRESS_OFF) {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
} else {
ASSERT3U(type, ==, ARC_BUFC_DATA);
dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
size, arc_buf_lsize(db->db_buf), compress_type);
}
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
db->db_buf = NULL;
dbuf_clear_data(db);
}
}
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
int err = 0;
boolean_t prefetch;
dnode_t *dn;
/*
* We don't have to hold the mutex to check db_state because it
* can't be freed while we have a hold on the buffer.
*/
ASSERT(!refcount_is_zero(&db->db_holds));
if (db->db_state == DB_NOFILL)
return (SET_ERROR(EIO));
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_enter(&dn->dn_struct_rwlock, RW_READER);
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
(flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
DBUF_IS_CACHEABLE(db);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
/*
* If the arc buf is compressed, we need to decompress it to
* read the data. This could happen during the "zfs receive" of
* a stream which is compressed and deduplicated.
*/
if (db->db_buf != NULL &&
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
dbuf_fix_old_data(db,
spa_syncing_txg(dmu_objset_spa(db->db_objset)));
err = arc_decompress(db->db_buf);
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
} else if (db->db_state == DB_UNCACHED) {
spa_t *spa = dn->dn_objset->os_spa;
boolean_t need_wait = B_FALSE;
if (zio == NULL &&
db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
need_wait = B_TRUE;
}
dbuf_read_impl(db, zio, flags);
/* dbuf_read_impl has dropped db_mtx for us */
if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
if (need_wait)
err = zio_wait(zio);
} else {
/*
* Another reader came in while the dbuf was in flight
* between UNCACHED and CACHED. Either a writer will finish
* writing the buffer (sending the dbuf to CACHED) or the
* first reader's request will reach the read_done callback
* and send the dbuf to CACHED. Otherwise, a failure
* occurred and the dbuf went to UNCACHED.
*/
mutex_exit(&db->db_mtx);
if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
/* Skip the wait per the caller's request. */
mutex_enter(&db->db_mtx);
if ((flags & DB_RF_NEVERWAIT) == 0) {
while (db->db_state == DB_READ ||
db->db_state == DB_FILL) {
ASSERT(db->db_state == DB_READ ||
(flags & DB_RF_HAVESTRUCT) == 0);
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
db, zio_t *, zio);
cv_wait(&db->db_changed, &db->db_mtx);
}
if (db->db_state == DB_UNCACHED)
err = SET_ERROR(EIO);
}
mutex_exit(&db->db_mtx);
}
return (err);
}
static void
dbuf_noread(dmu_buf_impl_t *db)
{
ASSERT(!refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa = db->db_objset->os_spa;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
db->db_state = DB_FILL;
} else if (db->db_state == DB_NOFILL) {
dbuf_clear_data(db);
} else {
ASSERT3U(db->db_state, ==, DB_CACHED);
}
mutex_exit(&db->db_mtx);
}
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
uint64_t txg = dr->dr_txg;
ASSERT(MUTEX_HELD(&db->db_mtx));
/*
* This assert is valid because dmu_sync() expects to be called by
* a zilog's get_data while holding a range lock. This call only
* comes from dbuf_dirty() callers who must also hold a range lock.
*/
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
ASSERT(db->db_level == 0);
if (db->db_blkid == DMU_BONUS_BLKID ||
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
return;
ASSERT(db->db_data_pending != dr);
/* free this block */
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
/*
* Release the already-written buffer, so we leave it in
* a consistent dirty state. Note that all callers are
* modifying the buffer, so they will immediately do
* another (redundant) arc_release(). Therefore, leave
* the buf thawed to save the effort of freezing &
* immediately re-thawing it.
*/
arc_release(dr->dt.dl.dr_data, db);
}
/*
* Evict (if its unreferenced) or clear (if its referenced) any level-0
* data blocks in the free range, so that any future readers will find
* empty blocks.
*/
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
dmu_tx_t *tx)
{
dmu_buf_impl_t db_search;
dmu_buf_impl_t *db, *db_next;
uint64_t txg = tx->tx_txg;
avl_index_t where;
if (end_blkid > dn->dn_maxblkid &&
!(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
end_blkid = dn->dn_maxblkid;
dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
db_search.db_level = 0;
db_search.db_blkid = start_blkid;
db_search.db_state = DB_SEARCH;
mutex_enter(&dn->dn_dbufs_mtx);
db = avl_find(&dn->dn_dbufs, &db_search, &where);
ASSERT3P(db, ==, NULL);
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
for (; db != NULL; db = db_next) {
db_next = AVL_NEXT(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
if (db->db_level != 0 || db->db_blkid > end_blkid) {
break;
}
ASSERT3U(db->db_blkid, >=, start_blkid);
/* found a level 0 buffer in the range */
mutex_enter(&db->db_mtx);
if (dbuf_undirty(db, tx)) {
/* mutex has been dropped and dbuf destroyed */
continue;
}
if (db->db_state == DB_UNCACHED ||
db->db_state == DB_NOFILL ||
db->db_state == DB_EVICTING) {
ASSERT(db->db.db_data == NULL);
mutex_exit(&db->db_mtx);
continue;
}
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
/* will be handled in dbuf_read_done or dbuf_rele */
db->db_freed_in_flight = TRUE;
mutex_exit(&db->db_mtx);
continue;
}
if (refcount_count(&db->db_holds) == 0) {
ASSERT(db->db_buf);
dbuf_destroy(db);
continue;
}
/* The dbuf is referenced */
if (db->db_last_dirty != NULL) {
dbuf_dirty_record_t *dr = db->db_last_dirty;
if (dr->dr_txg == txg) {
/*
* This buffer is "in-use", re-adjust the file
* size to reflect that this buffer may
* contain new data when we sync.
*/
if (db->db_blkid != DMU_SPILL_BLKID &&
db->db_blkid > dn->dn_maxblkid)
dn->dn_maxblkid = db->db_blkid;
dbuf_unoverride(dr);
} else {
/*
* This dbuf is not dirty in the open context.
* Either uncache it (if its not referenced in
* the open context) or reset its contents to
* empty.
*/
dbuf_fix_old_data(db, txg);
}
}
/* clear the contents if its cached */
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
arc_release(db->db_buf, db);
bzero(db->db.db_data, db->db.db_size);
arc_buf_freeze(db->db_buf);
}
mutex_exit(&db->db_mtx);
}
mutex_exit(&dn->dn_dbufs_mtx);
}
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
arc_buf_t *buf, *obuf;
int osize = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dnode_t *dn;
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
/* XXX does *this* func really need the lock? */
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
/*
* This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
* is OK, because there can be no other references to the db
* when we are changing its size, so no concurrent DB_FILL can
* be happening.
*/
/*
* XXX we should be doing a dbuf_read, checking the return
* value and returning that up to our callers
*/
dmu_buf_will_dirty(&db->db, tx);
/* create the data buffer for the new block */
buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
/* copy old block data to the new block */
obuf = db->db_buf;
bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
/* zero the remainder */
if (size > osize)
bzero((uint8_t *)buf->b_data + osize, size - osize);
mutex_enter(&db->db_mtx);
dbuf_set_data(db, buf);
arc_buf_destroy(obuf, db);
db->db.db_size = size;
if (db->db_level == 0) {
ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
db->db_last_dirty->dt.dl.dr_data = buf;
}
mutex_exit(&db->db_mtx);
dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
DB_DNODE_EXIT(db);
}
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
objset_t *os = db->db_objset;
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
ASSERT(arc_released(os->os_phys_buf) ||
list_link_active(&os->os_dsl_dataset->ds_synced_link));
ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
(void) arc_release(db->db_buf, db);
}
/*
* We already have a dirty record for this TXG, and we are being
* dirtied again.
*/
static void
dbuf_redirty(dbuf_dirty_record_t *dr)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
ASSERT(MUTEX_HELD(&db->db_mtx));
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this buffer has already been written out,
* we now need to reset its state.
*/
dbuf_unoverride(dr);
if (db->db.db_object != DMU_META_DNODE_OBJECT &&
db->db_state != DB_NOFILL) {
/* Already released on initial dirty, so just thaw. */
ASSERT(arc_released(db->db_buf));
arc_buf_thaw(db->db_buf);
}
}
}
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn;
objset_t *os;
dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
int txgoff = tx->tx_txg & TXG_MASK;
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
DMU_TX_DIRTY_BUF(tx, db);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
/*
* Shouldn't dirty a regular buffer in syncing context. Private
* objects may be dirtied in syncing context, but only if they
* were already pre-dirtied in open context.
*/
#ifdef DEBUG
if (dn->dn_objset->os_dsl_dataset != NULL) {
rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
RW_READER, FTAG);
}
ASSERT(!dmu_tx_is_syncing(tx) ||
BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
dn->dn_objset->os_dsl_dataset == NULL);
if (dn->dn_objset->os_dsl_dataset != NULL)
rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
#endif
/*
* We make this assert for private objects as well, but after we
* check if we're already dirty. They are allowed to re-dirty
* in syncing context.
*/
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
mutex_enter(&db->db_mtx);
/*
* XXX make this true for indirects too? The problem is that
* transactions created with dmu_tx_create_assigned() from
* syncing context don't bother holding ahead.
*/
ASSERT(db->db_level != 0 ||
db->db_state == DB_CACHED || db->db_state == DB_FILL ||
db->db_state == DB_NOFILL);
mutex_enter(&dn->dn_mtx);
/*
* Don't set dirtyctx to SYNC if we're just modifying this as we
* initialize the objset.
*/
if (dn->dn_dirtyctx == DN_UNDIRTIED) {
if (dn->dn_objset->os_dsl_dataset != NULL) {
rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
RW_READER, FTAG);
}
if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
DN_DIRTY_SYNC : DN_DIRTY_OPEN);
ASSERT(dn->dn_dirtyctx_firstset == NULL);
dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
}
if (dn->dn_objset->os_dsl_dataset != NULL) {
rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
FTAG);
}
}
mutex_exit(&dn->dn_mtx);
if (db->db_blkid == DMU_SPILL_BLKID)
dn->dn_have_spill = B_TRUE;
/*
* If this buffer is already dirty, we're done.
*/
drp = &db->db_last_dirty;
ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
db->db.db_object == DMU_META_DNODE_OBJECT);
while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
drp = &dr->dr_next;
if (dr && dr->dr_txg == tx->tx_txg) {
DB_DNODE_EXIT(db);
dbuf_redirty(dr);
mutex_exit(&db->db_mtx);
return (dr);
}
/*
* Only valid if not already dirty.
*/
ASSERT(dn->dn_object == 0 ||
dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
ASSERT3U(dn->dn_nlevels, >, db->db_level);
/*
* We should only be dirtying in syncing context if it's the
* mos or we're initializing the os or it's a special object.
* However, we are allowed to dirty in syncing context provided
* we already dirtied it in open context. Hence we must make
* this assertion only if we're not already dirty.
*/
os = dn->dn_objset;
VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
#ifdef DEBUG
if (dn->dn_objset->os_dsl_dataset != NULL)
rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
if (dn->dn_objset->os_dsl_dataset != NULL)
rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
#endif
ASSERT(db->db.db_size != 0);
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
if (db->db_blkid != DMU_BONUS_BLKID) {
dmu_objset_willuse_space(os, db->db.db_size, tx);
}
/*
* If this buffer is dirty in an old transaction group we need
* to make a copy of it so that the changes we make in this
* transaction group won't leak out when we sync the older txg.
*/
dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
if (db->db_level == 0) {
void *data_old = db->db_buf;
if (db->db_state != DB_NOFILL) {
if (db->db_blkid == DMU_BONUS_BLKID) {
dbuf_fix_old_data(db, tx->tx_txg);
data_old = db->db.db_data;
} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
/*
* Release the data buffer from the cache so
* that we can modify it without impacting
* possible other users of this cached data
* block. Note that indirect blocks and
* private objects are not released until the
* syncing state (since they are only modified
* then).
*/
arc_release(db->db_buf, db);
dbuf_fix_old_data(db, tx->tx_txg);
data_old = db->db_buf;
}
ASSERT(data_old != NULL);
}
dr->dt.dl.dr_data = data_old;
} else {
mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
list_create(&dr->dt.di.dr_children,
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
dr->dr_accounted = db->db.db_size;
dr->dr_dbuf = db;
dr->dr_txg = tx->tx_txg;
dr->dr_next = *drp;
*drp = dr;
/*
* We could have been freed_in_flight between the dbuf_noread
* and dbuf_dirty. We win, as though the dbuf_noread() had
* happened after the free.
*/
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
db->db_blkid != DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
if (dn->dn_free_ranges[txgoff] != NULL) {
range_tree_clear(dn->dn_free_ranges[txgoff],
db->db_blkid, 1);
}
mutex_exit(&dn->dn_mtx);
db->db_freed_in_flight = FALSE;
}
/*
* This buffer is now part of this txg
*/
dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
db->db_dirtycnt += 1;
ASSERT3U(db->db_dirtycnt, <=, 3);
mutex_exit(&db->db_mtx);
if (db->db_blkid == DMU_BONUS_BLKID ||
db->db_blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
DB_DNODE_EXIT(db);
return (dr);
}
/*
* The dn_struct_rwlock prevents db_blkptr from changing
* due to a write from syncing context completing
* while we are running, so we want to acquire it before
* looking at db_blkptr.
*/
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
rw_enter(&dn->dn_struct_rwlock, RW_READER);
drop_struct_lock = TRUE;
}
/*
* We need to hold the dn_struct_rwlock to make this assertion,
* because it protects dn_phys / dn_next_nlevels from changing.
*/
ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
dn->dn_phys->dn_nlevels > db->db_level ||
dn->dn_next_nlevels[txgoff] > db->db_level ||
dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
/*
* If we are overwriting a dedup BP, then unless it is snapshotted,
* when we get to syncing context we will need to decrement its
* refcount in the DDT. Prefetch the relevant DDT block so that
* syncing context won't have to wait for the i/o.
*/
ddt_prefetch(os->os_spa, db->db_blkptr);
if (db->db_level == 0) {
dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
ASSERT(dn->dn_maxblkid >= db->db_blkid);
}
if (db->db_level+1 < dn->dn_nlevels) {
dmu_buf_impl_t *parent = db->db_parent;
dbuf_dirty_record_t *di;
int parent_held = FALSE;
if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
parent = dbuf_hold_level(dn, db->db_level+1,
db->db_blkid >> epbs, FTAG);
ASSERT(parent != NULL);
parent_held = TRUE;
}
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
ASSERT3U(db->db_level+1, ==, parent->db_level);
di = dbuf_dirty(parent, tx);
if (parent_held)
dbuf_rele(parent, FTAG);
mutex_enter(&db->db_mtx);
/*
* Since we've dropped the mutex, it's possible that
* dbuf_undirty() might have changed this out from under us.
*/
if (db->db_last_dirty == dr ||
dn->dn_object == DMU_META_DNODE_OBJECT) {
mutex_enter(&di->dt.di.dr_mtx);
ASSERT3U(di->dr_txg, ==, tx->tx_txg);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&di->dt.di.dr_children, dr);
mutex_exit(&di->dt.di.dr_mtx);
dr->dr_parent = di;
}
mutex_exit(&db->db_mtx);
} else {
ASSERT(db->db_level+1 == dn->dn_nlevels);
ASSERT(db->db_blkid < dn->dn_nblkptr);
ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
mutex_exit(&dn->dn_mtx);
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
}
dnode_setdirty(dn, tx);
DB_DNODE_EXIT(db);
return (dr);
}
/*
* Undirty a buffer in the transaction group referenced by the given
* transaction. Return whether this evicted the dbuf.
*/
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn;
uint64_t txg = tx->tx_txg;
dbuf_dirty_record_t *dr, **drp;
ASSERT(txg != 0);
/*
* Due to our use of dn_nlevels below, this can only be called
* in open context, unless we are operating on the MOS.
* From syncing context, dn_nlevels may be different from the
* dn_nlevels used when dbuf was dirtied.
*/
ASSERT(db->db_objset ==
dmu_objset_pool(db->db_objset)->dp_meta_objset ||
txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT0(db->db_level);
ASSERT(MUTEX_HELD(&db->db_mtx));
/*
* If this buffer is not dirty, we're done.
*/
for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
if (dr->dr_txg <= txg)
break;
if (dr == NULL || dr->dr_txg < txg)
return (B_FALSE);
ASSERT(dr->dr_txg == txg);
ASSERT(dr->dr_dbuf == db);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
ASSERT(db->db.db_size != 0);
dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
dr->dr_accounted, txg);
*drp = dr->dr_next;
/*
* Note that there are three places in dbuf_dirty()
* where this dirty record may be put on a list.
* Make sure to do a list_remove corresponding to
* every one of those list_insert calls.
*/
if (dr->dr_parent) {
mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
} else if (db->db_blkid == DMU_SPILL_BLKID ||
db->db_level + 1 == dn->dn_nlevels) {
ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
}
DB_DNODE_EXIT(db);
if (db->db_state != DB_NOFILL) {
dbuf_unoverride(dr);
ASSERT(db->db_buf != NULL);
ASSERT(dr->dt.dl.dr_data != NULL);
if (dr->dt.dl.dr_data != db->db_buf)
arc_buf_destroy(dr->dt.dl.dr_data, db);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
}
return (B_FALSE);
}
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
/*
* Quick check for dirtyness. For already dirty blocks, this
* reduces runtime of this function by >90%, and overall performance
* by 50% for some workloads (e.g. file deletion with indirect blocks
* cached).
*/
mutex_enter(&db->db_mtx);
dbuf_dirty_record_t *dr;
for (dr = db->db_last_dirty;
dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
/*
* It's possible that it is already dirty but not cached,
* because there are some calls to dbuf_dirty() that don't
* go through dmu_buf_will_dirty().
*/
if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
/* This dbuf is already dirty and cached. */
dbuf_redirty(dr);
mutex_exit(&db->db_mtx);
return;
}
}
mutex_exit(&db->db_mtx);
DB_DNODE_ENTER(db);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
DB_DNODE_EXIT(db);
(void) dbuf_read(db, NULL, rf);
(void) dbuf_dirty(db, tx);
}
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
db->db_state = DB_NOFILL;
dmu_buf_will_fill(db_fake, tx);
}
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(tx->tx_txg != 0);
ASSERT(db->db_level == 0);
ASSERT(!refcount_is_zero(&db->db_holds));
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
dmu_tx_private_ok(tx));
dbuf_noread(db);
(void) dbuf_dirty(db, tx);
}
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_freed_in_flight) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
bzero(db->db.db_data, db->db.db_size);
db->db_freed_in_flight = FALSE;
}
db->db_state = DB_CACHED;
cv_broadcast(&db->db_changed);
}
mutex_exit(&db->db_mtx);
}
void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder,
dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
struct dirty_leaf *dl;
dmu_object_type_t type;
if (etype == BP_EMBEDDED_TYPE_DATA) {
ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
SPA_FEATURE_EMBEDDED_DATA));
}
DB_DNODE_ENTER(db);
type = DB_DNODE(db)->dn_type;
DB_DNODE_EXIT(db);
ASSERT0(db->db_level);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
dmu_buf_will_not_fill(dbuf, tx);
ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
dl = &db->db_last_dirty->dt.dl;
encode_embedded_bp_compressed(&dl->dr_overridden_by,
data, comp, uncompressed_size, compressed_size);
BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
BP_SET_TYPE(&dl->dr_overridden_by, type);
BP_SET_LEVEL(&dl->dr_overridden_by, 0);
BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
dl->dr_override_state = DR_OVERRIDDEN;
dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}
/*
* Directly assign a provided arc buf to a given dbuf if it's not referenced
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
*/
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
ASSERT(!refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db_level == 0);
ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
ASSERT(buf != NULL);
ASSERT(arc_buf_lsize(buf) == db->db.db_size);
ASSERT(tx->tx_txg != 0);
arc_return_buf(buf, db);
ASSERT(arc_released(buf));
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
if (db->db_state == DB_CACHED &&
refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
arc_buf_destroy(buf, db);
xuio_stat_wbuf_copied();
return;
}
xuio_stat_wbuf_nocopy();
if (db->db_state == DB_CACHED) {
dbuf_dirty_record_t *dr = db->db_last_dirty;
ASSERT(db->db_buf != NULL);
if (dr != NULL && dr->dr_txg == tx->tx_txg) {
ASSERT(dr->dt.dl.dr_data == db->db_buf);
if (!arc_released(db->db_buf)) {
ASSERT(dr->dt.dl.dr_override_state ==
DR_OVERRIDDEN);
arc_release(db->db_buf, db);
}
dr->dt.dl.dr_data = buf;
arc_buf_destroy(db->db_buf, db);
} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
arc_release(db->db_buf, db);
arc_buf_destroy(db->db_buf, db);
}
db->db_buf = NULL;
}
ASSERT(db->db_buf == NULL);
dbuf_set_data(db, buf);
db->db_state = DB_FILL;
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
dmu_buf_fill_done(&db->db, tx);
}
void
dbuf_destroy(dmu_buf_impl_t *db)
{
dnode_t *dn;
dmu_buf_impl_t *parent = db->db_parent;
dmu_buf_impl_t *dndb;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(refcount_is_zero(&db->db_holds));
if (db->db_buf != NULL) {
arc_buf_destroy(db->db_buf, db);
db->db_buf = NULL;
}
if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(db->db.db_data != NULL);
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
db->db_state = DB_UNCACHED;
}
dbuf_clear_data(db);
if (multilist_link_active(&db->db_cache_link)) {
multilist_remove(dbuf_cache, db);
(void) refcount_remove_many(&dbuf_cache_size,
db->db.db_size, db);
}
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_data_pending == NULL);
db->db_state = DB_EVICTING;
db->db_blkptr = NULL;
/*
* Now that db_state is DB_EVICTING, nobody else can find this via
* the hash table. We can now drop db_mtx, which allows us to
* acquire the dn_dbufs_mtx.
*/
mutex_exit(&db->db_mtx);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
dndb = dn->dn_dbuf;
if (db->db_blkid != DMU_BONUS_BLKID) {
boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
if (needlock)
mutex_enter(&dn->dn_dbufs_mtx);
avl_remove(&dn->dn_dbufs, db);
atomic_dec_32(&dn->dn_dbufs_count);
membar_producer();
DB_DNODE_EXIT(db);
if (needlock)
mutex_exit(&dn->dn_dbufs_mtx);
/*
* Decrementing the dbuf count means that the hold corresponding
* to the removed dbuf is no longer discounted in dnode_move(),
* so the dnode cannot be moved until after we release the hold.
* The membar_producer() ensures visibility of the decremented
* value in dnode_move(), since DB_DNODE_EXIT doesn't actually
* release any lock.
*/
dnode_rele(dn, db);
db->db_dnode_handle = NULL;
dbuf_hash_remove(db);
} else {
DB_DNODE_EXIT(db);
}
ASSERT(refcount_is_zero(&db->db_holds));
db->db_parent = NULL;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
ASSERT(!multilist_link_active(&db->db_cache_link));
kmem_cache_free(dbuf_kmem_cache, db);
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
/*
* If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
*/
if (parent && parent != dndb)
dbuf_rele(parent, db);
}
/*
* Note: While bpp will always be updated if the function returns success,
* parentp will not be updated if the dnode does not have dn_dbuf filled in;
* this happens when the dnode is the meta-dnode, or a userused or groupused
* object.
*/
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
*parentp = NULL;
*bpp = NULL;
ASSERT(blkid != DMU_BONUS_BLKID);
if (blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
if (dn->dn_have_spill &&
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
*bpp = &dn->dn_phys->dn_spill;
else
*bpp = NULL;
dbuf_add_ref(dn->dn_dbuf, NULL);
*parentp = dn->dn_dbuf;
mutex_exit(&dn->dn_mtx);
return (0);
}
int nlevels =
(dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
ASSERT3U(level * epbs, <, 64);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
/*
* This assertion shouldn't trip as long as the max indirect block size
* is less than 1M. The reason for this is that up to that point,
* the number of levels required to address an entire object with blocks
* of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
* other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
* (i.e. we can address the entire object), objects will all use at most
* N-1 levels and the assertion won't overflow. However, once epbs is
* 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
* enough to address an entire object, so objects will have 5 levels,
* but then this assertion will overflow.
*
* All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
* need to redo this logic to handle overflows.
*/
ASSERT(level >= nlevels ||
((nlevels - level - 1) * epbs) +
highbit64(dn->dn_phys->dn_nblkptr) <= 64);
if (level >= nlevels ||
blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
((nlevels - level - 1) * epbs)) ||
(fail_sparse &&
blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
/* the buffer has no parent yet */
return (SET_ERROR(ENOENT));
} else if (level < nlevels-1) {
/* this block is referenced from an indirect block */
int err = dbuf_hold_impl(dn, level+1,
blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
if (err)
return (err);
err = dbuf_read(*parentp, NULL,
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
if (err) {
dbuf_rele(*parentp, NULL);
*parentp = NULL;
return (err);
}
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
(blkid & ((1ULL << epbs) - 1));
if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
ASSERT(BP_IS_HOLE(*bpp));
return (0);
} else {
/* the block is referenced from the dnode */
ASSERT3U(level, ==, nlevels-1);
ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
blkid < dn->dn_phys->dn_nblkptr);
if (dn->dn_dbuf) {
dbuf_add_ref(dn->dn_dbuf, NULL);
*parentp = dn->dn_dbuf;
}
*bpp = &dn->dn_phys->dn_blkptr[blkid];
return (0);
}
}
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
objset_t *os = dn->dn_objset;
dmu_buf_impl_t *db, *odb;
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_type != DMU_OT_NONE);
db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
db->db_objset = os;
db->db.db_object = dn->dn_object;
db->db_level = level;
db->db_blkid = blkid;
db->db_last_dirty = NULL;
db->db_dirtycnt = 0;
db->db_dnode_handle = dn->dn_handle;
db->db_parent = parent;
db->db_blkptr = blkptr;
db->db_user = NULL;
db->db_user_immediate_evict = FALSE;
db->db_freed_in_flight = FALSE;
db->db_pending_evict = FALSE;
if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
db->db.db_size = DN_MAX_BONUSLEN -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db);
} else if (blkid == DMU_SPILL_BLKID) {
db->db.db_size = (blkptr != NULL) ?
BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
db->db.db_offset = 0;
} else {
int blocksize =
db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
db->db.db_size = blocksize;
db->db.db_offset = db->db_blkid * blocksize;
}
/*
* Hold the dn_dbufs_mtx while we get the new dbuf
* in the hash table *and* added to the dbufs list.
* This prevents a possible deadlock with someone
* trying to look up this dbuf before its added to the
* dn_dbufs list.
*/
mutex_enter(&dn->dn_dbufs_mtx);
db->db_state = DB_EVICTING;
if ((odb = dbuf_hash_insert(db)) != NULL) {
/* someone else inserted it first */
kmem_cache_free(dbuf_kmem_cache, db);
mutex_exit(&dn->dn_dbufs_mtx);
return (odb);
}
avl_add(&dn->dn_dbufs, db);
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
if (parent && parent != dn->dn_dbuf)
dbuf_add_ref(parent, db);
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
refcount_count(&dn->dn_holds) > 0);
(void) refcount_add(&dn->dn_holds, db);
atomic_inc_32(&dn->dn_dbufs_count);
dprintf_dbuf(db, "db=%p\n", db);
return (db);
}
typedef struct dbuf_prefetch_arg {
spa_t *dpa_spa; /* The spa to issue the prefetch in. */
zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
int dpa_curlevel; /* The current level that we're reading */
dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
} dbuf_prefetch_arg_t;
/*
* Actually issue the prefetch read for the block given.
*/
static void
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
{
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return;
arc_flags_t aflags =
dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
ASSERT(dpa->dpa_zio != NULL);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &dpa->dpa_zb);
}
/*
* Called when an indirect block above our prefetch target is read in. This
* will either read in the next indirect block down the tree or issue the actual
* prefetch if the next block down is our target.
*/
static void
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
ASSERT3S(dpa->dpa_curlevel, >, 0);
/*
* The dpa_dnode is only valid if we are called with a NULL
* zio. This indicates that the arc_read() returned without
* first calling zio_read() to issue a physical read. Once
* a physical read is made the dpa_dnode must be invalidated
* as the locks guarding it may have been dropped. If the
* dpa_dnode is still valid, then we want to add it to the dbuf
* cache. To do so, we must hold the dbuf associated with the block
* we just prefetched, read its contents so that we associate it
* with an arc_buf_t, and then release it.
*/
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
if (zio->io_flags & ZIO_FLAG_RAW) {
ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
} else {
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
}
ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
dpa->dpa_dnode = NULL;
} else if (dpa->dpa_dnode != NULL) {
uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel -
dpa->dpa_zb.zb_level));
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
dpa->dpa_curlevel, curblkid, FTAG);
(void) dbuf_read(db, NULL,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
dbuf_rele(db, FTAG);
}
dpa->dpa_curlevel--;
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
dbuf_issue_final_prefetch(dpa, bp);
kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
iter_aflags |= ARC_FLAG_L2CACHE;
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
arc_buf_destroy(abuf, private);
}
/*
* Issue prefetch reads for the given block on the given level. If the indirect
* blocks above that block are not in memory, we will read them in
* asynchronously. As a result, this call never blocks waiting for a read to
* complete.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
arc_flags_t aflags)
{
blkptr_t bp;
int epbs, nlevels, curlevel;
uint64_t curblkid;
ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
if (blkid > dn->dn_maxblkid)
return;
if (dnode_block_freed(dn, blkid))
return;
/*
* This dnode hasn't been written to disk yet, so there's nothing to
* prefetch.
*/
nlevels = dn->dn_phys->dn_nlevels;
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
return;
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
return;
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
level, blkid);
if (db != NULL) {
mutex_exit(&db->db_mtx);
/*
* This dbuf already exists. It is either CACHED, or
* (we assume) about to be read or filled.
*/
return;
}
/*
* Find the closest ancestor (indirect block) of the target block
* that is present in the cache. In this indirect block, we will
* find the bp that is at curlevel, curblkid.
*/
curlevel = level;
curblkid = blkid;
while (curlevel < nlevels - 1) {
int parent_level = curlevel + 1;
uint64_t parent_blkid = curblkid >> epbs;
dmu_buf_impl_t *db;
if (dbuf_hold_impl(dn, parent_level, parent_blkid,
FALSE, TRUE, FTAG, &db) == 0) {
blkptr_t *bpp = db->db_buf->b_data;
bp = bpp[P2PHASE(curblkid, 1 << epbs)];
dbuf_rele(db, FTAG);
break;
}
curlevel = parent_level;
curblkid = parent_blkid;
}
if (curlevel == nlevels - 1) {
/* No cached indirect blocks found. */
ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
bp = dn->dn_phys->dn_blkptr[curblkid];
}
if (BP_IS_HOLE(&bp))
return;
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
ZIO_FLAG_CANFAIL);
dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, level, blkid);
dpa->dpa_curlevel = curlevel;
dpa->dpa_prio = prio;
dpa->dpa_aflags = aflags;
dpa->dpa_spa = dn->dn_objset->os_spa;
dpa->dpa_dnode = dn;
dpa->dpa_epbs = epbs;
dpa->dpa_zio = pio;
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
/*
* If we have the indirect just above us, no need to do the asynchronous
* prefetch chain; we'll just run the last step ourselves. If we're at
* a higher level, though, we want to issue the prefetches for all the
* indirect blocks asynchronously, so we can go on with whatever we were
* doing.
*/
if (curlevel == level) {
ASSERT3U(curblkid, ==, blkid);
dbuf_issue_final_prefetch(dpa, &bp);
kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
iter_aflags |= ARC_FLAG_L2CACHE;
SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, curlevel, curblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
&bp, dbuf_prefetch_indirect_done, dpa, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
/*
* We use pio here instead of dpa_zio since it's possible that
* dpa may have already been freed.
*/
zio_nowait(pio);
}
/*
* Returns with db_holds incremented, and db_mtx not held.
* Note: dn_struct_rwlock must be held.
*/
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp)
{
dmu_buf_impl_t *db, *parent = NULL;
ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT3U(dn->dn_nlevels, >, level);
*dbp = NULL;
top:
/* dbuf_find() returns with db_mtx held */
db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
if (db == NULL) {
blkptr_t *bp = NULL;
int err;
if (fail_uncached)
return (SET_ERROR(ENOENT));
ASSERT3P(parent, ==, NULL);
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
if (fail_sparse) {
if (err == 0 && bp && BP_IS_HOLE(bp))
err = SET_ERROR(ENOENT);
if (err) {
if (parent)
dbuf_rele(parent, NULL);
return (err);
}
}
if (err && err != ENOENT)
return (err);
db = dbuf_create(dn, level, blkid, parent, bp);
}
if (fail_uncached && db->db_state != DB_CACHED) {
mutex_exit(&db->db_mtx);
return (SET_ERROR(ENOENT));
}
if (db->db_buf != NULL)
ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
/*
* If this buffer is currently syncing out, and we are are
* still referencing it from db_data, we need to make a copy
* of it in case we decide we want to dirty it again in this txg.
*/
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_state == DB_CACHED && db->db_data_pending) {
dbuf_dirty_record_t *dr = db->db_data_pending;
if (dr->dt.dl.dr_data == db->db_buf) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db,
arc_alloc_buf(dn->dn_objset->os_spa, db, type,
db->db.db_size));
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
db->db.db_size);
}
}
if (multilist_link_active(&db->db_cache_link)) {
ASSERT(refcount_is_zero(&db->db_holds));
multilist_remove(dbuf_cache, db);
(void) refcount_remove_many(&dbuf_cache_size,
db->db.db_size, db);
}
(void) refcount_add(&db->db_holds, tag);
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
/* NOTE: we can't rele the parent until after we drop the db_mtx */
if (parent)
dbuf_rele(parent, NULL);
ASSERT3P(DB_DNODE(db), ==, dn);
ASSERT3U(db->db_blkid, ==, blkid);
ASSERT3U(db->db_level, ==, level);
*dbp = db;
return (0);
}
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
return (dbuf_hold_level(dn, 0, blkid, tag));
}
dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
return (err ? NULL : db);
}
void
dbuf_create_bonus(dnode_t *dn)
{
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}
int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn;
if (db->db_blkid != DMU_SPILL_BLKID)
return (SET_ERROR(ENOTSUP));
if (blksz == 0)
blksz = SPA_MINBLOCKSIZE;
ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dbuf_new_size(db, blksz, tx);
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
return (0);
}
void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}
#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
int64_t holds = refcount_add(&db->db_holds, tag);
ASSERT3S(holds, >, 1);
}
#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
boolean_t
dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
void *tag)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dmu_buf_impl_t *found_db;
boolean_t result = B_FALSE;
if (db->db_blkid == DMU_BONUS_BLKID)
found_db = dbuf_find_bonus(os, obj);
else
found_db = dbuf_find(os, obj, 0, blkid);
if (found_db != NULL) {
if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
(void) refcount_add(&db->db_holds, tag);
result = B_TRUE;
}
mutex_exit(&db->db_mtx);
}
return (result);
}
/*
* If you call dbuf_rele() you had better not be referencing the dnode handle
* unless you have some other direct or indirect hold on the dnode. (An indirect
* hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
* dnode's parent dbuf evicting its dnode handles.
*/
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
mutex_enter(&db->db_mtx);
dbuf_rele_and_unlock(db, tag);
}
void
dmu_buf_rele(dmu_buf_t *db, void *tag)
{
dbuf_rele((dmu_buf_impl_t *)db, tag);
}
/*
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
* db_dirtycnt and db_holds to be updated atomically.
*/
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
int64_t holds;
ASSERT(MUTEX_HELD(&db->db_mtx));
DBUF_VERIFY(db);
/*
* Remove the reference to the dbuf before removing its hold on the
* dnode so we can guarantee in dnode_move() that a referenced bonus
* buffer has a corresponding dnode hold.
*/
holds = refcount_remove(&db->db_holds, tag);
ASSERT(holds >= 0);
/*
* We can't freeze indirects if there is a possibility that they
* may be modified in the current syncing context.
*/
if (db->db_buf != NULL &&
holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
arc_buf_freeze(db->db_buf);
}
if (holds == db->db_dirtycnt &&
db->db_level == 0 && db->db_user_immediate_evict)
dbuf_evict_user(db);
if (holds == 0) {
if (db->db_blkid == DMU_BONUS_BLKID) {
dnode_t *dn;
boolean_t evict_dbuf = db->db_pending_evict;
/*
* If the dnode moves here, we cannot cross this
* barrier until the move completes.
*/
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
atomic_dec_32(&dn->dn_dbufs_count);
/*
* Decrementing the dbuf count means that the bonus
* buffer's dnode hold is no longer discounted in
* dnode_move(). The dnode cannot move until after
* the dnode_rele() below.
*/
DB_DNODE_EXIT(db);
/*
* Do not reference db after its lock is dropped.
* Another thread may evict it.
*/
mutex_exit(&db->db_mtx);
if (evict_dbuf)
dnode_evict_bonus(dn);
dnode_rele(dn, db);
} else if (db->db_buf == NULL) {
/*
* This is a special case: we never associated this
* dbuf with any data allocated from the ARC.
*/
ASSERT(db->db_state == DB_UNCACHED ||
db->db_state == DB_NOFILL);
dbuf_destroy(db);
} else if (arc_released(db->db_buf)) {
/*
* This dbuf has anonymous data associated with it.
*/
dbuf_destroy(db);
} else {
boolean_t do_arc_evict = B_FALSE;
blkptr_t bp;
spa_t *spa = dmu_objset_spa(db->db_objset);
if (!DBUF_IS_CACHEABLE(db) &&
db->db_blkptr != NULL &&
!BP_IS_HOLE(db->db_blkptr) &&
!BP_IS_EMBEDDED(db->db_blkptr)) {
do_arc_evict = B_TRUE;
bp = *db->db_blkptr;
}
if (!DBUF_IS_CACHEABLE(db) ||
db->db_pending_evict) {
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
multilist_insert(dbuf_cache, db);
(void) refcount_add_many(&dbuf_cache_size,
db->db.db_size, db);
mutex_exit(&db->db_mtx);
dbuf_evict_notify();
}
if (do_arc_evict)
arc_freed(spa, &bp);
}
} else {
mutex_exit(&db->db_mtx);
}
}
#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
return (refcount_count(&db->db_holds));
}
void *
dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
dmu_buf_user_t *new_user)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
mutex_enter(&db->db_mtx);
dbuf_verify_user(db, DBVU_NOT_EVICTING);
if (db->db_user == old_user)
db->db_user = new_user;
else
old_user = db->db_user;
dbuf_verify_user(db, DBVU_NOT_EVICTING);
mutex_exit(&db->db_mtx);
return (old_user);
}
void *
dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
return (dmu_buf_replace_user(db_fake, NULL, user));
}
void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
db->db_user_immediate_evict = TRUE;
return (dmu_buf_set_user(db_fake, user));
}
void *
dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
return (dmu_buf_replace_user(db_fake, user, NULL));
}
void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dbuf_verify_user(db, DBVU_NOT_EVICTING);
return (db->db_user);
}
void
dmu_buf_user_evict_wait()
{
taskq_wait(dbu_evict_taskq);
}
blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
return (dbi->db_blkptr);
}
objset_t *
dmu_buf_get_objset(dmu_buf_t *db)
{
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
return (dbi->db_objset);
}
dnode_t *
dmu_buf_dnode_enter(dmu_buf_t *db)
{
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
DB_DNODE_ENTER(dbi);
return (DB_DNODE(dbi));
}
void
dmu_buf_dnode_exit(dmu_buf_t *db)
{
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
DB_DNODE_EXIT(dbi);
}
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
/* ASSERT(dmu_tx_is_syncing(tx) */
ASSERT(MUTEX_HELD(&db->db_mtx));
if (db->db_blkptr != NULL)
return;
if (db->db_blkid == DMU_SPILL_BLKID) {
db->db_blkptr = &dn->dn_phys->dn_spill;
BP_ZERO(db->db_blkptr);
return;
}
if (db->db_level == dn->dn_phys->dn_nlevels-1) {
/*
* This buffer was allocated at a time when there was
* no available blkptrs from the dnode, or it was
* inappropriate to hook it in (i.e., nlevels mis-match).
*/
ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
ASSERT(db->db_parent == NULL);
db->db_parent = dn->dn_dbuf;
db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
DBUF_VERIFY(db);
} else {
dmu_buf_impl_t *parent = db->db_parent;
int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
ASSERT(dn->dn_phys->dn_nlevels > 1);
if (parent == NULL) {
mutex_exit(&db->db_mtx);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
parent = dbuf_hold_level(dn, db->db_level + 1,
db->db_blkid >> epbs, db);
rw_exit(&dn->dn_struct_rwlock);
mutex_enter(&db->db_mtx);
db->db_parent = parent;
}
db->db_blkptr = (blkptr_t *)parent->db.db_data +
(db->db_blkid & ((1ULL << epbs) - 1));
DBUF_VERIFY(db);
}
}
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
dnode_t *dn;
zio_t *zio;
ASSERT(dmu_tx_is_syncing(tx));
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
mutex_enter(&db->db_mtx);
ASSERT(db->db_level > 0);
DBUF_VERIFY(db);
/* Read the block if it hasn't been read yet. */
if (db->db_buf == NULL) {
mutex_exit(&db->db_mtx);
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
mutex_enter(&db->db_mtx);
}
ASSERT3U(db->db_state, ==, DB_CACHED);
ASSERT(db->db_buf != NULL);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
/* Indirect block size must match what the dnode thinks it is. */
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
dbuf_check_blkptr(dn, db);
DB_DNODE_EXIT(db);
/* Provide the pending dirty record to child dbufs */
db->db_data_pending = dr;
mutex_exit(&db->db_mtx);
+
dbuf_write(dr, db->db_buf, tx);
zio = dr->dr_zio;
mutex_enter(&dr->dt.di.dr_mtx);
dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
mutex_exit(&dr->dt.di.dr_mtx);
zio_nowait(zio);
}
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
arc_buf_t **datap = &dr->dt.dl.dr_data;
dmu_buf_impl_t *db = dr->dr_dbuf;
dnode_t *dn;
objset_t *os;
uint64_t txg = tx->tx_txg;
ASSERT(dmu_tx_is_syncing(tx));
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
mutex_enter(&db->db_mtx);
/*
* To be synced, we must be dirtied. But we
* might have been freed after the dirty.
*/
if (db->db_state == DB_UNCACHED) {
/* This buffer has been freed since it was dirtied */
ASSERT(db->db.db_data == NULL);
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
} else {
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
DBUF_VERIFY(db);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if (db->db_blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
mutex_exit(&dn->dn_mtx);
}
/*
* If this is a bonus buffer, simply copy the bonus data into the
* dnode. It will be written out when the dnode is synced (and it
* will be synced, since it must have been dirty for dbuf_sync to
* be called).
*/
if (db->db_blkid == DMU_BONUS_BLKID) {
dbuf_dirty_record_t **drp;
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
DB_DNODE_EXIT(db);
if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
}
db->db_data_pending = NULL;
drp = &db->db_last_dirty;
while (*drp != dr)
drp = &(*drp)->dr_next;
ASSERT(dr->dr_next == NULL);
ASSERT(dr->dr_dbuf == db);
*drp = dr->dr_next;
if (dr->dr_dbuf->db_level != 0) {
list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
return;
}
os = dn->dn_objset;
/*
* This function may have dropped the db_mtx lock allowing a dmu_sync
* operation to sneak in. As a result, we need to ensure that we
* don't check the dr_override_state until we have returned from
* dbuf_check_blkptr.
*/
dbuf_check_blkptr(dn, db);
/*
* If this buffer is in the middle of an immediate write,
* wait for the synchronous IO to complete.
*/
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
cv_wait(&db->db_changed, &db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
if (db->db_state != DB_NOFILL &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
*datap == db->db_buf) {
/*
* If this buffer is currently "in use" (i.e., there
* are active holds and db_data still references it),
* then make a copy before we start the write so that
* any modifications from the open txg will not leak
* into this write.
*
* NOTE: this copy does not need to be made for
* objects only modified in the syncing context (e.g.
* DNONE_DNODE blocks).
*/
int psize = arc_buf_size(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
enum zio_compress compress_type = arc_get_compression(*datap);
if (compress_type == ZIO_COMPRESS_OFF) {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
} else {
ASSERT3U(type, ==, ARC_BUFC_DATA);
int lsize = arc_buf_lsize(*datap);
*datap = arc_alloc_compressed_buf(os->os_spa, db,
psize, lsize, compress_type);
}
bcopy(db->db.db_data, (*datap)->b_data, psize);
}
db->db_data_pending = dr;
mutex_exit(&db->db_mtx);
dbuf_write(dr, *datap, tx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
if (dn->dn_object == DMU_META_DNODE_OBJECT) {
list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
DB_DNODE_EXIT(db);
} else {
/*
* Although zio_nowait() does not "wait for an IO", it does
* initiate the IO. If this is an empty write it seems plausible
* that the IO could actually be completed before the nowait
* returns. We need to DB_DNODE_EXIT() first in case
* zio_nowait() invalidates the dbuf.
*/
DB_DNODE_EXIT(db);
zio_nowait(dr->dr_zio);
}
}
void
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
dbuf_dirty_record_t *dr;
while (dr = list_head(list)) {
if (dr->dr_zio != NULL) {
/*
* If we find an already initialized zio then we
* are processing the meta-dnode, and we have finished.
* The dbufs for all dnodes are put back on the list
* during processing, so that we can zio_wait()
* these IOs after initiating all child IOs.
*/
ASSERT3U(dr->dr_dbuf->db.db_object, ==,
DMU_META_DNODE_OBJECT);
break;
}
if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
VERIFY3U(dr->dr_dbuf->db_level, ==, level);
}
list_remove(list, dr);
if (dr->dr_dbuf->db_level > 0)
dbuf_sync_indirect(dr, tx);
else
dbuf_sync_leaf(dr, tx);
}
}
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
dnode_t *dn;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
spa_t *spa = zio->io_spa;
int64_t delta;
uint64_t fill = 0;
int i;
ASSERT3P(db->db_blkptr, !=, NULL);
ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
if (bp->blk_birth != 0) {
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_bonustype) ||
BP_IS_EMBEDDED(bp));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
}
mutex_enter(&db->db_mtx);
#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(bp)) &&
db->db_blkptr == &dn->dn_phys->dn_spill);
}
#endif
if (db->db_level == 0) {
mutex_enter(&dn->dn_mtx);
if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
db->db_blkid != DMU_SPILL_BLKID)
dn->dn_phys->dn_maxblkid = db->db_blkid;
mutex_exit(&dn->dn_mtx);
if (dn->dn_type == DMU_OT_DNODE) {
dnode_phys_t *dnp = db->db.db_data;
for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
i--, dnp++) {
if (dnp->dn_type != DMU_OT_NONE)
fill++;
}
} else {
if (BP_IS_HOLE(bp)) {
fill = 0;
} else {
fill = 1;
}
}
} else {
blkptr_t *ibp = db->db.db_data;
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp))
continue;
fill += BP_GET_FILL(ibp);
}
}
DB_DNODE_EXIT(db);
if (!BP_IS_EMBEDDED(bp))
bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
*db->db_blkptr = *bp;
rw_exit(&dn->dn_struct_rwlock);
}
/* ARGSUSED */
/*
* This function gets called just prior to running through the compression
* stage of the zio pipeline. If we're an indirect block comprised of only
* holes, then we want this indirect to be compressed away to a hole. In
* order to do that we must zero out any information about the holes that
* this indirect points to prior to before we try to compress it.
*/
static void
dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
dnode_t *dn;
blkptr_t *bp;
unsigned int epbs, i;
ASSERT3U(db->db_level, >, 0);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
ASSERT3U(epbs, <, 31);
/* Determine if all our children are holes */
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
if (!BP_IS_HOLE(bp))
break;
}
/*
* If all the children are holes, then zero them all out so that
* we may get compressed away.
*/
if (i == 1 << epbs) {
/*
* We only found holes. Grab the rwlock to prevent
* anybody from reading the blocks we're about to
* zero out.
*/
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
bzero(db->db.db_data, db->db.db_size);
rw_exit(&dn->dn_struct_rwlock);
}
DB_DNODE_EXIT(db);
}
/*
* The SPA will call this callback several times for each zio - once
* for every physical child i/o (zio->io_phys_children times). This
* allows the DMU to monitor the progress of each logical i/o. For example,
* there may be 2 copies of an indirect block, or many fragments of a RAID-Z
* block. There may be a long delay before all copies/fragments are completed,
* so this callback allows us to retire dirty space gradually, as the physical
* i/os complete.
*/
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
dmu_buf_impl_t *db = arg;
objset_t *os = db->db_objset;
dsl_pool_t *dp = dmu_objset_pool(os);
dbuf_dirty_record_t *dr;
int delta = 0;
dr = db->db_data_pending;
ASSERT3U(dr->dr_txg, ==, zio->io_txg);
/*
* The callback will be called io_phys_children times. Retire one
* portion of our dirty space each time we are called. Any rounding
* error will be cleaned up by dsl_pool_sync()'s call to
* dsl_pool_undirty_space().
*/
delta = dr->dr_accounted / zio->io_phys_children;
dsl_pool_undirty_space(dp, delta, zio->io_txg);
}
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
blkptr_t *bp_orig = &zio->io_bp_orig;
blkptr_t *bp = db->db_blkptr;
objset_t *os = db->db_objset;
dmu_tx_t *tx = os->os_synctx;
dbuf_dirty_record_t **drp, *dr;
ASSERT0(zio->io_error);
ASSERT(db->db_blkptr == bp);
/*
* For nopwrites and rewrites we ensure that the bp matches our
* original and bypass all the accounting.
*/
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
dsl_dataset_t *ds = os->os_dsl_dataset;
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
dsl_dataset_block_born(ds, bp, tx);
}
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
drp = &db->db_last_dirty;
while ((dr = *drp) != db->db_data_pending)
drp = &dr->dr_next;
ASSERT(!list_link_active(&dr->dr_dirty_node));
ASSERT(dr->dr_dbuf == db);
ASSERT(dr->dr_next == NULL);
*drp = dr->dr_next;
#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
dnode_t *dn;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
db->db_blkptr == &dn->dn_phys->dn_spill);
DB_DNODE_EXIT(db);
}
#endif
if (db->db_level == 0) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
if (db->db_state != DB_NOFILL) {
if (dr->dt.dl.dr_data != db->db_buf)
arc_buf_destroy(dr->dt.dl.dr_data, db);
}
} else {
dnode_t *dn;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
int epbs =
dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
ASSERT3U(db->db_blkid, <=,
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
}
DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
list_destroy(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
cv_broadcast(&db->db_changed);
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}
static void
dbuf_write_nofill_ready(zio_t *zio)
{
dbuf_write_ready(zio, NULL, zio->io_private);
}
static void
dbuf_write_nofill_done(zio_t *zio)
{
dbuf_write_done(zio, NULL, zio->io_private);
}
static void
dbuf_write_override_ready(zio_t *zio)
{
dbuf_dirty_record_t *dr = zio->io_private;
dmu_buf_impl_t *db = dr->dr_dbuf;
dbuf_write_ready(zio, NULL, db);
}
static void
dbuf_write_override_done(zio_t *zio)
{
dbuf_dirty_record_t *dr = zio->io_private;
dmu_buf_impl_t *db = dr->dr_dbuf;
blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
mutex_enter(&db->db_mtx);
if (!BP_EQUAL(zio->io_bp, obp)) {
if (!BP_IS_HOLE(obp))
dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
arc_release(dr->dt.dl.dr_data, db);
}
mutex_exit(&db->db_mtx);
dbuf_write_done(zio, NULL, db);
if (zio->io_abd != NULL)
abd_put(zio->io_abd);
}
+typedef struct dbuf_remap_impl_callback_arg {
+ objset_t *drica_os;
+ uint64_t drica_blk_birth;
+ dmu_tx_t *drica_tx;
+} dbuf_remap_impl_callback_arg_t;
+
+static void
+dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ dbuf_remap_impl_callback_arg_t *drica = arg;
+ objset_t *os = drica->drica_os;
+ spa_t *spa = dmu_objset_spa(os);
+ dmu_tx_t *tx = drica->drica_tx;
+
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (os == spa_meta_objset(spa)) {
+ spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+ } else {
+ dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
+ size, drica->drica_blk_birth, tx);
+ }
+}
+
+static void
+dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
+{
+ blkptr_t bp_copy = *bp;
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ dbuf_remap_impl_callback_arg_t drica;
+
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ drica.drica_os = dn->dn_objset;
+ drica.drica_blk_birth = bp->blk_birth;
+ drica.drica_tx = tx;
+ if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
+ &drica)) {
+ /*
+ * The struct_rwlock prevents dbuf_read_impl() from
+ * dereferencing the BP while we are changing it. To
+ * avoid lock contention, only grab it when we are actually
+ * changing the BP.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ *bp = bp_copy;
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+}
+
+/*
+ * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting
+ * to remap a copy of every bp in the dbuf.
+ */
+boolean_t
+dbuf_can_remap(const dmu_buf_impl_t *db)
+{
+ spa_t *spa = dmu_objset_spa(db->db_objset);
+ blkptr_t *bp = db->db.db_data;
+ boolean_t ret = B_FALSE;
+
+ ASSERT3U(db->db_level, >, 0);
+ ASSERT3S(db->db_state, ==, DB_CACHED);
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
+ blkptr_t bp_copy = bp[i];
+ if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
+ ret = B_TRUE;
+ break;
+ }
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ return (ret);
+}
+
+boolean_t
+dnode_needs_remap(const dnode_t *dn)
+{
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ boolean_t ret = B_FALSE;
+
+ if (dn->dn_phys->dn_nlevels == 0) {
+ return (B_FALSE);
+ }
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) {
+ blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j];
+ if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
+ ret = B_TRUE;
+ break;
+ }
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ return (ret);
+}
+
+/*
+ * Remap any existing BP's to concrete vdevs, if possible.
+ */
+static void
+dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(db->db_objset);
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
+ return;
+
+ if (db->db_level > 0) {
+ blkptr_t *bp = db->db.db_data;
+ for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
+ dbuf_remap_impl(dn, &bp[i], tx);
+ }
+ } else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dnode_phys_t *dnp = db->db.db_data;
+ ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
+ DMU_OT_DNODE);
+ for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) {
+ for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
+ dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
+ }
+ }
+ }
+}
+
+
/* Issue I/O to commit a dirty buffer to disk. */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
dnode_t *dn;
objset_t *os;
dmu_buf_impl_t *parent = db->db_parent;
uint64_t txg = tx->tx_txg;
zbookmark_phys_t zb;
zio_prop_t zp;
zio_t *zio;
int wp_flag = 0;
ASSERT(dmu_tx_is_syncing(tx));
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
os = dn->dn_objset;
if (db->db_state != DB_NOFILL) {
if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
/*
* Private object buffers are released here rather
* than in dbuf_dirty() since they are only modified
* in the syncing context and we don't want the
* overhead of making multiple copies of the data.
*/
if (BP_IS_HOLE(db->db_blkptr)) {
arc_buf_thaw(data);
} else {
dbuf_release_bp(db);
}
+ dbuf_remap(dn, db, tx);
}
}
if (parent != dn->dn_dbuf) {
/* Our parent is an indirect block. */
/* We have a dirty parent that has been scheduled for write. */
ASSERT(parent && parent->db_data_pending);
/* Our parent's buffer is one level closer to the dnode. */
ASSERT(db->db_level == parent->db_level-1);
/*
* We're about to modify our parent's db_data by modifying
* our block pointer, so the parent must be released.
*/
ASSERT(arc_released(parent->db_buf));
zio = parent->db_data_pending->dr_zio;
} else {
/* Our parent is the dnode itself. */
ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
db->db_blkid != DMU_SPILL_BLKID) ||
(db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
if (db->db_blkid != DMU_SPILL_BLKID)
ASSERT3P(db->db_blkptr, ==,
&dn->dn_phys->dn_blkptr[db->db_blkid]);
zio = dn->dn_zio;
}
ASSERT(db->db_level == 0 || data == db->db_buf);
ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
ASSERT(zio);
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
db->db.db_object, db->db_level, db->db_blkid);
if (db->db_blkid == DMU_SPILL_BLKID)
wp_flag = WP_SPILL;
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
DB_DNODE_EXIT(db);
/*
* We copy the blkptr now (rather than when we instantiate the dirty
* record), because its value can change between open context and
* syncing context. We do not need to hold dn_struct_rwlock to read
* db_blkptr because we are in syncing context.
*/
dr->dr_bp_copy = *db->db_blkptr;
if (db->db_level == 0 &&
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
* The BP for this block has been provided by open context
* (by dmu_sync() or dmu_buf_write_embedded()).
*/
abd_t *contents = (data != NULL) ?
abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,
dbuf_write_override_ready, NULL, NULL,
dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
dbuf_write_nofill_ready, NULL, NULL,
dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
} else {
ASSERT(arc_released(data));
/*
* For indirect blocks, we want to setup the children
* ready callback so that we can properly handle an indirect
* block that only contains holes.
*/
arc_done_func_t *children_ready_cb = NULL;
if (db->db_level != 0)
children_ready_cb = dbuf_write_children_ready;
dr->dr_zio = arc_write(zio, os->os_spa, txg,
&dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
&zp, dbuf_write_ready, children_ready_cb,
dbuf_write_physdone, dbuf_write_done, db,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c (revision 332525)
@@ -1,1164 +1,1164 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/ddt.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
/*
* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
*/
int zfs_dedup_prefetch = 1;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP");
SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch,
0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed");
static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
&ddt_zap_ops,
};
static const char *ddt_class_name[DDT_CLASSES] = {
"ditto",
"duplicate",
"unique",
};
static void
ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
dmu_tx_t *tx)
{
spa_t *spa = ddt->ddt_spa;
objset_t *os = ddt->ddt_os;
uint64_t *objectp = &ddt->ddt_object[type][class];
boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP;
char name[DDT_NAMELEN];
ddt_object_name(ddt, type, class, name);
ASSERT(*objectp == 0);
VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
ASSERT(*objectp != 0);
VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
sizeof (uint64_t), 1, objectp, tx) == 0);
VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
&ddt->ddt_histogram[type][class], tx) == 0);
}
static void
ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
dmu_tx_t *tx)
{
spa_t *spa = ddt->ddt_spa;
objset_t *os = ddt->ddt_os;
uint64_t *objectp = &ddt->ddt_object[type][class];
uint64_t count;
char name[DDT_NAMELEN];
ddt_object_name(ddt, type, class, name);
ASSERT(*objectp != 0);
VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
*objectp = 0;
}
static int
ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
dmu_object_info_t doi;
uint64_t count;
char name[DDT_NAMELEN];
int error;
ddt_object_name(ddt, type, class, name);
error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
if (error != 0)
return (error);
VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
&ddt->ddt_histogram[type][class]));
/*
* Seed the cached statistics.
*/
VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
error = ddt_object_count(ddt, type, class, &count);
if (error)
return error;
ddo->ddo_count = count;
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
return (0);
}
static void
ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
dmu_tx_t *tx)
{
ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
dmu_object_info_t doi;
uint64_t count;
char name[DDT_NAMELEN];
ddt_object_name(ddt, type, class, name);
VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
&ddt->ddt_histogram[type][class], tx) == 0);
/*
* Cache DDT statistics; this is the only time they'll change.
*/
VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
VERIFY(ddt_object_count(ddt, type, class, &count) == 0);
ddo->ddo_count = count;
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
}
static int
ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
ddt_entry_t *dde)
{
if (!ddt_object_exists(ddt, type, class))
return (SET_ERROR(ENOENT));
return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
ddt->ddt_object[type][class], dde));
}
static void
ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
ddt_entry_t *dde)
{
if (!ddt_object_exists(ddt, type, class))
return;
ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
ddt->ddt_object[type][class], dde);
}
int
ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
ddt_entry_t *dde, dmu_tx_t *tx)
{
ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
ddt->ddt_object[type][class], dde, tx));
}
static int
ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
ddt_entry_t *dde, dmu_tx_t *tx)
{
ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
ddt->ddt_object[type][class], dde, tx));
}
int
ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
uint64_t *walk, ddt_entry_t *dde)
{
ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
ddt->ddt_object[type][class], dde, walk));
}
int
ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count)
{
ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
ddt->ddt_object[type][class], count));
}
int
ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
dmu_object_info_t *doi)
{
if (!ddt_object_exists(ddt, type, class))
return (SET_ERROR(ENOENT));
return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
doi));
}
boolean_t
ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
return (!!ddt->ddt_object[type][class]);
}
void
ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
char *name)
{
(void) sprintf(name, DMU_POOL_DDT,
zio_checksum_table[ddt->ddt_checksum].ci_name,
ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
}
void
ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
{
ASSERT(txg != 0);
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
bp->blk_dva[d] = ddp->ddp_dva[d];
BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
}
void
ddt_bp_create(enum zio_checksum checksum,
const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
{
BP_ZERO(bp);
if (ddp != NULL)
ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
bp->blk_cksum = ddk->ddk_cksum;
bp->blk_fill = 1;
BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
BP_SET_CHECKSUM(bp, checksum);
BP_SET_TYPE(bp, DMU_OT_DEDUP);
BP_SET_LEVEL(bp, 0);
BP_SET_DEDUP(bp, 0);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
}
void
ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
{
ddk->ddk_cksum = bp->blk_cksum;
ddk->ddk_prop = 0;
DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
}
void
ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
{
ASSERT(ddp->ddp_phys_birth == 0);
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
ddp->ddp_dva[d] = bp->blk_dva[d];
ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
}
void
ddt_phys_clear(ddt_phys_t *ddp)
{
bzero(ddp, sizeof (*ddp));
}
void
ddt_phys_addref(ddt_phys_t *ddp)
{
ddp->ddp_refcnt++;
}
void
ddt_phys_decref(ddt_phys_t *ddp)
{
ASSERT((int64_t)ddp->ddp_refcnt > 0);
ddp->ddp_refcnt--;
}
void
ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
{
blkptr_t blk;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
ddt_phys_clear(ddp);
zio_free(ddt->ddt_spa, txg, &blk);
}
ddt_phys_t *
ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
{
ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
return (ddp);
}
return (NULL);
}
uint64_t
ddt_phys_total_refcnt(const ddt_entry_t *dde)
{
uint64_t refcnt = 0;
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
refcnt += dde->dde_phys[p].ddp_refcnt;
return (refcnt);
}
static void
ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
{
spa_t *spa = ddt->ddt_spa;
ddt_phys_t *ddp = dde->dde_phys;
ddt_key_t *ddk = &dde->dde_key;
uint64_t lsize = DDK_GET_LSIZE(ddk);
uint64_t psize = DDK_GET_PSIZE(ddk);
bzero(dds, sizeof (*dds));
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
uint64_t dsize = 0;
uint64_t refcnt = ddp->ddp_refcnt;
if (ddp->ddp_phys_birth == 0)
continue;
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
dds->dds_blocks += 1;
dds->dds_lsize += lsize;
dds->dds_psize += psize;
dds->dds_dsize += dsize;
dds->dds_ref_blocks += refcnt;
dds->dds_ref_lsize += lsize * refcnt;
dds->dds_ref_psize += psize * refcnt;
dds->dds_ref_dsize += dsize * refcnt;
}
}
void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
{
const uint64_t *s = (const uint64_t *)src;
uint64_t *d = (uint64_t *)dst;
uint64_t *d_end = (uint64_t *)(dst + 1);
ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
while (d < d_end)
*d++ += (*s++ ^ neg) - neg;
}
static void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
{
ddt_stat_t dds;
ddt_histogram_t *ddh;
int bucket;
ddt_stat_generate(ddt, dde, &dds);
bucket = highbit64(dds.dds_ref_blocks) - 1;
ASSERT(bucket >= 0);
ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
}
void
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
{
for (int h = 0; h < 64; h++)
ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
}
void
ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
{
bzero(dds, sizeof (*dds));
for (int h = 0; h < 64; h++)
ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
}
boolean_t
ddt_histogram_empty(const ddt_histogram_t *ddh)
{
const uint64_t *s = (const uint64_t *)ddh;
const uint64_t *s_end = (const uint64_t *)(ddh + 1);
while (s < s_end)
if (*s++ != 0)
return (B_FALSE);
return (B_TRUE);
}
void
ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
{
/* Sum the statistics we cached in ddt_object_sync(). */
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
for (enum ddt_class class = 0; class < DDT_CLASSES;
class++) {
ddt_object_t *ddo =
&ddt->ddt_object_stats[type][class];
ddo_total->ddo_count += ddo->ddo_count;
ddo_total->ddo_dspace += ddo->ddo_dspace;
ddo_total->ddo_mspace += ddo->ddo_mspace;
}
}
}
/* ... and compute the averages. */
if (ddo_total->ddo_count != 0) {
ddo_total->ddo_dspace /= ddo_total->ddo_count;
ddo_total->ddo_mspace /= ddo_total->ddo_count;
}
}
void
ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
{
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
for (enum ddt_class class = 0; class < DDT_CLASSES;
class++) {
ddt_histogram_add(ddh,
&ddt->ddt_histogram_cache[type][class]);
}
}
}
}
void
ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
{
ddt_histogram_t *ddh_total;
ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
ddt_get_dedup_histogram(spa, ddh_total);
ddt_histogram_stat(dds_total, ddh_total);
kmem_free(ddh_total, sizeof (ddt_histogram_t));
}
uint64_t
ddt_get_dedup_dspace(spa_t *spa)
{
ddt_stat_t dds_total = { 0 };
ddt_get_dedup_stats(spa, &dds_total);
return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
}
uint64_t
ddt_get_pool_dedup_ratio(spa_t *spa)
{
ddt_stat_t dds_total = { 0 };
ddt_get_dedup_stats(spa, &dds_total);
if (dds_total.dds_dsize == 0)
return (100);
return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
}
int
ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
{
spa_t *spa = ddt->ddt_spa;
uint64_t total_refcnt = 0;
uint64_t ditto = spa->spa_dedup_ditto;
int total_copies = 0;
int desired_copies = 0;
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
ddt_phys_t *ddp = &dde->dde_phys[p];
zio_t *zio = dde->dde_lead_zio[p];
uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */
if (zio != NULL)
refcnt += zio->io_parent_count; /* pending refs */
if (ddp == ddp_willref)
refcnt++; /* caller's ref */
if (refcnt != 0) {
total_refcnt += refcnt;
total_copies += p;
}
}
if (ditto == 0 || ditto > UINT32_MAX)
ditto = UINT32_MAX;
if (total_refcnt >= 1)
desired_copies++;
if (total_refcnt >= ditto)
desired_copies++;
if (total_refcnt >= ditto * ditto)
desired_copies++;
return (MAX(desired_copies, total_copies) - total_copies);
}
int
ddt_ditto_copies_present(ddt_entry_t *dde)
{
ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
dva_t *dva = ddp->ddp_dva;
int copies = 0 - DVA_GET_GANG(dva);
for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
if (DVA_IS_VALID(dva))
copies++;
ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
return (copies);
}
size_t
ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
{
uchar_t *version = dst++;
int cpfunc = ZIO_COMPRESS_ZLE;
zio_compress_info_t *ci = &zio_compress_table[cpfunc];
size_t c_len;
ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
if (c_len == s_len) {
cpfunc = ZIO_COMPRESS_OFF;
bcopy(src, dst, s_len);
}
*version = cpfunc;
/* CONSTCOND */
if (ZFS_HOST_BYTEORDER)
*version |= DDT_COMPRESS_BYTEORDER_MASK;
return (c_len + 1);
}
void
ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
{
uchar_t version = *src++;
int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
zio_compress_info_t *ci = &zio_compress_table[cpfunc];
if (ci->ci_decompress != NULL)
(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
else
bcopy(src, dst, d_len);
if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
(ZFS_HOST_BYTEORDER != 0))
byteswap_uint64_array(dst, d_len);
}
ddt_t *
ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
{
return (spa->spa_ddt[c]);
}
ddt_t *
ddt_select(spa_t *spa, const blkptr_t *bp)
{
return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
}
void
ddt_enter(ddt_t *ddt)
{
mutex_enter(&ddt->ddt_lock);
}
void
ddt_exit(ddt_t *ddt)
{
mutex_exit(&ddt->ddt_lock);
}
static ddt_entry_t *
ddt_alloc(const ddt_key_t *ddk)
{
ddt_entry_t *dde;
dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
dde->dde_key = *ddk;
return (dde);
}
static void
ddt_free(ddt_entry_t *dde)
{
ASSERT(!dde->dde_loading);
for (int p = 0; p < DDT_PHYS_TYPES; p++)
ASSERT(dde->dde_lead_zio[p] == NULL);
if (dde->dde_repair_abd != NULL)
abd_free(dde->dde_repair_abd);
cv_destroy(&dde->dde_cv);
kmem_free(dde, sizeof (*dde));
}
void
ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
{
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
avl_remove(&ddt->ddt_tree, dde);
ddt_free(dde);
}
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{
ddt_entry_t *dde, dde_search;
enum ddt_type type;
enum ddt_class class;
avl_index_t where;
int error;
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
ddt_key_fill(&dde_search.dde_key, bp);
dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
if (dde == NULL) {
if (!add)
return (NULL);
dde = ddt_alloc(&dde_search.dde_key);
avl_insert(&ddt->ddt_tree, dde, where);
}
while (dde->dde_loading)
cv_wait(&dde->dde_cv, &ddt->ddt_lock);
if (dde->dde_loaded)
return (dde);
dde->dde_loading = B_TRUE;
ddt_exit(ddt);
error = ENOENT;
for (type = 0; type < DDT_TYPES; type++) {
for (class = 0; class < DDT_CLASSES; class++) {
error = ddt_object_lookup(ddt, type, class, dde);
- if (error != ENOENT)
+ if (error != ENOENT) {
+ ASSERT0(error);
break;
+ }
}
if (error != ENOENT)
break;
}
- ASSERT(error == 0 || error == ENOENT);
-
ddt_enter(ddt);
ASSERT(dde->dde_loaded == B_FALSE);
ASSERT(dde->dde_loading == B_TRUE);
dde->dde_type = type; /* will be DDT_TYPES if no entry found */
dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
dde->dde_loaded = B_TRUE;
dde->dde_loading = B_FALSE;
if (error == 0)
ddt_stat_update(ddt, dde, -1ULL);
cv_broadcast(&dde->dde_cv);
return (dde);
}
void
ddt_prefetch(spa_t *spa, const blkptr_t *bp)
{
ddt_t *ddt;
ddt_entry_t dde;
if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
return;
/*
* We only remove the DDT once all tables are empty and only
* prefetch dedup blocks when there are entries in the DDT.
* Thus no locking is required as the DDT can't disappear on us.
*/
ddt = ddt_select(spa, bp);
ddt_key_fill(&dde.dde_key, bp);
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
ddt_object_prefetch(ddt, type, class, &dde);
}
}
}
int
ddt_entry_compare(const void *x1, const void *x2)
{
const ddt_entry_t *dde1 = x1;
const ddt_entry_t *dde2 = x2;
const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
for (int i = 0; i < DDT_KEY_WORDS; i++) {
if (u1[i] < u2[i])
return (-1);
if (u1[i] > u2[i])
return (1);
}
return (0);
}
static ddt_t *
ddt_table_alloc(spa_t *spa, enum zio_checksum c)
{
ddt_t *ddt;
ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&ddt->ddt_tree, ddt_entry_compare,
sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
ddt->ddt_checksum = c;
ddt->ddt_spa = spa;
ddt->ddt_os = spa->spa_meta_objset;
return (ddt);
}
static void
ddt_table_free(ddt_t *ddt)
{
ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
avl_destroy(&ddt->ddt_tree);
avl_destroy(&ddt->ddt_repair_tree);
mutex_destroy(&ddt->ddt_lock);
kmem_free(ddt, sizeof (*ddt));
}
void
ddt_create(spa_t *spa)
{
spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
spa->spa_ddt[c] = ddt_table_alloc(spa, c);
}
int
ddt_load(spa_t *spa)
{
int error;
ddt_create(spa);
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
&spa->spa_ddt_stat_object);
if (error)
return (error == ENOENT ? 0 : error);
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
for (enum ddt_class class = 0; class < DDT_CLASSES;
class++) {
error = ddt_object_load(ddt, type, class);
if (error != 0 && error != ENOENT)
return (error);
}
}
/*
* Seed the cached histograms.
*/
bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
sizeof (ddt->ddt_histogram));
}
return (0);
}
void
ddt_unload(spa_t *spa)
{
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
if (spa->spa_ddt[c]) {
ddt_table_free(spa->spa_ddt[c]);
spa->spa_ddt[c] = NULL;
}
}
}
boolean_t
ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
{
ddt_t *ddt;
ddt_entry_t dde;
if (!BP_GET_DEDUP(bp))
return (B_FALSE);
if (max_class == DDT_CLASS_UNIQUE)
return (B_TRUE);
ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
ddt_key_fill(&dde.dde_key, bp);
for (enum ddt_type type = 0; type < DDT_TYPES; type++)
for (enum ddt_class class = 0; class <= max_class; class++)
if (ddt_object_lookup(ddt, type, class, &dde) == 0)
return (B_TRUE);
return (B_FALSE);
}
ddt_entry_t *
ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
{
ddt_key_t ddk;
ddt_entry_t *dde;
ddt_key_fill(&ddk, bp);
dde = ddt_alloc(&ddk);
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
/*
* We can only do repair if there are multiple copies
* of the block. For anything in the UNIQUE class,
* there's definitely only one copy, so don't even try.
*/
if (class != DDT_CLASS_UNIQUE &&
ddt_object_lookup(ddt, type, class, dde) == 0)
return (dde);
}
}
bzero(dde->dde_phys, sizeof (dde->dde_phys));
return (dde);
}
void
ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
{
avl_index_t where;
ddt_enter(ddt);
if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
avl_insert(&ddt->ddt_repair_tree, dde, where);
else
ddt_free(dde);
ddt_exit(ddt);
}
static void
ddt_repair_entry_done(zio_t *zio)
{
ddt_entry_t *rdde = zio->io_private;
ddt_free(rdde);
}
static void
ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
{
ddt_phys_t *ddp = dde->dde_phys;
ddt_phys_t *rddp = rdde->dde_phys;
ddt_key_t *ddk = &dde->dde_key;
ddt_key_t *rddk = &rdde->dde_key;
zio_t *zio;
blkptr_t blk;
zio = zio_null(rio, rio->io_spa, NULL,
ddt_repair_entry_done, rdde, rio->io_flags);
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
if (ddp->ddp_phys_birth == 0 ||
ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
}
zio_nowait(zio);
}
static void
ddt_repair_table(ddt_t *ddt, zio_t *rio)
{
spa_t *spa = ddt->ddt_spa;
ddt_entry_t *dde, *rdde_next, *rdde;
avl_tree_t *t = &ddt->ddt_repair_tree;
blkptr_t blk;
if (spa_sync_pass(spa) > 1)
return;
ddt_enter(ddt);
for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
rdde_next = AVL_NEXT(t, rdde);
avl_remove(&ddt->ddt_repair_tree, rdde);
ddt_exit(ddt);
ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
dde = ddt_repair_start(ddt, &blk);
ddt_repair_entry(ddt, dde, rdde, rio);
ddt_repair_done(ddt, dde);
ddt_enter(ddt);
}
ddt_exit(ddt);
}
static void
ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
{
dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
ddt_phys_t *ddp = dde->dde_phys;
ddt_key_t *ddk = &dde->dde_key;
enum ddt_type otype = dde->dde_type;
enum ddt_type ntype = DDT_TYPE_CURRENT;
enum ddt_class oclass = dde->dde_class;
enum ddt_class nclass;
uint64_t total_refcnt = 0;
ASSERT(dde->dde_loaded);
ASSERT(!dde->dde_loading);
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
ASSERT(dde->dde_lead_zio[p] == NULL);
ASSERT((int64_t)ddp->ddp_refcnt >= 0);
if (ddp->ddp_phys_birth == 0) {
ASSERT(ddp->ddp_refcnt == 0);
continue;
}
if (p == DDT_PHYS_DITTO) {
if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
ddt_phys_free(ddt, ddk, ddp, txg);
continue;
}
if (ddp->ddp_refcnt == 0)
ddt_phys_free(ddt, ddk, ddp, txg);
total_refcnt += ddp->ddp_refcnt;
}
if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
nclass = DDT_CLASS_DITTO;
else if (total_refcnt > 1)
nclass = DDT_CLASS_DUPLICATE;
else
nclass = DDT_CLASS_UNIQUE;
if (otype != DDT_TYPES &&
(otype != ntype || oclass != nclass || total_refcnt == 0)) {
VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
}
if (total_refcnt != 0) {
dde->dde_type = ntype;
dde->dde_class = nclass;
ddt_stat_update(ddt, dde, 0);
if (!ddt_object_exists(ddt, ntype, nclass))
ddt_object_create(ddt, ntype, nclass, tx);
VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
/*
* If the class changes, the order that we scan this bp
* changes. If it decreases, we could miss it, so
* scan it right now. (This covers both class changing
* while we are doing ddt_walk(), and when we are
* traversing.)
*/
if (nclass < oclass) {
dsl_scan_ddt_entry(dp->dp_scan,
ddt->ddt_checksum, dde, tx);
}
}
}
static void
ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
{
spa_t *spa = ddt->ddt_spa;
ddt_entry_t *dde;
void *cookie = NULL;
if (avl_numnodes(&ddt->ddt_tree) == 0)
return;
ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
if (spa->spa_ddt_stat_object == 0) {
spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DDT_STATS, tx);
}
while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
ddt_sync_entry(ddt, dde, tx, txg);
ddt_free(dde);
}
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
uint64_t add, count = 0;
for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
if (ddt_object_exists(ddt, type, class)) {
ddt_object_sync(ddt, type, class, tx);
VERIFY(ddt_object_count(ddt, type, class,
&add) == 0);
count += add;
}
}
for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
if (count == 0 && ddt_object_exists(ddt, type, class))
ddt_object_destroy(ddt, type, class, tx);
}
}
bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
sizeof (ddt->ddt_histogram));
}
void
ddt_sync(spa_t *spa, uint64_t txg)
{
dmu_tx_t *tx;
zio_t *rio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
ASSERT(spa_syncing_txg(spa) == txg);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (ddt == NULL)
continue;
ddt_sync_table(ddt, tx, txg);
ddt_repair_table(ddt, rio);
}
(void) zio_wait(rio);
dmu_tx_commit(tx);
}
int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
{
do {
do {
do {
ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
int error = ENOENT;
if (ddt_object_exists(ddt, ddb->ddb_type,
ddb->ddb_class)) {
error = ddt_object_walk(ddt,
ddb->ddb_type, ddb->ddb_class,
&ddb->ddb_cursor, dde);
}
dde->dde_type = ddb->ddb_type;
dde->dde_class = ddb->ddb_class;
if (error == 0)
return (0);
if (error != ENOENT)
return (error);
ddb->ddb_cursor = 0;
} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
ddb->ddb_checksum = 0;
} while (++ddb->ddb_type < DDT_TYPES);
ddb->ddb_type = 0;
} while (++ddb->ddb_class < DDT_CLASSES);
return (SET_ERROR(ENOENT));
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 332525)
@@ -1,2535 +1,2659 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/sa.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/racct.h>
#include <sys/vm.h>
#include <sys/zfs_znode.h>
#endif
/*
* Enable/disable nopwrite feature.
*/
int zfs_nopwrite_enabled = 1;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
&zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
/*
* Tunable to control percentage of dirtied blocks from frees in one TXG.
* After this threshold is crossed, additional dirty blocks from frees
* wait until the next TXG.
* A value of zero will disable this throttle.
*/
uint32_t zfs_per_txg_dirty_frees_percent = 30;
SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
&zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg");
+/*
+ * This can be used for testing, to ensure that certain actions happen
+ * while in the middle of a remap (which might otherwise complete too
+ * quickly).
+ */
+int zfs_object_remap_one_indirect_delay_ticks = 0;
+
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
{ DMU_BSWAP_UINT64, TRUE, "object array" },
{ DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
{ DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
{ DMU_BSWAP_UINT64, TRUE, "bpobj" },
{ DMU_BSWAP_UINT64, TRUE, "bpobj header" },
{ DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
{ DMU_BSWAP_UINT64, TRUE, "SPA space map" },
{ DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
{ DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
{ DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
{ DMU_BSWAP_UINT64, TRUE, "DSL directory" },
{ DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
{ DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
{ DMU_BSWAP_ZAP, TRUE, "DSL props" },
{ DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
{ DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
{ DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
{ DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
{ DMU_BSWAP_UINT8, FALSE, "zvol object" },
{ DMU_BSWAP_ZAP, TRUE, "zvol prop" },
{ DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
{ DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
{ DMU_BSWAP_ZAP, TRUE, "other ZAP" },
{ DMU_BSWAP_ZAP, TRUE, "persistent error log" },
{ DMU_BSWAP_UINT8, TRUE, "SPA history" },
{ DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
{ DMU_BSWAP_ZAP, TRUE, "Pool properties" },
{ DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
{ DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
{ DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
{ DMU_BSWAP_UINT8, TRUE, "FUID table" },
{ DMU_BSWAP_UINT64, TRUE, "FUID table size" },
{ DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
{ DMU_BSWAP_ZAP, TRUE, "scan work queue" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
{ DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
{ DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
{ DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
{ DMU_BSWAP_UINT8, TRUE, "System attributes" },
{ DMU_BSWAP_ZAP, TRUE, "SA master node" },
{ DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
{ DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
{ DMU_BSWAP_ZAP, TRUE, "scan translations" },
{ DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
{ DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
{ DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
{ DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
{ DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
};
const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
{ byteswap_uint8_array, "uint8" },
{ byteswap_uint16_array, "uint16" },
{ byteswap_uint32_array, "uint32" },
{ byteswap_uint64_array, "uint64" },
{ zap_byteswap, "zap" },
{ dnode_buf_byteswap, "dnode" },
{ dmu_objset_byteswap, "objset" },
{ zfs_znode_byteswap, "znode" },
{ zfs_oldacl_byteswap, "oldacl" },
{ zfs_acl_byteswap, "acl" }
};
int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp)
{
uint64_t blkid;
dmu_buf_impl_t *db;
blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
if (db == NULL) {
*dbp = NULL;
return (SET_ERROR(EIO));
}
*dbp = &db->db;
return (0);
}
int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
uint64_t blkid;
dmu_buf_impl_t *db;
int err;
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
if (db == NULL) {
*dbp = NULL;
return (SET_ERROR(EIO));
}
*dbp = &db->db;
return (err);
}
int
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
if (err != 0) {
dbuf_rele(db, tag);
*dbp = NULL;
}
}
return (err);
}
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
if (err != 0) {
dbuf_rele(db, tag);
*dbp = NULL;
}
}
return (err);
}
int
dmu_bonus_max(void)
{
return (DN_MAX_BONUSLEN);
}
int
dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn;
int error;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if (dn->dn_bonus != db) {
error = SET_ERROR(EINVAL);
} else if (newsize < 0 || newsize > db_fake->db_size) {
error = SET_ERROR(EINVAL);
} else {
dnode_setbonuslen(dn, newsize, tx);
error = 0;
}
DB_DNODE_EXIT(db);
return (error);
}
int
dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn;
int error;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if (!DMU_OT_IS_VALID(type)) {
error = SET_ERROR(EINVAL);
} else if (dn->dn_bonus != db) {
error = SET_ERROR(EINVAL);
} else {
dnode_setbonus_type(dn, type, tx);
error = 0;
}
DB_DNODE_EXIT(db);
return (error);
}
dmu_object_type_t
dmu_get_bonustype(dmu_buf_t *db_fake)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn;
dmu_object_type_t type;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
type = dn->dn_bonustype;
DB_DNODE_EXIT(db);
return (type);
}
int
dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
dnode_t *dn;
int error;
error = dnode_hold(os, object, FTAG, &dn);
dbuf_rm_spill(dn, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dnode_rm_spill(dn, tx);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
return (error);
}
/*
* returns ENOENT, EIO, or 0.
*/
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
dmu_buf_impl_t *db;
int error;
error = dnode_hold(os, object, FTAG, &dn);
if (error)
return (error);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus == NULL) {
rw_exit(&dn->dn_struct_rwlock);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus == NULL)
dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
/* as long as the bonus buf is held, the dnode will be held */
if (refcount_add(&db->db_holds, tag) == 1) {
VERIFY(dnode_add_ref(dn, db));
atomic_inc_32(&dn->dn_dbufs_count);
}
/*
* Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
* hold and incrementing the dbuf count to ensure that dnode_move() sees
* a dnode hold for every dbuf.
*/
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
*dbp = &db->db;
return (0);
}
/*
* returns ENOENT, EIO, or 0.
*
* This interface will allocate a blank spill dbuf when a spill blk
* doesn't already exist on the dnode.
*
* if you only want to find an already existing spill db, then
* dmu_spill_hold_existing() should be used.
*/
int
dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
{
dmu_buf_impl_t *db = NULL;
int err;
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
ASSERT(db != NULL);
err = dbuf_read(db, NULL, flags);
if (err == 0)
*dbp = &db->db;
else
dbuf_rele(db, tag);
return (err);
}
int
dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
dnode_t *dn;
int err;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
err = SET_ERROR(EINVAL);
} else {
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (!dn->dn_have_spill) {
err = SET_ERROR(ENOENT);
} else {
err = dmu_spill_hold_by_dnode(dn,
DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
}
rw_exit(&dn->dn_struct_rwlock);
}
DB_DNODE_EXIT(db);
return (err);
}
int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
dnode_t *dn;
int err;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
DB_DNODE_EXIT(db);
return (err);
}
/*
* Note: longer-term, we should modify all of the dmu_buf_*() interfaces
* to take a held dnode rather than <os, object> -- the lookup is wasteful,
* and can induce severe lock contention when writing to several files
* whose dnodes are in the same block.
*/
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
uint32_t dbuf_flags;
int err;
zio_t *zio;
ASSERT(length <= DMU_MAX_ACCESS);
/*
* Note: We directly notify the prefetch code of this read, so that
* we can tell it about the multi-block read. dbuf_read() only knows
* about the one block it is accessing.
*/
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
DB_RF_NOPREFETCH;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
} else {
if (offset + length > dn->dn_datablksz) {
zfs_panic_recover("zfs: accessing past end of object "
"%llx/%llx (size=%u access=%llu+%llu)",
(longlong_t)dn->dn_objset->
os_dsl_dataset->ds_object,
(longlong_t)dn->dn_object, dn->dn_datablksz,
(longlong_t)offset, (longlong_t)length);
rw_exit(&dn->dn_struct_rwlock);
return (SET_ERROR(EIO));
}
nblks = 1;
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
#if defined(_KERNEL) && defined(RACCT)
if (racct_enable && !read) {
PROC_LOCK(curproc);
racct_add_force(curproc, RACCT_WRITEBPS, length);
racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
PROC_UNLOCK(curproc);
}
#endif
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) {
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
zio_nowait(zio);
return (SET_ERROR(EIO));
}
/* initiate async i/o */
if (read)
(void) dbuf_read(db, zio, dbuf_flags);
#ifdef _KERNEL
else
curthread->td_ru.ru_oublock++;
#endif
dbp[i] = &db->db;
}
if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
read && DNODE_IS_CACHEABLE(dn));
}
rw_exit(&dn->dn_struct_rwlock);
/* wait for async i/o */
err = zio_wait(zio);
if (err) {
dmu_buf_rele_array(dbp, nblks, tag);
return (err);
}
/* wait for other io to complete */
if (read) {
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ ||
db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED)
err = SET_ERROR(EIO);
mutex_exit(&db->db_mtx);
if (err) {
dmu_buf_rele_array(dbp, nblks, tag);
return (err);
}
}
}
*numbufsp = nblks;
*dbpp = dbp;
return (0);
}
static int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
dnode_t *dn;
int err;
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
numbufsp, dbpp, DMU_READ_PREFETCH);
dnode_rele(dn, FTAG);
return (err);
}
int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
uint64_t length, boolean_t read, void *tag, int *numbufsp,
dmu_buf_t ***dbpp)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn;
int err;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
numbufsp, dbpp, DMU_READ_PREFETCH);
DB_DNODE_EXIT(db);
return (err);
}
void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
int i;
dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
if (numbufs == 0)
return;
for (i = 0; i < numbufs; i++) {
if (dbp[i])
dbuf_rele(dbp[i], tag);
}
kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}
/*
* Issue prefetch i/os for the given blocks. If level is greater than 0, the
* indirect blocks prefeteched will be those that point to the blocks containing
* the data starting at offset, and continuing to offset + len.
*
* Note that if the indirect blocks above the blocks being prefetched are not in
* cache, they will be asychronously read in.
*/
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{
dnode_t *dn;
uint64_t blkid;
int nblks, err;
if (len == 0) { /* they're interested in the bonus buffer */
dn = DMU_META_DNODE(os);
if (object == 0 || object >= DN_MAX_OBJECT)
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, level,
object * sizeof (dnode_phys_t));
dbuf_prefetch(dn, level, blkid, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
return;
}
/*
* XXX - Note, if the dnode for the requested object is not
* already cached, we will do a *synchronous* read in the
* dnode_hold() call. The same is true for any indirects.
*/
err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
/*
* offset + len - 1 is the last byte we want to prefetch for, and offset
* is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the
* last block we want to prefetch, and dbuf_whichblock(dn, level,
* offset) is the first. Then the number we need to prefetch is the
* last - first + 1.
*/
if (level > 0 || dn->dn_datablkshift != 0) {
nblks = dbuf_whichblock(dn, level, offset + len - 1) -
dbuf_whichblock(dn, level, offset) + 1;
} else {
nblks = (offset < dn->dn_datablksz);
}
if (nblks != 0) {
blkid = dbuf_whichblock(dn, level, offset);
for (int i = 0; i < nblks; i++)
dbuf_prefetch(dn, level, blkid + i, pri, 0);
}
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
}
/*
* Get the next "chunk" of file data to free. We traverse the file from
* the end so that the file gets shorter over time (if we crashes in the
* middle, this will leave us in a better state). We find allocated file
* data by simply searching the allocated level 1 indirects.
*
* On input, *start should be the first offset that does not need to be
* freed (e.g. "offset + length"). On return, *start will be the first
* offset that should be freed.
*/
static int
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
{
uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
/* bytes of data covered by a level-1 indirect block */
uint64_t iblkrange =
dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
ASSERT3U(minimum, <=, *start);
if (*start - minimum <= iblkrange * maxblks) {
*start = minimum;
return (0);
}
ASSERT(ISP2(iblkrange));
for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
int err;
/*
* dnode_next_offset(BACKWARDS) will find an allocated L1
* indirect block at or before the input offset. We must
* decrement *start so that it is at the end of the region
* to search.
*/
(*start)--;
err = dnode_next_offset(dn,
DNODE_FIND_BACKWARDS, start, 2, 1, 0);
/* if there are no indirect blocks before start, we are done */
if (err == ESRCH) {
*start = minimum;
break;
} else if (err != 0) {
return (err);
}
/* set start to the beginning of this L1 indirect */
*start = P2ALIGN(*start, iblkrange);
}
if (*start < minimum)
*start = minimum;
return (0);
}
/*
* If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
* otherwise return false.
* Used below in dmu_free_long_range_impl() to enable abort when unmounting
*/
/*ARGSUSED*/
static boolean_t
dmu_objset_zfs_unmounting(objset_t *os)
{
#ifdef _KERNEL
if (dmu_objset_type(os) == DMU_OST_ZFS)
return (zfs_get_vfs_flag_unmounted(os));
#endif
return (B_FALSE);
}
static int
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
uint64_t length)
{
uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
int err;
uint64_t dirty_frees_threshold;
dsl_pool_t *dp = dmu_objset_pool(os);
if (offset >= object_size)
return (0);
if (zfs_per_txg_dirty_frees_percent <= 100)
dirty_frees_threshold =
zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
else
dirty_frees_threshold = zfs_dirty_data_max / 4;
if (length == DMU_OBJECT_END || offset + length > object_size)
length = object_size - offset;
while (length != 0) {
uint64_t chunk_end, chunk_begin, chunk_len;
uint64_t long_free_dirty_all_txgs = 0;
dmu_tx_t *tx;
if (dmu_objset_zfs_unmounting(dn->dn_objset))
return (SET_ERROR(EINTR));
chunk_end = chunk_begin = offset + length;
/* move chunk_begin backwards to the beginning of this chunk */
err = get_next_chunk(dn, &chunk_begin, offset);
if (err)
return (err);
ASSERT3U(chunk_begin, >=, offset);
ASSERT3U(chunk_begin, <=, chunk_end);
chunk_len = chunk_end - chunk_begin;
mutex_enter(&dp->dp_lock);
for (int t = 0; t < TXG_SIZE; t++) {
long_free_dirty_all_txgs +=
dp->dp_long_free_dirty_pertxg[t];
}
mutex_exit(&dp->dp_lock);
/*
* To avoid filling up a TXG with just frees wait for
* the next TXG to open before freeing more chunks if
* we have reached the threshold of frees
*/
if (dirty_frees_threshold != 0 &&
long_free_dirty_all_txgs >= dirty_frees_threshold) {
txg_wait_open(dp, 0);
continue;
}
tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
/*
* Mark this transaction as typically resulting in a net
* reduction in space used.
*/
dmu_tx_mark_netfree(tx);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err) {
dmu_tx_abort(tx);
return (err);
}
mutex_enter(&dp->dp_lock);
dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
chunk_len;
mutex_exit(&dp->dp_lock);
DTRACE_PROBE3(free__long__range,
uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
uint64_t, dmu_tx_get_txg(tx));
dnode_free_range(dn, chunk_begin, chunk_len, tx);
dmu_tx_commit(tx);
length -= chunk_len;
}
return (0);
}
int
dmu_free_long_range(objset_t *os, uint64_t object,
uint64_t offset, uint64_t length)
{
dnode_t *dn;
int err;
err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
return (err);
err = dmu_free_long_range_impl(os, dn, offset, length);
/*
* It is important to zero out the maxblkid when freeing the entire
* file, so that (a) subsequent calls to dmu_free_long_range_impl()
* will take the fast path, and (b) dnode_reallocate() can verify
* that the entire file has been freed.
*/
if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
dn->dn_maxblkid = 0;
dnode_rele(dn, FTAG);
return (err);
}
int
dmu_free_long_object(objset_t *os, uint64_t object)
{
dmu_tx_t *tx;
int err;
err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
if (err != 0)
return (err);
tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, object);
dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
dmu_tx_mark_netfree(tx);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err == 0) {
err = dmu_object_free(os, object, tx);
dmu_tx_commit(tx);
} else {
dmu_tx_abort(tx);
}
return (err);
}
int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx)
{
dnode_t *dn;
int err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
ASSERT(offset < UINT64_MAX);
ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
dnode_free_range(dn, offset, size, tx);
dnode_rele(dn, FTAG);
return (0);
}
static int
dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
void *buf, uint32_t flags)
{
dmu_buf_t **dbp;
int numbufs, err = 0;
/*
* Deal with odd block sizes, where there can't be data past the first
* block. If we ever do the tail block optimization, we will need to
* handle that here as well.
*/
if (dn->dn_maxblkid == 0) {
int newsz = offset > dn->dn_datablksz ? 0 :
MIN(size, dn->dn_datablksz - offset);
bzero((char *)buf + newsz, size - newsz);
size = newsz;
}
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
int i;
/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
TRUE, FTAG, &numbufs, &dbp, flags);
if (err)
break;
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
dmu_buf_t *db = dbp[i];
ASSERT(size > 0);
bufoff = offset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
bcopy((char *)db->db_data + bufoff, buf, tocpy);
offset += tocpy;
size -= tocpy;
buf = (char *)buf + tocpy;
}
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
return (err);
}
int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags)
{
dnode_t *dn;
int err;
err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
return (err);
err = dmu_read_impl(dn, offset, size, buf, flags);
dnode_rele(dn, FTAG);
return (err);
}
int
dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
uint32_t flags)
{
return (dmu_read_impl(dn, offset, size, buf, flags));
}
static void
dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
{
int i;
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
dmu_buf_t *db = dbp[i];
ASSERT(size > 0);
bufoff = offset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
else
dmu_buf_will_dirty(db, tx);
bcopy(buf, (char *)db->db_data + bufoff, tocpy);
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
offset += tocpy;
size -= tocpy;
buf = (char *)buf + tocpy;
}
}
void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
int numbufs;
if (size == 0)
return;
VERIFY0(dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
void
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
int numbufs;
if (size == 0)
return;
VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+static int
+dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
+ uint64_t last_removal_txg, uint64_t offset)
+{
+ uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
+ int err = 0;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ ASSERT3P(dbuf, !=, NULL);
+
+ /*
+ * If the block hasn't been written yet, this default will ensure
+ * we don't try to remap it.
+ */
+ uint64_t birth = UINT64_MAX;
+ ASSERT3U(last_removal_txg, !=, UINT64_MAX);
+ if (dbuf->db_blkptr != NULL)
+ birth = dbuf->db_blkptr->blk_birth;
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /*
+ * If this L1 was already written after the last removal, then we've
+ * already tried to remap it.
+ */
+ if (birth <= last_removal_txg &&
+ dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
+ dbuf_can_remap(dbuf)) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ (void) dbuf_dirty(dbuf, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ }
+
+ dbuf_rele(dbuf, FTAG);
+
+ delay(zfs_object_remap_one_indirect_delay_ticks);
+
+ return (err);
+}
+
+/*
+ * Remap all blockpointers in the object, if possible, so that they reference
+ * only concrete vdevs.
+ *
+ * To do this, iterate over the L0 blockpointers and remap any that reference
+ * an indirect vdev. Note that we only examine L0 blockpointers; since we
+ * cannot guarantee that we can remap all blockpointer anyways (due to split
+ * blocks), we do not want to make the code unnecessarily complicated to
+ * catch the unlikely case that there is an L1 block on an indirect vdev that
+ * contains no indirect blockpointers.
+ */
+int
+dmu_object_remap_indirects(objset_t *os, uint64_t object,
+ uint64_t last_removal_txg)
+{
+ uint64_t offset, l1span;
+ int err;
+ dnode_t *dn;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0) {
+ return (err);
+ }
+
+ if (dn->dn_nlevels <= 1) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = SET_ERROR(EINTR);
+ }
+
+ /*
+ * If the dnode has no indirect blocks, we cannot dirty them.
+ * We still want to remap the blkptr(s) in the dnode if
+ * appropriate, so mark it as dirty.
+ */
+ if (err == 0 && dnode_needs_remap(dn)) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, dn->dn_object);
+ if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
+ dnode_setdirty(dn, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ }
+
+ dnode_rele(dn, FTAG);
+ return (err);
+ }
+
+ offset = 0;
+ l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
+ dn->dn_datablkshift);
+ /*
+ * Find the next L1 indirect that is not a hole.
+ */
+ while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = SET_ERROR(EINTR);
+ break;
+ }
+ if ((err = dmu_object_remap_one_indirect(os, dn,
+ last_removal_txg, offset)) != 0) {
+ break;
+ }
+ offset += l1span;
+ }
+
+ dnode_rele(dn, FTAG);
+ return (err);
}
void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx)
{
dmu_buf_t **dbp;
int numbufs, i;
if (size == 0)
return;
VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp));
for (i = 0; i < numbufs; i++) {
dmu_buf_t *db = dbp[i];
dmu_buf_will_not_fill(db, tx);
}
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
int compressed_size, int byteorder, dmu_tx_t *tx)
{
dmu_buf_t *db;
ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
VERIFY0(dmu_buf_hold_noread(os, object, offset,
FTAG, &db));
dmu_buf_write_embedded(db,
data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
uncompressed_size, compressed_size, byteorder, tx);
dmu_buf_rele(db, FTAG);
}
/*
* DMU support for xuio
*/
kstat_t *xuio_ksp = NULL;
int
dmu_xuio_init(xuio_t *xuio, int nblk)
{
dmu_xuio_t *priv;
uio_t *uio = &xuio->xu_uio;
uio->uio_iovcnt = nblk;
uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
priv->cnt = nblk;
priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
priv->iovp = uio->uio_iov;
XUIO_XUZC_PRIV(xuio) = priv;
if (XUIO_XUZC_RW(xuio) == UIO_READ)
XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
else
XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
return (0);
}
void
dmu_xuio_fini(xuio_t *xuio)
{
dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
int nblk = priv->cnt;
kmem_free(priv->iovp, nblk * sizeof (iovec_t));
kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
kmem_free(priv, sizeof (dmu_xuio_t));
if (XUIO_XUZC_RW(xuio) == UIO_READ)
XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
else
XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
}
/*
* Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
* and increase priv->next by 1.
*/
int
dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
{
struct iovec *iov;
uio_t *uio = &xuio->xu_uio;
dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
int i = priv->next++;
ASSERT(i < priv->cnt);
ASSERT(off + n <= arc_buf_lsize(abuf));
iov = uio->uio_iov + i;
iov->iov_base = (char *)abuf->b_data + off;
iov->iov_len = n;
priv->bufs[i] = abuf;
return (0);
}
int
dmu_xuio_cnt(xuio_t *xuio)
{
dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
return (priv->cnt);
}
arc_buf_t *
dmu_xuio_arcbuf(xuio_t *xuio, int i)
{
dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
ASSERT(i < priv->cnt);
return (priv->bufs[i]);
}
void
dmu_xuio_clear(xuio_t *xuio, int i)
{
dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
ASSERT(i < priv->cnt);
priv->bufs[i] = NULL;
}
static void
xuio_stat_init(void)
{
xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (xuio_ksp != NULL) {
xuio_ksp->ks_data = &xuio_stats;
kstat_install(xuio_ksp);
}
}
static void
xuio_stat_fini(void)
{
if (xuio_ksp != NULL) {
kstat_delete(xuio_ksp);
xuio_ksp = NULL;
}
}
void
xuio_stat_wbuf_copied(void)
{
XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
void
xuio_stat_wbuf_nocopy(void)
{
XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
}
#ifdef _KERNEL
static int
dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
int numbufs, i, err;
xuio_t *xuio = NULL;
/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
TRUE, FTAG, &numbufs, &dbp, 0);
if (err)
return (err);
#ifdef UIO_XUIO
if (uio->uio_extflg == UIO_XUIO)
xuio = (xuio_t *)uio;
#endif
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
dmu_buf_t *db = dbp[i];
ASSERT(size > 0);
bufoff = uio->uio_loffset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
if (xuio) {
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
arc_buf_t *dbuf_abuf = dbi->db_buf;
arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
if (!err) {
uio->uio_resid -= tocpy;
uio->uio_loffset += tocpy;
}
if (abuf == dbuf_abuf)
XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
else
XUIOSTAT_BUMP(xuiostat_rbuf_copied);
} else {
#ifdef illumos
err = uiomove((char *)db->db_data + bufoff, tocpy,
UIO_READ, uio);
#else
err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
tocpy, uio);
#endif
}
if (err)
break;
size -= tocpy;
}
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
/*
* Read 'size' bytes into the uio buffer.
* From object zdb->db_object.
* Starting at offset uio->uio_loffset.
*
* If the caller already has a dbuf in the target object
* (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
* because we don't have to find the dnode_t for the object.
*/
int
dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
dnode_t *dn;
int err;
if (size == 0)
return (0);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
err = dmu_read_uio_dnode(dn, uio, size);
DB_DNODE_EXIT(db);
return (err);
}
/*
* Read 'size' bytes into the uio buffer.
* From the specified object
* Starting at offset uio->uio_loffset.
*/
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
dnode_t *dn;
int err;
if (size == 0)
return (0);
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
err = dmu_read_uio_dnode(dn, uio, size);
dnode_rele(dn, FTAG);
return (err);
}
static int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
int numbufs;
int err = 0;
int i;
err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
if (err)
return (err);
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
dmu_buf_t *db = dbp[i];
ASSERT(size > 0);
bufoff = uio->uio_loffset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
else
dmu_buf_will_dirty(db, tx);
#ifdef illumos
/*
* XXX uiomove could block forever (eg. nfs-backed
* pages). There needs to be a uiolockdown() function
* to lock the pages in memory, so that uiomove won't
* block.
*/
err = uiomove((char *)db->db_data + bufoff, tocpy,
UIO_WRITE, uio);
#else
err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
uio);
#endif
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
if (err)
break;
size -= tocpy;
}
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
/*
* Write 'size' bytes from the uio buffer.
* To object zdb->db_object.
* Starting at offset uio->uio_loffset.
*
* If the caller already has a dbuf in the target object
* (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
* because we don't have to find the dnode_t for the object.
*/
int
dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
dnode_t *dn;
int err;
if (size == 0)
return (0);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
err = dmu_write_uio_dnode(dn, uio, size, tx);
DB_DNODE_EXIT(db);
return (err);
}
/*
* Write 'size' bytes from the uio buffer.
* To the specified object.
* Starting at offset uio->uio_loffset.
*/
int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
dmu_tx_t *tx)
{
dnode_t *dn;
int err;
if (size == 0)
return (0);
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
err = dmu_write_uio_dnode(dn, uio, size, tx);
dnode_rele(dn, FTAG);
return (err);
}
#ifdef illumos
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
page_t *pp, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
int numbufs, i;
int err;
if (size == 0)
return (0);
err = dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp);
if (err)
return (err);
for (i = 0; i < numbufs; i++) {
int tocpy, copied, thiscpy;
int bufoff;
dmu_buf_t *db = dbp[i];
caddr_t va;
ASSERT(size > 0);
ASSERT3U(db->db_size, >=, PAGESIZE);
bufoff = offset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
else
dmu_buf_will_dirty(db, tx);
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
thiscpy = MIN(PAGESIZE, tocpy - copied);
va = zfs_map_page(pp, S_READ);
bcopy(va, (char *)db->db_data + bufoff, thiscpy);
zfs_unmap_page(pp, va);
pp = pp->p_next;
bufoff += PAGESIZE;
}
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
offset += tocpy;
size -= tocpy;
}
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
#else /* !illumos */
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
vm_page_t *ma, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
struct sf_buf *sf;
int numbufs, i;
int err;
if (size == 0)
return (0);
err = dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp);
if (err)
return (err);
for (i = 0; i < numbufs; i++) {
int tocpy, copied, thiscpy;
int bufoff;
dmu_buf_t *db = dbp[i];
caddr_t va;
ASSERT(size > 0);
ASSERT3U(db->db_size, >=, PAGESIZE);
bufoff = offset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
else
dmu_buf_will_dirty(db, tx);
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
thiscpy = MIN(PAGESIZE, tocpy - copied);
va = zfs_map_page(*ma, &sf);
bcopy(va, (char *)db->db_data + bufoff, thiscpy);
zfs_unmap_page(sf);
ma += 1;
bufoff += PAGESIZE;
}
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
offset += tocpy;
size -= tocpy;
}
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
int
dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
int *rbehind, int *rahead, int last_size)
{
struct sf_buf *sf;
vm_object_t vmobj;
vm_page_t m;
dmu_buf_t **dbp;
dmu_buf_t *db;
caddr_t va;
int numbufs, i;
int bufoff, pgoff, tocpy;
int mi, di;
int err;
ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
ASSERT(last_size <= PAGE_SIZE);
err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
if (err != 0)
return (err);
#ifdef DEBUG
IMPLY(last_size < PAGE_SIZE, *rahead == 0);
if (dbp[0]->db_offset != 0 || numbufs > 1) {
for (i = 0; i < numbufs; i++) {
ASSERT(ISP2(dbp[i]->db_size));
ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
}
}
#endif
vmobj = ma[0]->object;
zfs_vmobject_wlock(vmobj);
db = dbp[0];
for (i = 0; i < *rbehind; i++) {
m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
if (m == NULL)
break;
if (m->valid != 0) {
ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
break;
}
ASSERT(m->dirty == 0);
ASSERT(!pmap_page_is_mapped(m));
ASSERT(db->db_size > PAGE_SIZE);
bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
va = zfs_map_page(m, &sf);
bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
zfs_unmap_page(sf);
m->valid = VM_PAGE_BITS_ALL;
vm_page_lock(m);
if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
vm_page_activate(m);
else
vm_page_deactivate(m);
vm_page_unlock(m);
}
*rbehind = i;
bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
pgoff = 0;
for (mi = 0, di = 0; mi < count && di < numbufs; ) {
if (pgoff == 0) {
m = ma[mi];
vm_page_assert_xbusied(m);
ASSERT(m->valid == 0);
ASSERT(m->dirty == 0);
ASSERT(!pmap_page_is_mapped(m));
va = zfs_map_page(m, &sf);
}
if (bufoff == 0)
db = dbp[di];
ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
db->db_offset + bufoff);
/*
* We do not need to clamp the copy size by the file
* size as the last block is zero-filled beyond the
* end of file anyway.
*/
tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
pgoff += tocpy;
ASSERT(pgoff <= PAGESIZE);
if (pgoff == PAGESIZE) {
zfs_unmap_page(sf);
m->valid = VM_PAGE_BITS_ALL;
ASSERT(mi < count);
mi++;
pgoff = 0;
}
bufoff += tocpy;
ASSERT(bufoff <= db->db_size);
if (bufoff == db->db_size) {
ASSERT(di < numbufs);
di++;
bufoff = 0;
}
}
#ifdef DEBUG
/*
* Three possibilities:
* - last requested page ends at a buffer boundary and , thus,
* all pages and buffers have been iterated;
* - all requested pages are filled, but the last buffer
* has not been exhausted;
* the read-ahead is possible only in this case;
* - all buffers have been read, but the last page has not been
* fully filled;
* this is only possible if the file has only a single buffer
* with a size that is not a multiple of the page size.
*/
if (mi == count) {
ASSERT(di >= numbufs - 1);
IMPLY(*rahead != 0, di == numbufs - 1);
IMPLY(*rahead != 0, bufoff != 0);
ASSERT(pgoff == 0);
}
if (di == numbufs) {
ASSERT(mi >= count - 1);
ASSERT(*rahead == 0);
IMPLY(pgoff == 0, mi == count);
if (pgoff != 0) {
ASSERT(mi == count - 1);
ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
}
}
#endif
if (pgoff != 0) {
bzero(va + pgoff, PAGESIZE - pgoff);
zfs_unmap_page(sf);
m->valid = VM_PAGE_BITS_ALL;
}
for (i = 0; i < *rahead; i++) {
m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
if (m == NULL)
break;
if (m->valid != 0) {
ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
break;
}
ASSERT(m->dirty == 0);
ASSERT(!pmap_page_is_mapped(m));
ASSERT(db->db_size > PAGE_SIZE);
bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
tocpy = MIN(db->db_size - bufoff, PAGESIZE);
va = zfs_map_page(m, &sf);
bcopy((char *)db->db_data + bufoff, va, tocpy);
if (tocpy < PAGESIZE) {
ASSERT(i == *rahead - 1);
ASSERT((db->db_size & PAGE_MASK) != 0);
bzero(va + tocpy, PAGESIZE - tocpy);
}
zfs_unmap_page(sf);
m->valid = VM_PAGE_BITS_ALL;
vm_page_lock(m);
if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
vm_page_activate(m);
else
vm_page_deactivate(m);
vm_page_unlock(m);
}
*rahead = i;
zfs_vmobject_wunlock(vmobj);
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (0);
}
#endif /* illumos */
#endif /* _KERNEL */
/*
* Allocate a loaned anonymous arc buffer.
*/
arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
}
/*
* Free a loaned arc buffer.
*/
void
dmu_return_arcbuf(arc_buf_t *buf)
{
arc_return_buf(buf, FTAG);
arc_buf_destroy(buf, FTAG);
}
/*
* When possible directly assign passed loaned arc buffer to a dbuf.
* If this is not possible copy the contents of passed arc buf via
* dmu_write().
*/
void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
{
dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
dnode_t *dn;
dmu_buf_impl_t *db;
uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
uint64_t blkid;
DB_DNODE_ENTER(dbuf);
dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, 0, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(dbuf);
/*
* We can only assign if the offset is aligned, the arc buf is the
* same size as the dbuf, and the dbuf is not metadata.
*/
if (offset == db->db.db_offset && blksz == db->db.db_size) {
#ifdef _KERNEL
curthread->td_ru.ru_oublock++;
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(curproc);
racct_add_force(curproc, RACCT_WRITEBPS, blksz);
racct_add_force(curproc, RACCT_WRITEIOPS, 1);
PROC_UNLOCK(curproc);
}
#endif /* RACCT */
#endif /* _KERNEL */
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
objset_t *os;
uint64_t object;
/* compressed bufs must always be assignable to their dbuf */
ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
DB_DNODE_ENTER(dbuf);
dn = DB_DNODE(dbuf);
os = dn->dn_objset;
object = dn->dn_object;
DB_DNODE_EXIT(dbuf);
dbuf_rele(db, FTAG);
dmu_write(os, object, offset, blksz, buf->b_data, tx);
dmu_return_arcbuf(buf);
XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
}
typedef struct {
dbuf_dirty_record_t *dsa_dr;
dmu_sync_cb_t *dsa_done;
zgd_t *dsa_zgd;
dmu_tx_t *dsa_tx;
} dmu_sync_arg_t;
/* ARGSUSED */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
dmu_sync_arg_t *dsa = varg;
dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
blkptr_t *bp = zio->io_bp;
if (zio->io_error == 0) {
if (BP_IS_HOLE(bp)) {
/*
* A block of zeros may compress to a hole, but the
* block size still needs to be known for replay.
*/
BP_SET_LSIZE(bp, db->db_size);
} else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
}
}
}
static void
dmu_sync_late_arrival_ready(zio_t *zio)
{
dmu_sync_ready(zio, NULL, zio->io_private);
}
/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
dmu_sync_arg_t *dsa = varg;
dbuf_dirty_record_t *dr = dsa->dsa_dr;
dmu_buf_impl_t *db = dr->dr_dbuf;
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
if (zio->io_error == 0) {
dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
if (dr->dt.dl.dr_nopwrite) {
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
ASSERT(BP_EQUAL(bp, bp_orig));
VERIFY(BP_EQUAL(bp, db->db_blkptr));
ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
ASSERT(zio_checksum_table[chksum].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE);
}
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
/*
* Old style holes are filled with all zeros, whereas
* new-style holes maintain their lsize, type, level,
* and birth time (see zio_write_compress). While we
* need to reset the BP_SET_LSIZE() call that happened
* in dmu_sync_ready for old style holes, we do *not*
* want to wipe out the information contained in new
* style holes. Thus, only zero out the block pointer if
* it's an old style hole.
*/
if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
dr->dt.dl.dr_overridden_by.blk_birth == 0)
BP_ZERO(&dr->dt.dl.dr_overridden_by);
} else {
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
}
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
kmem_free(dsa, sizeof (*dsa));
}
static void
dmu_sync_late_arrival_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
dmu_sync_arg_t *dsa = zio->io_private;
blkptr_t *bp_orig = &zio->io_bp_orig;
if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
}
dmu_tx_commit(dsa->dsa_tx);
dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
abd_put(zio->io_abd);
kmem_free(dsa, sizeof (*dsa));
}
static int
dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
zio_prop_t *zp, zbookmark_phys_t *zb)
{
dmu_sync_arg_t *dsa;
dmu_tx_t *tx;
tx = dmu_tx_create(os);
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
dmu_tx_abort(tx);
/* Make zl_get_data do txg_waited_synced() */
return (SET_ERROR(EIO));
}
/*
* In order to prevent the zgd's lwb from being free'd prior to
* dmu_sync_late_arrival_done() being called, we have to ensure
* the lwb's "max txg" takes this tx's txg into account.
*/
zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
dsa->dsa_dr = NULL;
dsa->dsa_done = done;
dsa->dsa_zgd = zgd;
dsa->dsa_tx = tx;
/*
* Since we are currently syncing this txg, it's nontrivial to
* determine what BP to nopwrite against, so we disable nopwrite.
*
* When syncing, the db_blkptr is initially the BP of the previous
* txg. We can not nopwrite against it because it will be changed
* (this is similar to the non-late-arrival case where the dbuf is
* dirty in a future txg).
*
* Then dbuf_write_ready() sets bp_blkptr to the location we will write.
* We can not nopwrite against it because although the BP will not
* (typically) be changed, the data has not yet been persisted to this
* location.
*
* Finally, when dbuf_write_done() is called, it is theoretically
* possible to always nopwrite, because the data that was written in
* this txg is the same data that we are trying to write. However we
* would need to check that this dbuf is not dirty in any future
* txg's (as we do in the normal dmu_sync() path). For simplicity, we
* don't nopwrite in this case.
*/
zp->zp_nopwrite = B_FALSE;
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0);
}
/*
* Intent log support: sync the block associated with db to disk.
* N.B. and XXX: the caller is responsible for making sure that the
* data isn't changing while dmu_sync() is writing it.
*
* Return values:
*
* EEXIST: this txg has already been synced, so there's nothing to do.
* The caller should not log the write.
*
* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
* The caller should not log the write.
*
* EALREADY: this block is already in the process of being synced.
* The caller should track its progress (somehow).
*
* EIO: could not do the I/O.
* The caller should do a txg_wait_synced().
*
* 0: the I/O has been initiated.
* The caller should log this blkptr in the done callback.
* It is possible that the I/O will fail, in which case
* the error will be reported to the done callback and
* propagated to pio from zio_done().
*/
int
dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
objset_t *os = db->db_objset;
dsl_dataset_t *ds = os->os_dsl_dataset;
dbuf_dirty_record_t *dr;
dmu_sync_arg_t *dsa;
zbookmark_phys_t zb;
zio_prop_t zp;
dnode_t *dn;
ASSERT(pio != NULL);
ASSERT(txg != 0);
SET_BOOKMARK(&zb, ds->ds_object,
db->db.db_object, db->db_level, db->db_blkid);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
DB_DNODE_EXIT(db);
/*
* If we're frozen (running ziltest), we always need to generate a bp.
*/
if (txg > spa_freeze_txg(os->os_spa))
return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
/*
* Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
* and us. If we determine that this txg is not yet syncing,
* but it begins to sync a moment later, that's OK because the
* sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
*/
mutex_enter(&db->db_mtx);
if (txg <= spa_last_synced_txg(os->os_spa)) {
/*
* This txg has already synced. There's nothing to do.
*/
mutex_exit(&db->db_mtx);
return (SET_ERROR(EEXIST));
}
if (txg <= spa_syncing_txg(os->os_spa)) {
/*
* This txg is currently syncing, so we can't mess with
* the dirty record anymore; just write a new log block.
*/
mutex_exit(&db->db_mtx);
return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
}
dr = db->db_last_dirty;
while (dr && dr->dr_txg != txg)
dr = dr->dr_next;
if (dr == NULL) {
/*
* There's no dr for this dbuf, so it must have been freed.
* There's no need to log writes to freed blocks, so we're done.
*/
mutex_exit(&db->db_mtx);
return (SET_ERROR(ENOENT));
}
ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
if (db->db_blkptr != NULL) {
/*
* We need to fill in zgd_bp with the current blkptr so that
* the nopwrite code can check if we're writing the same
* data that's already on disk. We can only nopwrite if we
* are sure that after making the copy, db_blkptr will not
* change until our i/o completes. We ensure this by
* holding the db_mtx, and only allowing nopwrite if the
* block is not already dirty (see below). This is verified
* by dmu_sync_done(), which VERIFYs that the db_blkptr has
* not changed.
*/
*zgd->zgd_bp = *db->db_blkptr;
}
/*
* Assume the on-disk data is X, the current syncing data (in
* txg - 1) is Y, and the current in-memory data is Z (currently
* in dmu_sync).
*
* We usually want to perform a nopwrite if X and Z are the
* same. However, if Y is different (i.e. the BP is going to
* change before this write takes effect), then a nopwrite will
* be incorrect - we would override with X, which could have
* been freed when Y was written.
*
* (Note that this is not a concern when we are nop-writing from
* syncing context, because X and Y must be identical, because
* all previous txgs have been synced.)
*
* Therefore, we disable nopwrite if the current BP could change
* before this TXG. There are two ways it could change: by
* being dirty (dr_next is non-NULL), or by being freed
* (dnode_block_freed()). This behavior is verified by
* zio_done(), which VERIFYs that the override BP is identical
* to the on-disk BP.
*/
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
zp.zp_nopwrite = B_FALSE;
DB_DNODE_EXIT(db);
ASSERT(dr->dr_txg == txg);
if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
* We have already issued a sync write for this buffer,
* or this buffer has already been synced. It could not
* have been dirtied since, or we would have cleared the state.
*/
mutex_exit(&db->db_mtx);
return (SET_ERROR(EALREADY));
}
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
mutex_exit(&db->db_mtx);
dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
dsa->dsa_dr = dr;
dsa->dsa_done = done;
dsa->dsa_zgd = zgd;
dsa->dsa_tx = NULL;
zio_nowait(arc_write(pio, os->os_spa, txg,
zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
return (0);
}
int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
dmu_tx_t *tx)
{
dnode_t *dn;
int err;
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
err = dnode_set_blksz(dn, size, ibs, tx);
dnode_rele(dn, FTAG);
return (err);
}
void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
dmu_tx_t *tx)
{
dnode_t *dn;
/*
* Send streams include each object's checksum function. This
* check ensures that the receiving system can understand the
* checksum function transmitted.
*/
ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
VERIFY0(dnode_hold(os, object, FTAG, &dn));
ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
}
void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx)
{
dnode_t *dn;
/*
* Send streams include each object's compression function. This
* check ensures that the receiving system can understand the
* compression function transmitted.
*/
ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
VERIFY0(dnode_hold(os, object, FTAG, &dn));
dn->dn_compress = compress;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
}
int zfs_mdcomp_disable = 0;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
&zfs_mdcomp_disable, 0, "Disable metadata compression");
/*
* When the "redundant_metadata" property is set to "most", only indirect
* blocks of this level and higher will have an additional ditto block.
*/
int zfs_redundant_metadata_most_ditto_level = 2;
void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
{
dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
(wp & WP_SPILL));
enum zio_checksum checksum = os->os_checksum;
enum zio_compress compress = os->os_compress;
enum zio_checksum dedup_checksum = os->os_dedup_checksum;
boolean_t dedup = B_FALSE;
boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
int copies = os->os_copies;
/*
* We maintain different write policies for each of the following
* types of data:
* 1. metadata
* 2. preallocated blocks (i.e. level-0 blocks of a dump device)
* 3. all other level 0 blocks
*/
if (ismd) {
if (zfs_mdcomp_disable) {
compress = ZIO_COMPRESS_EMPTY;
} else {
/*
* XXX -- we should design a compression algorithm
* that specializes in arrays of bps.
*/
compress = zio_compress_select(os->os_spa,
ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
}
/*
* Metadata always gets checksummed. If the data
* checksum is multi-bit correctable, and it's not a
* ZBT-style checksum, then it's suitable for metadata
* as well. Otherwise, the metadata checksum defaults
* to fletcher4.
*/
if (!(zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_METADATA) ||
(zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_EMBEDDED))
checksum = ZIO_CHECKSUM_FLETCHER_4;
if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
(os->os_redundant_metadata ==
ZFS_REDUNDANT_METADATA_MOST &&
(level >= zfs_redundant_metadata_most_ditto_level ||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
copies++;
} else if (wp & WP_NOFILL) {
ASSERT(level == 0);
/*
* If we're writing preallocated blocks, we aren't actually
* writing them so don't set any policy properties. These
* blocks are currently only used by an external subsystem
* outside of zfs (i.e. dump) and not written by the zio
* pipeline.
*/
compress = ZIO_COMPRESS_OFF;
checksum = ZIO_CHECKSUM_NOPARITY;
} else {
compress = zio_compress_select(os->os_spa, dn->dn_compress,
compress);
checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
zio_checksum_select(dn->dn_checksum, checksum) :
dedup_checksum;
/*
* Determine dedup setting. If we are in dmu_sync(),
* we won't actually dedup now because that's all
* done in syncing context; but we do want to use the
* dedup checkum. If the checksum is not strong
* enough to ensure unique signatures, force
* dedup_verify.
*/
if (dedup_checksum != ZIO_CHECKSUM_OFF) {
dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
if (!(zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP))
dedup_verify = B_TRUE;
}
/*
* Enable nopwrite if we have secure enough checksum
* algorithm (see comment in zio_nop_write) and
* compression is enabled. We don't enable nopwrite if
* dedup is enabled as the two features are mutually
* exclusive.
*/
nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE) &&
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
zp->zp_checksum = checksum;
zp->zp_compress = compress;
ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
}
int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
int err;
/*
* Sync any current changes before
* we go trundling through the block pointers.
*/
err = dmu_object_wait_synced(os, object);
if (err) {
return (err);
}
err = dnode_hold(os, object, FTAG, &dn);
if (err) {
return (err);
}
err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
dnode_rele(dn, FTAG);
return (err);
}
/*
* Given the ZFS object, if it contains any dirty nodes
* this function flushes all dirty blocks to disk. This
* ensures the DMU object info is updated. A more efficient
* future version might just find the TXG with the maximum
* ID and wait for that to be synced.
*/
int
dmu_object_wait_synced(objset_t *os, uint64_t object)
{
dnode_t *dn;
int error, i;
error = dnode_hold(os, object, FTAG, &dn);
if (error) {
return (error);
}
for (i = 0; i < TXG_SIZE; i++) {
if (list_link_active(&dn->dn_dirty_link[i])) {
break;
}
}
dnode_rele(dn, FTAG);
if (i != TXG_SIZE) {
txg_wait_synced(dmu_objset_pool(os), 0);
}
return (0);
}
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
dnode_phys_t *dnp;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
mutex_enter(&dn->dn_mtx);
dnp = dn->dn_phys;
doi->doi_data_block_size = dn->dn_datablksz;
doi->doi_metadata_block_size = dn->dn_indblkshift ?
1ULL << dn->dn_indblkshift : 0;
doi->doi_type = dn->dn_type;
doi->doi_bonus_type = dn->dn_bonustype;
doi->doi_bonus_size = dn->dn_bonuslen;
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
doi->doi_nblkptr = dn->dn_nblkptr;
doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
doi->doi_fill_count = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);
}
/*
* Get information on a DMU object.
* If doi is NULL, just indicates whether the object exists.
*/
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
dnode_t *dn;
int err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
if (doi != NULL)
dmu_object_info_from_dnode(dn, doi);
dnode_rele(dn, FTAG);
return (0);
}
/*
* As above, but faster; can be used when you have a held dbuf in hand.
*/
void
dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
DB_DNODE_ENTER(db);
dmu_object_info_from_dnode(DB_DNODE(db), doi);
DB_DNODE_EXIT(db);
}
/*
* Faster still when you only care about the size.
* This is specifically optimized for zfs_getattr().
*/
void
dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
u_longlong_t *nblk512)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
*blksize = dn->dn_datablksz;
/* add 1 for dnode space */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
SPA_MINBLOCKSHIFT) + 1;
DB_DNODE_EXIT(db);
}
void
byteswap_uint64_array(void *vbuf, size_t size)
{
uint64_t *buf = vbuf;
size_t count = size >> 3;
int i;
ASSERT((size & 7) == 0);
for (i = 0; i < count; i++)
buf[i] = BSWAP_64(buf[i]);
}
void
byteswap_uint32_array(void *vbuf, size_t size)
{
uint32_t *buf = vbuf;
size_t count = size >> 2;
int i;
ASSERT((size & 3) == 0);
for (i = 0; i < count; i++)
buf[i] = BSWAP_32(buf[i]);
}
void
byteswap_uint16_array(void *vbuf, size_t size)
{
uint16_t *buf = vbuf;
size_t count = size >> 1;
int i;
ASSERT((size & 1) == 0);
for (i = 0; i < count; i++)
buf[i] = BSWAP_16(buf[i]);
}
/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}
void
dmu_init(void)
{
abd_init();
zfs_dbgmsg_init();
sa_cache_init();
xuio_stat_init();
dmu_objset_init();
dnode_init();
zfetch_init();
zio_compress_init();
l2arc_init();
arc_init();
dbuf_init();
}
void
dmu_fini(void)
{
arc_fini(); /* arc depends on l2arc, so arc must go first */
l2arc_fini();
zfetch_fini();
zio_compress_fini();
dbuf_fini();
dnode_fini();
dmu_objset_fini();
xuio_stat_fini();
sa_cache_fini();
zfs_dbgmsg_fini();
abd_fini();
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 332525)
@@ -1,2240 +1,2347 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
*/
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/cred.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zvol.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/sa.h>
#include <sys/zfs_onexit.h>
#include <sys/dsl_destroy.h>
#include <sys/vdev.h>
+#include <sys/zfeature.h>
/*
* Needed to close a window in dnode_move() that allows the objset to be freed
* before it can be safely accessed.
*/
krwlock_t os_lock;
/*
* Tunable to overwrite the maximum number of threads for the parallization
* of dmu_objset_find_dp, needed to speed up the import of pools with many
* datasets.
* Default is 4 times the number of leaf vdevs.
*/
int dmu_find_threads = 0;
/*
* Backfill lower metadnode objects after this many have been freed.
* Backfilling negatively impacts object creation rates, so only do it
* if there are enough holes to fill.
*/
int dmu_rescan_dnode_threshold = 131072;
static void dmu_objset_find_dp_cb(void *arg);
void
dmu_objset_init(void)
{
rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}
void
dmu_objset_fini(void)
{
rw_destroy(&os_lock);
}
spa_t *
dmu_objset_spa(objset_t *os)
{
return (os->os_spa);
}
zilog_t *
dmu_objset_zil(objset_t *os)
{
return (os->os_zil);
}
dsl_pool_t *
dmu_objset_pool(objset_t *os)
{
dsl_dataset_t *ds;
if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
return (ds->ds_dir->dd_pool);
else
return (spa_get_dsl(os->os_spa));
}
dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
return (os->os_dsl_dataset);
}
dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
return (os->os_phys->os_type);
}
void
dmu_objset_name(objset_t *os, char *buf)
{
dsl_dataset_name(os->os_dsl_dataset, buf);
}
uint64_t
dmu_objset_id(objset_t *os)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
return (ds ? ds->ds_object : 0);
}
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
return (os->os_sync);
}
zfs_logbias_op_t
dmu_objset_logbias(objset_t *os)
{
return (os->os_logbias);
}
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
/*
* Inheritance should have been done by now.
*/
ASSERT(newval != ZIO_CHECKSUM_INHERIT);
os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}
static void
compression_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval != ZIO_COMPRESS_INHERIT);
os->os_compress = zio_compress_select(os->os_spa, newval,
ZIO_COMPRESS_ON);
}
static void
copies_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval > 0);
ASSERT(newval <= spa_max_replication(os->os_spa));
os->os_copies = newval;
}
static void
dedup_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
spa_t *spa = os->os_spa;
enum zio_checksum checksum;
/*
* Inheritance should have been done by now.
*/
ASSERT(newval != ZIO_CHECKSUM_INHERIT);
checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
}
static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
newval == ZFS_CACHE_METADATA);
os->os_primary_cache = newval;
}
static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
newval == ZFS_CACHE_METADATA);
os->os_secondary_cache = newval;
}
static void
sync_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
newval == ZFS_SYNC_DISABLED);
os->os_sync = newval;
if (os->os_zil)
zil_set_sync(os->os_zil, newval);
}
static void
redundant_metadata_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
newval == ZFS_REDUNDANT_METADATA_MOST);
os->os_redundant_metadata = newval;
}
static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
newval == ZFS_LOGBIAS_THROUGHPUT);
os->os_logbias = newval;
if (os->os_zil)
zil_set_logbias(os->os_zil, newval);
}
static void
recordsize_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
os->os_recordsize = newval;
}
void
dmu_objset_byteswap(void *buf, size_t size)
{
objset_phys_t *osp = buf;
ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
dnode_byteswap(&osp->os_meta_dnode);
byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
osp->os_type = BSWAP_64(osp->os_type);
osp->os_flags = BSWAP_64(osp->os_flags);
if (size == sizeof (objset_phys_t)) {
dnode_byteswap(&osp->os_userused_dnode);
dnode_byteswap(&osp->os_groupused_dnode);
}
}
/*
* The hash is a CRC-based hash of the objset_t pointer and the object number.
*/
static uint64_t
dnode_hash(const objset_t *os, uint64_t obj)
{
uintptr_t osv = (uintptr_t)os;
uint64_t crc = -1ULL;
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
/*
* The low 6 bits of the pointer don't have much entropy, because
* the objset_t is larger than 2^6 bytes long.
*/
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
crc ^= (osv>>14) ^ (obj>>24);
return (crc);
}
unsigned int
dnode_multilist_index_func(multilist_t *ml, void *obj)
{
dnode_t *dn = obj;
return (dnode_hash(dn->dn_objset, dn->dn_object) %
multilist_get_num_sublists(ml));
}
/*
* Instantiates the objset_t in-memory structure corresponding to the
* objset_phys_t that's pointed to by the specified blkptr_t.
*/
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
objset_t **osp)
{
objset_t *os;
int i, err;
ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
+ /*
+ * The $ORIGIN dataset (if it exists) doesn't have an associated
+ * objset, so there's no reason to open it. The $ORIGIN dataset
+ * will not exist on pools older than SPA_VERSION_ORIGIN.
+ */
+ if (ds != NULL && spa_get_dsl(spa) != NULL &&
+ spa_get_dsl(spa)->dp_origin_snap != NULL) {
+ ASSERT3P(ds->ds_dir, !=,
+ spa_get_dsl(spa)->dp_origin_snap->ds_dir);
+ }
+
os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
os->os_dsl_dataset = ds;
os->os_spa = spa;
os->os_rootbp = bp;
if (!BP_IS_HOLE(os->os_rootbp)) {
arc_flags_t aflags = ARC_FLAG_WAIT;
zbookmark_phys_t zb;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_FLAG_L2CACHE;
dprintf_bp(os->os_rootbp, "reading %s", "");
err = arc_read(NULL, spa, os->os_rootbp,
arc_getbuf_func, &os->os_phys_buf,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
if (err != 0) {
kmem_free(os, sizeof (objset_t));
/* convert checksum errors into IO errors */
if (err == ECKSUM)
err = SET_ERROR(EIO);
return (err);
}
/* Increase the blocksize if we are permitted. */
if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
ARC_BUFC_METADATA, sizeof (objset_phys_t));
bzero(buf->b_data, sizeof (objset_phys_t));
bcopy(os->os_phys_buf->b_data, buf->b_data,
arc_buf_size(os->os_phys_buf));
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
os->os_phys_buf = buf;
}
os->os_phys = os->os_phys_buf->b_data;
os->os_flags = os->os_phys->os_flags;
} else {
int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
ARC_BUFC_METADATA, size);
os->os_phys = os->os_phys_buf->b_data;
bzero(os->os_phys, size);
}
/*
* Note: the changed_cb will be called once before the register
* func returns, thus changing the checksum/compression from the
* default (fletcher2/off). Snapshots don't need to know about
* checksum/compression/copies.
*/
if (ds != NULL) {
boolean_t needlock = B_FALSE;
/*
* Note: it's valid to open the objset if the dataset is
* long-held, in which case the pool_config lock will not
* be held.
*/
if (!dsl_pool_config_held(dmu_objset_pool(os))) {
needlock = B_TRUE;
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
}
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
secondary_cache_changed_cb, os);
}
if (!ds->ds_is_snapshot) {
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
checksum_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
compression_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_COPIES),
copies_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_DEDUP),
dedup_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_LOGBIAS),
logbias_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_SYNC),
sync_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(
ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
recordsize_changed_cb, os);
}
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (err != 0) {
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
kmem_free(os, sizeof (objset_t));
return (err);
}
} else {
/* It's the meta-objset. */
os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
os->os_compress = ZIO_COMPRESS_ON;
os->os_copies = spa_max_replication(spa);
os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
os->os_dedup_verify = B_FALSE;
os->os_logbias = ZFS_LOGBIAS_LATENCY;
os->os_sync = ZFS_SYNC_STANDARD;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
}
if (ds == NULL || !ds->ds_is_snapshot)
os->os_zil_header = os->os_phys->os_zil_header;
os->os_zil = zil_alloc(os, &os->os_zil_header);
for (i = 0; i < TXG_SIZE; i++) {
os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t),
offsetof(dnode_t, dn_dirty_link[i]),
dnode_multilist_index_func);
}
list_create(&os->os_dnodes, sizeof (dnode_t),
offsetof(dnode_t, dn_link));
list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
dnode_special_open(os, &os->os_phys->os_meta_dnode,
DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
dnode_special_open(os, &os->os_phys->os_userused_dnode,
DMU_USERUSED_OBJECT, &os->os_userused_dnode);
dnode_special_open(os, &os->os_phys->os_groupused_dnode,
DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
}
*osp = os;
return (0);
}
int
dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
int err = 0;
/*
* We shouldn't be doing anything with dsl_dataset_t's unless the
* pool_config lock is held, or the dataset is long-held.
*/
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
dsl_dataset_long_held(ds));
mutex_enter(&ds->ds_opening_lock);
if (ds->ds_objset == NULL) {
objset_t *os;
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
ds, dsl_dataset_get_blkptr(ds), &os);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
if (err == 0) {
mutex_enter(&ds->ds_lock);
ASSERT(ds->ds_objset == NULL);
ds->ds_objset = os;
mutex_exit(&ds->ds_lock);
}
}
*osp = ds->ds_objset;
mutex_exit(&ds->ds_opening_lock);
return (err);
}
/*
* Holds the pool while the objset is held. Therefore only one objset
* can be held at a time.
*/
int
dmu_objset_hold(const char *name, void *tag, objset_t **osp)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
err = dsl_pool_hold(name, tag, &dp);
if (err != 0)
return (err);
err = dsl_dataset_hold(dp, name, tag, &ds);
if (err != 0) {
dsl_pool_rele(dp, tag);
return (err);
}
err = dmu_objset_from_ds(ds, osp);
if (err != 0) {
dsl_dataset_rele(ds, tag);
dsl_pool_rele(dp, tag);
}
return (err);
}
static int
dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
{
int err;
err = dmu_objset_from_ds(ds, osp);
if (err != 0) {
dsl_dataset_disown(ds, tag);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EINVAL));
} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EROFS));
}
return (err);
}
/*
* dsl_pool must not be held when this is called.
* Upon successful return, there will be a longhold on the dataset,
* and the dsl_pool will not be held.
*/
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
err = dsl_pool_hold(name, FTAG, &dp);
if (err != 0)
return (err);
err = dsl_dataset_own(dp, name, tag, &ds);
if (err != 0) {
dsl_pool_rele(dp, FTAG);
return (err);
}
err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
dsl_pool_rele(dp, FTAG);
return (err);
}
int
dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
{
dsl_dataset_t *ds;
int err;
err = dsl_dataset_own_obj(dp, obj, tag, &ds);
if (err != 0)
return (err);
return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
}
void
dmu_objset_rele(objset_t *os, void *tag)
{
dsl_pool_t *dp = dmu_objset_pool(os);
dsl_dataset_rele(os->os_dsl_dataset, tag);
dsl_pool_rele(dp, tag);
}
/*
* When we are called, os MUST refer to an objset associated with a dataset
* that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
* == tag. We will then release and reacquire ownership of the dataset while
* holding the pool config_rwlock to avoid intervening namespace or ownership
* changes may occur.
*
* This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
* release the hold on its dataset and acquire a new one on the dataset of the
* same name so that it can be partially torn down and reconstructed.
*/
void
dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
void *tag)
{
dsl_pool_t *dp;
char name[ZFS_MAX_DATASET_NAME_LEN];
VERIFY3P(ds, !=, NULL);
VERIFY3P(ds->ds_owner, ==, tag);
VERIFY(dsl_dataset_long_held(ds));
dsl_dataset_name(ds, name);
dp = ds->ds_dir->dd_pool;
dsl_pool_config_enter(dp, FTAG);
dsl_dataset_disown(ds, tag);
VERIFY0(dsl_dataset_own(dp, name, tag, newds));
dsl_pool_config_exit(dp, FTAG);
}
void
dmu_objset_disown(objset_t *os, void *tag)
{
dsl_dataset_disown(os->os_dsl_dataset, tag);
}
void
dmu_objset_evict_dbufs(objset_t *os)
{
dnode_t dn_marker;
dnode_t *dn;
mutex_enter(&os->os_lock);
dn = list_head(&os->os_dnodes);
while (dn != NULL) {
/*
* Skip dnodes without holds. We have to do this dance
* because dnode_add_ref() only works if there is already a
* hold. If the dnode has no holds, then it has no dbufs.
*/
if (dnode_add_ref(dn, FTAG)) {
list_insert_after(&os->os_dnodes, dn, &dn_marker);
mutex_exit(&os->os_lock);
dnode_evict_dbufs(dn);
dnode_rele(dn, FTAG);
mutex_enter(&os->os_lock);
dn = list_next(&os->os_dnodes, &dn_marker);
list_remove(&os->os_dnodes, &dn_marker);
} else {
dn = list_next(&os->os_dnodes, dn);
}
}
mutex_exit(&os->os_lock);
if (DMU_USERUSED_DNODE(os) != NULL) {
dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
}
dnode_evict_dbufs(DMU_META_DNODE(os));
}
/*
* Objset eviction processing is split into into two pieces.
* The first marks the objset as evicting, evicts any dbufs that
* have a refcount of zero, and then queues up the objset for the
* second phase of eviction. Once os->os_dnodes has been cleared by
* dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
* The second phase closes the special dnodes, dequeues the objset from
* the list of those undergoing eviction, and finally frees the objset.
*
* NOTE: Due to asynchronous eviction processing (invocation of
* dnode_buf_pageout()), it is possible for the meta dnode for the
* objset to have no holds even though os->os_dnodes is not empty.
*/
void
dmu_objset_evict(objset_t *os)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!dmu_objset_is_dirty(os, t));
if (ds)
dsl_prop_unregister_all(ds, os);
if (os->os_sa)
sa_tear_down(os);
dmu_objset_evict_dbufs(os);
mutex_enter(&os->os_lock);
spa_evicting_os_register(os->os_spa, os);
if (list_is_empty(&os->os_dnodes)) {
mutex_exit(&os->os_lock);
dmu_objset_evict_done(os);
} else {
mutex_exit(&os->os_lock);
}
}
void
dmu_objset_evict_done(objset_t *os)
{
ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
dnode_special_close(&os->os_meta_dnode);
if (DMU_USERUSED_DNODE(os)) {
dnode_special_close(&os->os_userused_dnode);
dnode_special_close(&os->os_groupused_dnode);
}
zil_free(os->os_zil);
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
/*
* This is a barrier to prevent the objset from going away in
* dnode_move() until we can safely ensure that the objset is still in
* use. We consider the objset valid before the barrier and invalid
* after the barrier.
*/
rw_enter(&os_lock, RW_READER);
rw_exit(&os_lock);
mutex_destroy(&os->os_lock);
mutex_destroy(&os->os_userused_lock);
mutex_destroy(&os->os_obj_lock);
mutex_destroy(&os->os_user_ptr_lock);
for (int i = 0; i < TXG_SIZE; i++) {
multilist_destroy(os->os_dirty_dnodes[i]);
}
spa_evicting_os_deregister(os->os_spa, os);
kmem_free(os, sizeof (objset_t));
}
timestruc_t
dmu_objset_snap_cmtime(objset_t *os)
{
return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}
/* called from dsl for meta-objset */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dmu_objset_type_t type, dmu_tx_t *tx)
{
objset_t *os;
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
if (ds != NULL)
VERIFY0(dmu_objset_from_ds(ds, &os));
else
VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
mdn = DMU_META_DNODE(os);
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
/*
* We don't want to have to increase the meta-dnode's nlevels
* later, because then we could do it in quescing context while
* we are also accessing it in open context.
*
* This precaution is not necessary for the MOS (ds == NULL),
* because the MOS is only updated in syncing context.
* This is most fortunate: the MOS is the only objset that
* needs to be synced multiple times as spa_sync() iterates
* to convergence, so minimizing its dn_nlevels matters.
*/
if (ds != NULL) {
int levels = 1;
/*
* Determine the number of levels necessary for the meta-dnode
* to contain DN_MAX_OBJECT dnodes. Note that in order to
* ensure that we do not overflow 64 bits, there has to be
* a nlevels that gives us a number of blocks > DN_MAX_OBJECT
* but < 2^64. Therefore,
* (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
* less than (64 - log2(DN_MAX_OBJECT)) (16).
*/
while ((uint64_t)mdn->dn_nblkptr <<
(mdn->dn_datablkshift - DNODE_SHIFT +
(levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
DN_MAX_OBJECT)
levels++;
mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
mdn->dn_nlevels = levels;
}
ASSERT(type != DMU_OST_NONE);
ASSERT(type != DMU_OST_ANY);
ASSERT(type < DMU_OST_NUMTYPES);
os->os_phys->os_type = type;
if (dmu_objset_userused_enabled(os)) {
os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
os->os_flags = os->os_phys->os_flags;
}
dsl_dataset_dirty(ds, tx);
return (os);
}
typedef struct dmu_objset_create_arg {
const char *doca_name;
cred_t *doca_cred;
void (*doca_userfunc)(objset_t *os, void *arg,
cred_t *cr, dmu_tx_t *tx);
void *doca_userarg;
dmu_objset_type_t doca_type;
uint64_t doca_flags;
} dmu_objset_create_arg_t;
/*ARGSUSED*/
static int
dmu_objset_create_check(void *arg, dmu_tx_t *tx)
{
dmu_objset_create_arg_t *doca = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dir_t *pdd;
const char *tail;
int error;
if (strchr(doca->doca_name, '@') != NULL)
return (SET_ERROR(EINVAL));
if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
if (error != 0)
return (error);
if (tail == NULL) {
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EEXIST));
}
error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
doca->doca_cred);
dsl_dir_rele(pdd, FTAG);
return (error);
}
static void
dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
{
dmu_objset_create_arg_t *doca = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dir_t *pdd;
const char *tail;
dsl_dataset_t *ds;
uint64_t obj;
blkptr_t *bp;
objset_t *os;
VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
doca->doca_cred, tx);
VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
bp = dsl_dataset_get_blkptr(ds);
os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
ds, bp, doca->doca_type, tx);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
if (doca->doca_userfunc != NULL) {
doca->doca_userfunc(os, doca->doca_userarg,
doca->doca_cred, tx);
}
spa_history_log_internal_ds(ds, "create", tx, "");
dsl_dataset_rele(ds, FTAG);
dsl_dir_rele(pdd, FTAG);
}
int
dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
{
dmu_objset_create_arg_t doca;
doca.doca_name = name;
doca.doca_cred = CRED();
doca.doca_flags = flags;
doca.doca_userfunc = func;
doca.doca_userarg = arg;
doca.doca_type = type;
return (dsl_sync_task(name,
dmu_objset_create_check, dmu_objset_create_sync, &doca,
5, ZFS_SPACE_CHECK_NORMAL));
}
typedef struct dmu_objset_clone_arg {
const char *doca_clone;
const char *doca_origin;
cred_t *doca_cred;
} dmu_objset_clone_arg_t;
/*ARGSUSED*/
static int
dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
{
dmu_objset_clone_arg_t *doca = arg;
dsl_dir_t *pdd;
const char *tail;
int error;
dsl_dataset_t *origin;
dsl_pool_t *dp = dmu_tx_pool(tx);
if (strchr(doca->doca_clone, '@') != NULL)
return (SET_ERROR(EINVAL));
if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
if (error != 0)
return (error);
if (tail == NULL) {
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EEXIST));
}
error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
doca->doca_cred);
if (error != 0) {
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EDQUOT));
}
dsl_dir_rele(pdd, FTAG);
error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
if (error != 0)
return (error);
/* You can only clone snapshots, not the head datasets. */
if (!origin->ds_is_snapshot) {
dsl_dataset_rele(origin, FTAG);
return (SET_ERROR(EINVAL));
}
dsl_dataset_rele(origin, FTAG);
return (0);
}
static void
dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
{
dmu_objset_clone_arg_t *doca = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dir_t *pdd;
const char *tail;
dsl_dataset_t *origin, *ds;
uint64_t obj;
char namebuf[ZFS_MAX_DATASET_NAME_LEN];
VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
doca->doca_cred, tx);
VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
dsl_dataset_name(origin, namebuf);
spa_history_log_internal_ds(ds, "clone", tx,
"origin=%s (%llu)", namebuf, origin->ds_object);
dsl_dataset_rele(ds, FTAG);
dsl_dataset_rele(origin, FTAG);
dsl_dir_rele(pdd, FTAG);
}
int
dmu_objset_clone(const char *clone, const char *origin)
{
dmu_objset_clone_arg_t doca;
doca.doca_clone = clone;
doca.doca_origin = origin;
doca.doca_cred = CRED();
return (dsl_sync_task(clone,
dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
5, ZFS_SPACE_CHECK_NORMAL));
+}
+
+static int
+dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg)
+{
+ int error = 0;
+ uint64_t object = 0;
+ while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
+ error = dmu_object_remap_indirects(os, object,
+ last_removed_txg);
+ /*
+ * If the ZPL removed the object before we managed to dnode_hold
+ * it, we would get an ENOENT. If the ZPL declares its intent
+ * to remove the object (dnode_free) before we manage to
+ * dnode_hold it, we would get an EEXIST. In either case, we
+ * want to continue remapping the other objects in the objset;
+ * in all other cases, we want to break early.
+ */
+ if (error != 0 && error != ENOENT && error != EEXIST) {
+ break;
+ }
+ }
+ if (error == ESRCH) {
+ error = 0;
+ }
+ return (error);
+}
+
+int
+dmu_objset_remap_indirects(const char *fsname)
+{
+ int error = 0;
+ objset_t *os = NULL;
+ uint64_t last_removed_txg;
+ uint64_t remap_start_txg;
+ dsl_dir_t *dd;
+
+ error = dmu_objset_hold(fsname, FTAG, &os);
+ if (error != 0) {
+ return (error);
+ }
+ dd = dmu_objset_ds(os)->ds_dir;
+
+ if (!spa_feature_is_enabled(dmu_objset_spa(os),
+ SPA_FEATURE_OBSOLETE_COUNTS)) {
+ dmu_objset_rele(os, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) {
+ dmu_objset_rele(os, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If there has not been a removal, we're done.
+ */
+ last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os));
+ if (last_removed_txg == -1ULL) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
+
+ /*
+ * If we have remapped since the last removal, we're done.
+ */
+ if (dsl_dir_is_zapified(dd)) {
+ uint64_t last_remap_txg;
+ if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)),
+ dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+ sizeof (last_remap_txg), 1, &last_remap_txg) == 0 &&
+ last_remap_txg > last_removed_txg) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
+ }
+
+ dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os));
+ error = dmu_objset_remap_indirects_impl(os, last_removed_txg);
+ if (error == 0) {
+ /*
+ * We update the last_remap_txg to be the start txg so that
+ * we can guarantee that every block older than last_remap_txg
+ * that can be remapped has been remapped.
+ */
+ error = dsl_dir_update_last_remap_txg(dd, remap_start_txg);
+ }
+
+ dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
+ dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+
+ return (error);
}
int
dmu_objset_snapshot_one(const char *fsname, const char *snapname)
{
int err;
char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
nvlist_t *snaps = fnvlist_alloc();
fnvlist_add_boolean(snaps, longsnap);
strfree(longsnap);
err = dsl_dataset_snapshot(snaps, NULL, NULL);
fnvlist_free(snaps);
return (err);
}
static void
dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
{
dnode_t *dn;
while ((dn = multilist_sublist_head(list)) != NULL) {
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
ASSERT(dn->dn_dbuf->db_data_pending);
/*
* Initialize dn_zio outside dnode_sync() because the
* meta-dnode needs to set it ouside dnode_sync().
*/
dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
ASSERT(dn->dn_zio);
ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
multilist_sublist_remove(list, dn);
multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
if (newlist != NULL) {
(void) dnode_add_ref(dn, newlist);
multilist_insert(newlist, dn);
}
dnode_sync(dn, tx);
}
}
/* ARGSUSED */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
blkptr_t *bp = zio->io_bp;
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
ASSERT0(BP_GET_LEVEL(bp));
/*
* Update rootbp fill count: it should be the number of objects
* allocated in the object set (not counting the "special"
* objects that are stored in the objset_phys_t -- the meta
* dnode and user/group accounting objects).
*/
bp->blk_fill = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
if (os->os_dsl_dataset != NULL)
rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
*os->os_rootbp = *bp;
if (os->os_dsl_dataset != NULL)
rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
}
/* ARGSUSED */
static void
dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
{
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
objset_t *os = arg;
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
dsl_dataset_t *ds = os->os_dsl_dataset;
dmu_tx_t *tx = os->os_synctx;
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
dsl_dataset_block_born(ds, bp, tx);
}
kmem_free(bp, sizeof (*bp));
}
typedef struct sync_dnodes_arg {
multilist_t *sda_list;
int sda_sublist_idx;
multilist_t *sda_newlist;
dmu_tx_t *sda_tx;
} sync_dnodes_arg_t;
static void
sync_dnodes_task(void *arg)
{
sync_dnodes_arg_t *sda = arg;
multilist_sublist_t *ms =
multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
dmu_objset_sync_dnodes(ms, sda->sda_tx);
multilist_sublist_unlock(ms);
kmem_free(sda, sizeof (*sda));
}
/* called from dsl */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
int txgoff;
zbookmark_phys_t zb;
zio_prop_t zp;
zio_t *zio;
list_t *list;
dbuf_dirty_record_t *dr;
blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
*blkptr_copy = *os->os_rootbp;
dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
ASSERT(dmu_tx_is_syncing(tx));
/* XXX the write_done callback should really give us the tx... */
os->os_synctx = tx;
if (os->os_dsl_dataset == NULL) {
/*
* This is the MOS. If we have upgraded,
* spa_max_replication() could change, so reset
* os_copies here.
*/
os->os_copies = spa_max_replication(os->os_spa);
}
/*
* Create the root block IO
*/
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
arc_release(os->os_phys_buf, &os->os_phys_buf);
dmu_write_policy(os, NULL, 0, 0, &zp);
zio = arc_write(pio, os->os_spa, tx->tx_txg,
blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
&zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
* Sync special dnodes - the parent IO for the sync is the root block
*/
DMU_META_DNODE(os)->dn_zio = zio;
dnode_sync(DMU_META_DNODE(os), tx);
os->os_phys->os_flags = os->os_flags;
if (DMU_USERUSED_DNODE(os) &&
DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
DMU_USERUSED_DNODE(os)->dn_zio = zio;
dnode_sync(DMU_USERUSED_DNODE(os), tx);
DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
}
txgoff = tx->tx_txg & TXG_MASK;
if (dmu_objset_userused_enabled(os)) {
/*
* We must create the list here because it uses the
* dn_dirty_link[] of this txg. But it may already
* exist because we call dsl_dataset_sync() twice per txg.
*/
if (os->os_synced_dnodes == NULL) {
os->os_synced_dnodes =
multilist_create(sizeof (dnode_t),
offsetof(dnode_t, dn_dirty_link[txgoff]),
dnode_multilist_index_func);
} else {
ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
offsetof(dnode_t, dn_dirty_link[txgoff]));
}
}
for (int i = 0;
i < multilist_get_num_sublists(os->os_dirty_dnodes[txgoff]); i++) {
sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
sda->sda_list = os->os_dirty_dnodes[txgoff];
sda->sda_sublist_idx = i;
sda->sda_tx = tx;
(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
sync_dnodes_task, sda, 0);
/* callback frees sda */
}
taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
while ((dr = list_head(list)) != NULL) {
ASSERT0(dr->dr_dbuf->db_level);
list_remove(list, dr);
if (dr->dr_zio)
zio_nowait(dr->dr_zio);
}
/* Enable dnode backfill if enough objects have been freed. */
if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
os->os_rescan_dnodes = B_TRUE;
os->os_freed_dnodes = 0;
}
/*
* Free intent log blocks up to this tx.
*/
zil_sync(os->os_zil, tx);
os->os_phys->os_zil_header = os->os_zil_header;
zio_nowait(zio);
}
boolean_t
dmu_objset_is_dirty(objset_t *os, uint64_t txg)
{
return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK]));
}
static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
void
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
{
used_cbs[ost] = cb;
}
boolean_t
dmu_objset_userused_enabled(objset_t *os)
{
return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
used_cbs[os->os_phys->os_type] != NULL &&
DMU_USERUSED_DNODE(os) != NULL);
}
typedef struct userquota_node {
uint64_t uqn_id;
int64_t uqn_delta;
avl_node_t uqn_node;
} userquota_node_t;
typedef struct userquota_cache {
avl_tree_t uqc_user_deltas;
avl_tree_t uqc_group_deltas;
} userquota_cache_t;
static int
userquota_compare(const void *l, const void *r)
{
const userquota_node_t *luqn = l;
const userquota_node_t *ruqn = r;
if (luqn->uqn_id < ruqn->uqn_id)
return (-1);
if (luqn->uqn_id > ruqn->uqn_id)
return (1);
return (0);
}
static void
do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
{
void *cookie;
userquota_node_t *uqn;
ASSERT(dmu_tx_is_syncing(tx));
cookie = NULL;
while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
&cookie)) != NULL) {
/*
* os_userused_lock protects against concurrent calls to
* zap_increment_int(). It's needed because zap_increment_int()
* is not thread-safe (i.e. not atomic).
*/
mutex_enter(&os->os_userused_lock);
VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
uqn->uqn_id, uqn->uqn_delta, tx));
mutex_exit(&os->os_userused_lock);
kmem_free(uqn, sizeof (*uqn));
}
avl_destroy(&cache->uqc_user_deltas);
cookie = NULL;
while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
&cookie)) != NULL) {
mutex_enter(&os->os_userused_lock);
VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
uqn->uqn_id, uqn->uqn_delta, tx));
mutex_exit(&os->os_userused_lock);
kmem_free(uqn, sizeof (*uqn));
}
avl_destroy(&cache->uqc_group_deltas);
}
static void
userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
{
userquota_node_t search = { .uqn_id = id };
avl_index_t idx;
userquota_node_t *uqn = avl_find(avl, &search, &idx);
if (uqn == NULL) {
uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
uqn->uqn_id = id;
avl_insert(avl, uqn, idx);
}
uqn->uqn_delta += delta;
}
static void
do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
uint64_t user, uint64_t group, boolean_t subtract)
{
if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
int64_t delta = DNODE_SIZE + used;
if (subtract)
delta = -delta;
userquota_update_cache(&cache->uqc_user_deltas, user, delta);
userquota_update_cache(&cache->uqc_group_deltas, group, delta);
}
}
typedef struct userquota_updates_arg {
objset_t *uua_os;
int uua_sublist_idx;
dmu_tx_t *uua_tx;
} userquota_updates_arg_t;
static void
userquota_updates_task(void *arg)
{
userquota_updates_arg_t *uua = arg;
objset_t *os = uua->uua_os;
dmu_tx_t *tx = uua->uua_tx;
dnode_t *dn;
userquota_cache_t cache = { 0 };
multilist_sublist_t *list =
multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
ASSERT(multilist_sublist_head(list) == NULL ||
dmu_objset_userused_enabled(os));
avl_create(&cache.uqc_user_deltas, userquota_compare,
sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
avl_create(&cache.uqc_group_deltas, userquota_compare,
sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
while ((dn = multilist_sublist_head(list)) != NULL) {
int flags;
ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
dn->dn_phys->dn_flags &
DNODE_FLAG_USERUSED_ACCOUNTED);
flags = dn->dn_id_flags;
ASSERT(flags);
if (flags & DN_ID_OLD_EXIST) {
do_userquota_update(&cache,
dn->dn_oldused, dn->dn_oldflags,
dn->dn_olduid, dn->dn_oldgid, B_TRUE);
}
if (flags & DN_ID_NEW_EXIST) {
do_userquota_update(&cache,
DN_USED_BYTES(dn->dn_phys),
dn->dn_phys->dn_flags, dn->dn_newuid,
dn->dn_newgid, B_FALSE);
}
mutex_enter(&dn->dn_mtx);
dn->dn_oldused = 0;
dn->dn_oldflags = 0;
if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
dn->dn_olduid = dn->dn_newuid;
dn->dn_oldgid = dn->dn_newgid;
dn->dn_id_flags |= DN_ID_OLD_EXIST;
if (dn->dn_bonuslen == 0)
dn->dn_id_flags |= DN_ID_CHKED_SPILL;
else
dn->dn_id_flags |= DN_ID_CHKED_BONUS;
}
dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
mutex_exit(&dn->dn_mtx);
multilist_sublist_remove(list, dn);
dnode_rele(dn, os->os_synced_dnodes);
}
do_userquota_cacheflush(os, &cache, tx);
multilist_sublist_unlock(list);
kmem_free(uua, sizeof (*uua));
}
void
dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
{
if (!dmu_objset_userused_enabled(os))
return;
/* Allocate the user/groupused objects if necessary. */
if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
VERIFY0(zap_create_claim(os,
DMU_USERUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
VERIFY0(zap_create_claim(os,
DMU_GROUPUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
}
for (int i = 0;
i < multilist_get_num_sublists(os->os_synced_dnodes); i++) {
userquota_updates_arg_t *uua =
kmem_alloc(sizeof (*uua), KM_SLEEP);
uua->uua_os = os;
uua->uua_sublist_idx = i;
uua->uua_tx = tx;
/* note: caller does taskq_wait() */
(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
userquota_updates_task, uua, 0);
/* callback frees uua */
}
}
/*
* Returns a pointer to data to find uid/gid from
*
* If a dirty record for transaction group that is syncing can't
* be found then NULL is returned. In the NULL case it is assumed
* the uid/gid aren't changing.
*/
static void *
dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dbuf_dirty_record_t *dr, **drp;
void *data;
if (db->db_dirtycnt == 0)
return (db->db.db_data); /* Nothing is changing */
for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
if (dr->dr_txg == tx->tx_txg)
break;
if (dr == NULL) {
data = NULL;
} else {
dnode_t *dn;
DB_DNODE_ENTER(dr->dr_dbuf);
dn = DB_DNODE(dr->dr_dbuf);
if (dn->dn_bonuslen == 0 &&
dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
data = dr->dt.dl.dr_data->b_data;
else
data = dr->dt.dl.dr_data;
DB_DNODE_EXIT(dr->dr_dbuf);
}
return (data);
}
void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
void *data = NULL;
dmu_buf_impl_t *db = NULL;
uint64_t *user = NULL;
uint64_t *group = NULL;
int flags = dn->dn_id_flags;
int error;
boolean_t have_spill = B_FALSE;
if (!dmu_objset_userused_enabled(dn->dn_objset))
return;
if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
DN_ID_CHKED_SPILL)))
return;
if (before && dn->dn_bonuslen != 0)
data = DN_BONUS(dn->dn_phys);
else if (!before && dn->dn_bonuslen != 0) {
if (dn->dn_bonus) {
db = dn->dn_bonus;
mutex_enter(&db->db_mtx);
data = dmu_objset_userquota_find_data(db, tx);
} else {
data = DN_BONUS(dn->dn_phys);
}
} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
int rf = 0;
if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
error = dmu_spill_hold_by_dnode(dn,
rf | DB_RF_MUST_SUCCEED,
FTAG, (dmu_buf_t **)&db);
ASSERT(error == 0);
mutex_enter(&db->db_mtx);
data = (before) ? db->db.db_data :
dmu_objset_userquota_find_data(db, tx);
have_spill = B_TRUE;
} else {
mutex_enter(&dn->dn_mtx);
dn->dn_id_flags |= DN_ID_CHKED_BONUS;
mutex_exit(&dn->dn_mtx);
return;
}
if (before) {
ASSERT(data);
user = &dn->dn_olduid;
group = &dn->dn_oldgid;
} else if (data) {
user = &dn->dn_newuid;
group = &dn->dn_newgid;
}
/*
* Must always call the callback in case the object
* type has changed and that type isn't an object type to track
*/
error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
user, group);
/*
* Preserve existing uid/gid when the callback can't determine
* what the new uid/gid are and the callback returned EEXIST.
* The EEXIST error tells us to just use the existing uid/gid.
* If we don't know what the old values are then just assign
* them to 0, since that is a new file being created.
*/
if (!before && data == NULL && error == EEXIST) {
if (flags & DN_ID_OLD_EXIST) {
dn->dn_newuid = dn->dn_olduid;
dn->dn_newgid = dn->dn_oldgid;
} else {
dn->dn_newuid = 0;
dn->dn_newgid = 0;
}
error = 0;
}
if (db)
mutex_exit(&db->db_mtx);
mutex_enter(&dn->dn_mtx);
if (error == 0 && before)
dn->dn_id_flags |= DN_ID_OLD_EXIST;
if (error == 0 && !before)
dn->dn_id_flags |= DN_ID_NEW_EXIST;
if (have_spill) {
dn->dn_id_flags |= DN_ID_CHKED_SPILL;
} else {
dn->dn_id_flags |= DN_ID_CHKED_BONUS;
}
mutex_exit(&dn->dn_mtx);
if (have_spill)
dmu_buf_rele((dmu_buf_t *)db, FTAG);
}
boolean_t
dmu_objset_userspace_present(objset_t *os)
{
return (os->os_phys->os_flags &
OBJSET_FLAG_USERACCOUNTING_COMPLETE);
}
int
dmu_objset_userspace_upgrade(objset_t *os)
{
uint64_t obj;
int err = 0;
if (dmu_objset_userspace_present(os))
return (0);
if (!dmu_objset_userused_enabled(os))
return (SET_ERROR(ENOTSUP));
if (dmu_objset_is_snapshot(os))
return (SET_ERROR(EINVAL));
/*
* We simply need to mark every object dirty, so that it will be
* synced out and now accounted. If this is called
* concurrently, or if we already did some work before crashing,
* that's fine, since we track each object's accounted state
* independently.
*/
for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
dmu_tx_t *tx;
dmu_buf_t *db;
int objerr;
if (issig(JUSTLOOKING) && issig(FORREAL))
return (SET_ERROR(EINTR));
objerr = dmu_bonus_hold(os, obj, FTAG, &db);
if (objerr != 0)
continue;
tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, obj);
objerr = dmu_tx_assign(tx, TXG_WAIT);
if (objerr != 0) {
dmu_tx_abort(tx);
continue;
}
dmu_buf_will_dirty(db, tx);
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
}
os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
txg_wait_synced(dmu_objset_pool(os), 0);
return (0);
}
void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp)
{
dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
usedobjsp, availobjsp);
}
uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}
void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
stat->dds_type = os->os_phys->os_type;
if (os->os_dsl_dataset)
dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}
void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
ASSERT(os->os_dsl_dataset ||
os->os_phys->os_type == DMU_OST_META);
if (os->os_dsl_dataset != NULL)
dsl_dataset_stats(os->os_dsl_dataset, nv);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
os->os_phys->os_type);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
dmu_objset_userspace_present(os));
}
int
dmu_objset_is_snapshot(objset_t *os)
{
if (os->os_dsl_dataset != NULL)
return (os->os_dsl_dataset->ds_is_snapshot);
else
return (B_FALSE);
}
int
dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
boolean_t *conflict)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
uint64_t ignored;
if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
return (SET_ERROR(ENOENT));
return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
MT_NORMALIZE, real, maxlen, conflict));
}
int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
zap_cursor_t cursor;
zap_attribute_t attr;
ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
return (SET_ERROR(ENOENT));
zap_cursor_init_serialized(&cursor,
ds->ds_dir->dd_pool->dp_meta_objset,
dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
if (zap_cursor_retrieve(&cursor, &attr) != 0) {
zap_cursor_fini(&cursor);
return (SET_ERROR(ENOENT));
}
if (strlen(attr.za_name) + 1 > namelen) {
zap_cursor_fini(&cursor);
return (SET_ERROR(ENAMETOOLONG));
}
(void) strcpy(name, attr.za_name);
if (idp)
*idp = attr.za_first_integer;
if (case_conflict)
*case_conflict = attr.za_normalization_conflict;
zap_cursor_advance(&cursor);
*offp = zap_cursor_serialize(&cursor);
zap_cursor_fini(&cursor);
return (0);
}
int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp)
{
dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
zap_cursor_t cursor;
zap_attribute_t attr;
/* there is no next dir on a snapshot! */
if (os->os_dsl_dataset->ds_object !=
dsl_dir_phys(dd)->dd_head_dataset_obj)
return (SET_ERROR(ENOENT));
zap_cursor_init_serialized(&cursor,
dd->dd_pool->dp_meta_objset,
dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
if (zap_cursor_retrieve(&cursor, &attr) != 0) {
zap_cursor_fini(&cursor);
return (SET_ERROR(ENOENT));
}
if (strlen(attr.za_name) + 1 > namelen) {
zap_cursor_fini(&cursor);
return (SET_ERROR(ENAMETOOLONG));
}
(void) strcpy(name, attr.za_name);
if (idp)
*idp = attr.za_first_integer;
zap_cursor_advance(&cursor);
*offp = zap_cursor_serialize(&cursor);
zap_cursor_fini(&cursor);
return (0);
}
typedef struct dmu_objset_find_ctx {
taskq_t *dc_tq;
dsl_pool_t *dc_dp;
uint64_t dc_ddobj;
char *dc_ddname; /* last component of ddobj's name */
int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
void *dc_arg;
int dc_flags;
kmutex_t *dc_error_lock;
int *dc_error;
} dmu_objset_find_ctx_t;
static void
dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
dsl_pool_t *dp = dcp->dc_dp;
dsl_dir_t *dd;
dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
uint64_t thisobj;
int err = 0;
/* don't process if there already was an error */
if (*dcp->dc_error != 0)
goto out;
/*
* Note: passing the name (dc_ddname) here is optional, but it
* improves performance because we don't need to call
* zap_value_search() to determine the name.
*/
err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
if (err != 0)
goto out;
/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
if (dd->dd_myname[0] == '$') {
dsl_dir_rele(dd, FTAG);
goto out;
}
thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
/*
* Iterate over all children.
*/
if (dcp->dc_flags & DS_FIND_CHILDREN) {
for (zap_cursor_init(&zc, dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT3U(attr->za_integer_length, ==,
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);
dmu_objset_find_ctx_t *child_dcp =
kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
*child_dcp = *dcp;
child_dcp->dc_ddobj = attr->za_first_integer;
child_dcp->dc_ddname = spa_strdup(attr->za_name);
if (dcp->dc_tq != NULL)
(void) taskq_dispatch(dcp->dc_tq,
dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
else
dmu_objset_find_dp_impl(child_dcp);
}
zap_cursor_fini(&zc);
}
/*
* Iterate over all snapshots.
*/
if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
dsl_dataset_t *ds;
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err == 0) {
uint64_t snapobj;
snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
dsl_dataset_rele(ds, FTAG);
for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT3U(attr->za_integer_length, ==,
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);
err = dsl_dataset_hold_obj(dp,
attr->za_first_integer, FTAG, &ds);
if (err != 0)
break;
err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
if (err != 0)
break;
}
zap_cursor_fini(&zc);
}
}
kmem_free(attr, sizeof (zap_attribute_t));
if (err != 0) {
dsl_dir_rele(dd, FTAG);
goto out;
}
/*
* Apply to self.
*/
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
/*
* Note: we hold the dir while calling dsl_dataset_hold_obj() so
* that the dir will remain cached, and we won't have to re-instantiate
* it (which could be expensive due to finding its name via
* zap_value_search()).
*/
dsl_dir_rele(dd, FTAG);
if (err != 0)
goto out;
err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
out:
if (err != 0) {
mutex_enter(dcp->dc_error_lock);
/* only keep first error */
if (*dcp->dc_error == 0)
*dcp->dc_error = err;
mutex_exit(dcp->dc_error_lock);
}
if (dcp->dc_ddname != NULL)
spa_strfree(dcp->dc_ddname);
kmem_free(dcp, sizeof (*dcp));
}
static void
dmu_objset_find_dp_cb(void *arg)
{
dmu_objset_find_ctx_t *dcp = arg;
dsl_pool_t *dp = dcp->dc_dp;
/*
* We need to get a pool_config_lock here, as there are several
* asssert(pool_config_held) down the stack. Getting a lock via
* dsl_pool_config_enter is risky, as it might be stalled by a
* pending writer. This would deadlock, as the write lock can
* only be granted when our parent thread gives up the lock.
* The _prio interface gives us priority over a pending writer.
*/
dsl_pool_config_enter_prio(dp, FTAG);
dmu_objset_find_dp_impl(dcp);
dsl_pool_config_exit(dp, FTAG);
}
/*
* Find objsets under and including ddobj, call func(ds) on each.
* The order for the enumeration is completely undefined.
* func is called with dsl_pool_config held.
*/
int
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
{
int error = 0;
taskq_t *tq = NULL;
int ntasks;
dmu_objset_find_ctx_t *dcp;
kmutex_t err_lock;
mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
dcp->dc_tq = NULL;
dcp->dc_dp = dp;
dcp->dc_ddobj = ddobj;
dcp->dc_ddname = NULL;
dcp->dc_func = func;
dcp->dc_arg = arg;
dcp->dc_flags = flags;
dcp->dc_error_lock = &err_lock;
dcp->dc_error = &error;
if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
/*
* In case a write lock is held we can't make use of
* parallelism, as down the stack of the worker threads
* the lock is asserted via dsl_pool_config_held.
* In case of a read lock this is solved by getting a read
* lock in each worker thread, which isn't possible in case
* of a writer lock. So we fall back to the synchronous path
* here.
* In the future it might be possible to get some magic into
* dsl_pool_config_held in a way that it returns true for
* the worker threads so that a single lock held from this
* thread suffices. For now, stay single threaded.
*/
dmu_objset_find_dp_impl(dcp);
mutex_destroy(&err_lock);
return (error);
}
ntasks = dmu_find_threads;
if (ntasks == 0)
ntasks = vdev_count_leaves(dp->dp_spa) * 4;
tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
INT_MAX, 0);
if (tq == NULL) {
kmem_free(dcp, sizeof (*dcp));
mutex_destroy(&err_lock);
return (SET_ERROR(ENOMEM));
}
dcp->dc_tq = tq;
/* dcp will be freed by task */
(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
/*
* PORTING: this code relies on the property of taskq_wait to wait
* until no more tasks are queued and no more tasks are active. As
* we always queue new tasks from within other tasks, task_wait
* reliably waits for the full recursion to finish, even though we
* enqueue new tasks after taskq_wait has been called.
* On platforms other than illumos, taskq_wait may not have this
* property.
*/
taskq_wait(tq);
taskq_destroy(tq);
mutex_destroy(&err_lock);
return (error);
}
/*
* Find all objsets under name, and for each, call 'func(child_name, arg)'.
* The dp_config_rwlock must not be held when this is called, and it
* will not be held when the callback is called.
* Therefore this function should only be used when the pool is not changing
* (e.g. in syncing context), or the callback can deal with the possible races.
*/
static int
dmu_objset_find_impl(spa_t *spa, const char *name,
int func(const char *, void *), void *arg, int flags)
{
dsl_dir_t *dd;
dsl_pool_t *dp = spa_get_dsl(spa);
dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
char *child;
uint64_t thisobj;
int err;
dsl_pool_config_enter(dp, FTAG);
err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
if (err != 0) {
dsl_pool_config_exit(dp, FTAG);
return (err);
}
/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
if (dd->dd_myname[0] == '$') {
dsl_dir_rele(dd, FTAG);
dsl_pool_config_exit(dp, FTAG);
return (0);
}
thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
/*
* Iterate over all children.
*/
if (flags & DS_FIND_CHILDREN) {
for (zap_cursor_init(&zc, dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT3U(attr->za_integer_length, ==,
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);
child = kmem_asprintf("%s/%s", name, attr->za_name);
dsl_pool_config_exit(dp, FTAG);
err = dmu_objset_find_impl(spa, child,
func, arg, flags);
dsl_pool_config_enter(dp, FTAG);
strfree(child);
if (err != 0)
break;
}
zap_cursor_fini(&zc);
if (err != 0) {
dsl_dir_rele(dd, FTAG);
dsl_pool_config_exit(dp, FTAG);
kmem_free(attr, sizeof (zap_attribute_t));
return (err);
}
}
/*
* Iterate over all snapshots.
*/
if (flags & DS_FIND_SNAPSHOTS) {
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err == 0) {
uint64_t snapobj;
snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
dsl_dataset_rele(ds, FTAG);
for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT3U(attr->za_integer_length, ==,
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);
child = kmem_asprintf("%s@%s",
name, attr->za_name);
dsl_pool_config_exit(dp, FTAG);
err = func(child, arg);
dsl_pool_config_enter(dp, FTAG);
strfree(child);
if (err != 0)
break;
}
zap_cursor_fini(&zc);
}
}
dsl_dir_rele(dd, FTAG);
kmem_free(attr, sizeof (zap_attribute_t));
dsl_pool_config_exit(dp, FTAG);
if (err != 0)
return (err);
/* Apply to self. */
return (func(name, arg));
}
/*
* See comment above dmu_objset_find_impl().
*/
int
dmu_objset_find(char *name, int func(const char *, void *), void *arg,
int flags)
{
spa_t *spa;
int error;
error = spa_open(name, &spa, FTAG);
if (error != 0)
return (error);
error = dmu_objset_find_impl(spa, name, func, arg, flags);
spa_close(spa, FTAG);
return (error);
}
void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
os->os_user_ptr = user_ptr;
}
void *
dmu_objset_get_user(objset_t *os)
{
ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
return (os->os_user_ptr);
}
/*
* Determine name of filesystem, given name of snapshot.
* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
*/
int
dmu_fsname(const char *snapname, char *buf)
{
char *atp = strchr(snapname, '@');
if (atp == NULL)
return (SET_ERROR(EINVAL));
if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
(void) strlcpy(buf, snapname, atp - snapname + 1);
return (0);
}
/*
* Call when we think we're going to write/free space in open context to track
* the amount of dirty data in the open txg, which is also the amount
* of memory that can not be evicted until this txg syncs.
*/
void
dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
if (ds != NULL) {
dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 332525)
@@ -1,1320 +1,1337 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h>
#include <sys/spa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/varargs.h>
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
uint64_t arg1, uint64_t arg2);
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
tx->tx_dir = dd;
if (dd != NULL)
tx->tx_pool = dd->dd_pool;
list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
offsetof(dmu_tx_hold_t, txh_node));
list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
offsetof(dmu_tx_callback_t, dcb_node));
tx->tx_start = gethrtime();
return (tx);
}
dmu_tx_t *
dmu_tx_create(objset_t *os)
{
dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
tx->tx_objset = os;
return (tx);
}
dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
dmu_tx_t *tx = dmu_tx_create_dd(NULL);
txg_verify(dp->dp_spa, txg);
tx->tx_pool = dp;
tx->tx_txg = txg;
tx->tx_anyobj = TRUE;
return (tx);
}
int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
return (tx->tx_anyobj);
}
int
dmu_tx_private_ok(dmu_tx_t *tx)
{
return (tx->tx_anyobj);
}
static dmu_tx_hold_t *
dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
uint64_t arg1, uint64_t arg2)
{
dmu_tx_hold_t *txh;
if (dn != NULL) {
(void) refcount_add(&dn->dn_holds, tx);
if (tx->tx_txg != 0) {
mutex_enter(&dn->dn_mtx);
/*
* dn->dn_assigned_txg == tx->tx_txg doesn't pose a
* problem, but there's no way for it to happen (for
* now, at least).
*/
ASSERT(dn->dn_assigned_txg == 0);
dn->dn_assigned_txg = tx->tx_txg;
(void) refcount_add(&dn->dn_tx_holds, tx);
mutex_exit(&dn->dn_mtx);
}
}
txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
txh->txh_tx = tx;
txh->txh_dnode = dn;
refcount_create(&txh->txh_space_towrite);
refcount_create(&txh->txh_memory_tohold);
txh->txh_type = type;
txh->txh_arg1 = arg1;
txh->txh_arg2 = arg2;
list_insert_tail(&tx->tx_holds, txh);
return (txh);
}
static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
dnode_t *dn = NULL;
dmu_tx_hold_t *txh;
int err;
if (object != DMU_NEW_OBJECT) {
err = dnode_hold(os, object, FTAG, &dn);
if (err != 0) {
tx->tx_err = err;
return (NULL);
}
}
txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
if (dn != NULL)
dnode_rele(dn, FTAG);
return (txh);
}
void
dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
{
/*
* If we're syncing, they can manipulate any object anyhow, and
* the hold on the dnode_t can cause problems.
*/
if (!dmu_tx_is_syncing(tx))
(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
}
/*
* This function reads specified data from disk. The specified data will
* be needed to perform the transaction -- i.e, it will be read after
* we do dmu_tx_assign(). There are two reasons that we read the data now
* (before dmu_tx_assign()):
*
* 1. Reading it now has potentially better performance. The transaction
* has not yet been assigned, so the TXG is not held open, and also the
* caller typically has less locks held when calling dmu_tx_hold_*() than
* after the transaction has been assigned. This reduces the lock (and txg)
* hold times, thus reducing lock contention.
*
* 2. It is easier for callers (primarily the ZPL) to handle i/o errors
* that are detected before they start making changes to the DMU state
* (i.e. now). Once the transaction has been assigned, and some DMU
* state has been changed, it can be difficult to recover from an i/o
* error (e.g. to undo the changes already made in memory at the DMU
* layer). Typically code to do so does not exist in the caller -- it
* assumes that the data has already been cached and thus i/o errors are
* not possible.
*
* It has been observed that the i/o initiated here can be a performance
* problem, and it appears to be optional, because we don't look at the
* data which is read. However, removing this read would only serve to
* move the work elsewhere (after the dmu_tx_assign()), where it may
* have a greater impact on performance (in addition to the impact on
* fault tolerance noted above).
*/
static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
int err;
dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold_level(dn, level, blkid, FTAG);
rw_exit(&dn->dn_struct_rwlock);
if (db == NULL)
return (SET_ERROR(EIO));
err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
dbuf_rele(db, FTAG);
return (err);
}
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
dnode_t *dn = txh->txh_dnode;
int err = 0;
if (len == 0)
return;
(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);
if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
err = SET_ERROR(EFBIG);
if (dn == NULL)
return;
/*
* For i/o error checking, read the blocks that will be needed
* to perform the write: the first and last level-0 blocks (if
* they are not aligned, i.e. if they are partial-block writes),
* and all the level-1 blocks.
*/
if (dn->dn_maxblkid == 0) {
if (off < dn->dn_datablksz &&
(off > 0 || len < dn->dn_datablksz)) {
err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
if (err != 0) {
txh->txh_tx->tx_err = err;
}
}
} else {
zio_t *zio = zio_root(dn->dn_objset->os_spa,
NULL, NULL, ZIO_FLAG_CANFAIL);
/* first level-0 block */
uint64_t start = off >> dn->dn_datablkshift;
if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
err = dmu_tx_check_ioerr(zio, dn, 0, start);
if (err != 0) {
txh->txh_tx->tx_err = err;
}
}
/* last level-0 block */
uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
if (end != start && end <= dn->dn_maxblkid &&
P2PHASE(off + len, dn->dn_datablksz)) {
err = dmu_tx_check_ioerr(zio, dn, 0, end);
if (err != 0) {
txh->txh_tx->tx_err = err;
}
}
/* level-1 blocks */
if (dn->dn_nlevels > 1) {
int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
for (uint64_t i = (start >> shft) + 1;
i < end >> shft; i++) {
err = dmu_tx_check_ioerr(zio, dn, 1, i);
if (err != 0) {
txh->txh_tx->tx_err = err;
}
}
}
err = zio_wait(zio);
if (err != 0) {
txh->txh_tx->tx_err = err;
}
}
}
static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
(void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
}
void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
dmu_tx_hold_t *txh;
ASSERT0(tx->tx_txg);
ASSERT3U(len, <=, DMU_MAX_ACCESS);
ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
object, THT_WRITE, off, len);
if (txh != NULL) {
dmu_tx_count_write(txh, off, len);
dmu_tx_count_dnode(txh);
}
}
void
+dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_WRITE, 0, 0);
+ if (txh == NULL)
+ return;
+
+ dnode_t *dn = txh->txh_dnode;
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ 1ULL << dn->dn_indblkshift, FTAG);
+ dmu_tx_count_dnode(txh);
+}
+
+void
dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
{
dmu_tx_hold_t *txh;
ASSERT0(tx->tx_txg);
ASSERT3U(len, <=, DMU_MAX_ACCESS);
ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
if (txh != NULL) {
dmu_tx_count_write(txh, off, len);
dmu_tx_count_dnode(txh);
}
}
/*
* This function marks the transaction as being a "net free". The end
* result is that refquotas will be disabled for this transaction, and
* this transaction will be able to use half of the pool space overhead
* (see dsl_pool_adjustedsize()). Therefore this function should only
* be called for transactions that we expect will not cause a net increase
* in the amount of space used (but it's OK if that is occasionally not true).
*/
void
dmu_tx_mark_netfree(dmu_tx_t *tx)
{
tx->tx_netfree = B_TRUE;
}
static void
dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
dmu_tx_t *tx;
dnode_t *dn;
int err;
tx = txh->txh_tx;
ASSERT(tx->tx_txg == 0);
dn = txh->txh_dnode;
dmu_tx_count_dnode(txh);
if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
/*
* For i/o error checking, we read the first and last level-0
* blocks if they are not aligned, and all the level-1 blocks.
*
* Note: dbuf_free_range() assumes that we have not instantiated
* any level-0 dbufs that will be completely freed. Therefore we must
* exercise care to not read or count the first and last blocks
* if they are blocksize-aligned.
*/
if (dn->dn_datablkshift == 0) {
if (off != 0 || len < dn->dn_datablksz)
dmu_tx_count_write(txh, 0, dn->dn_datablksz);
} else {
/* first block will be modified if it is not aligned */
if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
dmu_tx_count_write(txh, off, 1);
/* last block will be modified if it is not aligned */
if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
dmu_tx_count_write(txh, off + len, 1);
}
/*
* Check level-1 blocks.
*/
if (dn->dn_nlevels > 1) {
int shift = dn->dn_datablkshift + dn->dn_indblkshift -
SPA_BLKPTRSHIFT;
uint64_t start = off >> shift;
uint64_t end = (off + len) >> shift;
ASSERT(dn->dn_indblkshift != 0);
/*
* dnode_reallocate() can result in an object with indirect
* blocks having an odd data block size. In this case,
* just check the single block.
*/
if (dn->dn_datablkshift == 0)
start = end = 0;
zio_t *zio = zio_root(tx->tx_pool->dp_spa,
NULL, NULL, ZIO_FLAG_CANFAIL);
for (uint64_t i = start; i <= end; i++) {
uint64_t ibyte = i << shift;
err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
i = ibyte >> shift;
if (err == ESRCH || i > end)
break;
if (err != 0) {
tx->tx_err = err;
(void) zio_wait(zio);
return;
}
(void) refcount_add_many(&txh->txh_memory_tohold,
1 << dn->dn_indblkshift, FTAG);
err = dmu_tx_check_ioerr(zio, dn, 1, i);
if (err != 0) {
tx->tx_err = err;
(void) zio_wait(zio);
return;
}
}
err = zio_wait(zio);
if (err != 0) {
tx->tx_err = err;
return;
}
}
}
void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
dmu_tx_hold_t *txh;
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
object, THT_FREE, off, len);
if (txh != NULL)
(void) dmu_tx_hold_free_impl(txh, off, len);
}
void
dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
dmu_tx_hold_t *txh;
txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
if (txh != NULL)
(void) dmu_tx_hold_free_impl(txh, off, len);
}
static void
dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
{
dmu_tx_t *tx = txh->txh_tx;
dnode_t *dn;
int err;
ASSERT(tx->tx_txg == 0);
dn = txh->txh_dnode;
dmu_tx_count_dnode(txh);
/*
* Modifying a almost-full microzap is around the worst case (128KB)
*
* If it is a fat zap, the worst case would be 7*16KB=112KB:
* - 3 blocks overwritten: target leaf, ptrtbl block, header block
* - 4 new blocks written if adding:
* - 2 blocks for possibly split leaves,
* - 2 grown ptrtbl blocks
*/
(void) refcount_add_many(&txh->txh_space_towrite,
MZAP_MAX_BLKSZ, FTAG);
if (dn == NULL)
return;
ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
if (dn->dn_maxblkid == 0 || name == NULL) {
/*
* This is a microzap (only one block), or we don't know
* the name. Check the first block for i/o errors.
*/
err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
if (err != 0) {
tx->tx_err = err;
}
} else {
/*
* Access the name so that we'll check for i/o errors to
* the leaf blocks, etc. We ignore ENOENT, as this name
* may not yet exist.
*/
err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
if (err == EIO || err == ECKSUM || err == ENXIO) {
tx->tx_err = err;
}
}
}
void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
dmu_tx_hold_t *txh;
ASSERT0(tx->tx_txg);
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
object, THT_ZAP, add, (uintptr_t)name);
if (txh != NULL)
dmu_tx_hold_zap_impl(txh, name);
}
void
dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
{
dmu_tx_hold_t *txh;
ASSERT0(tx->tx_txg);
ASSERT(dn != NULL);
txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
if (txh != NULL)
dmu_tx_hold_zap_impl(txh, name);
}
void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
dmu_tx_hold_t *txh;
ASSERT(tx->tx_txg == 0);
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
object, THT_BONUS, 0, 0);
if (txh)
dmu_tx_count_dnode(txh);
}
void
dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
{
dmu_tx_hold_t *txh;
ASSERT0(tx->tx_txg);
txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
if (txh)
dmu_tx_count_dnode(txh);
}
void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
dmu_tx_hold_t *txh;
ASSERT(tx->tx_txg == 0);
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
DMU_NEW_OBJECT, THT_SPACE, space, 0);
(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
}
#ifdef ZFS_DEBUG
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
boolean_t match_object = B_FALSE;
boolean_t match_offset = B_FALSE;
DB_DNODE_ENTER(db);
dnode_t *dn = DB_DNODE(db);
ASSERT(tx->tx_txg != 0);
ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
ASSERT3U(dn->dn_object, ==, db->db.db_object);
if (tx->tx_anyobj) {
DB_DNODE_EXIT(db);
return;
}
/* XXX No checking on the meta dnode for now */
if (db->db.db_object == DMU_META_DNODE_OBJECT) {
DB_DNODE_EXIT(db);
return;
}
for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
txh = list_next(&tx->tx_holds, txh)) {
ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
match_object = TRUE;
if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
int datablkshift = dn->dn_datablkshift ?
dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
int shift = datablkshift + epbs * db->db_level;
uint64_t beginblk = shift >= 64 ? 0 :
(txh->txh_arg1 >> shift);
uint64_t endblk = shift >= 64 ? 0 :
((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
uint64_t blkid = db->db_blkid;
/* XXX txh_arg2 better not be zero... */
dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
txh->txh_type, beginblk, endblk);
switch (txh->txh_type) {
case THT_WRITE:
if (blkid >= beginblk && blkid <= endblk)
match_offset = TRUE;
/*
* We will let this hold work for the bonus
* or spill buffer so that we don't need to
* hold it when creating a new object.
*/
if (blkid == DMU_BONUS_BLKID ||
blkid == DMU_SPILL_BLKID)
match_offset = TRUE;
/*
* They might have to increase nlevels,
* thus dirtying the new TLIBs. Or the
* might have to change the block size,
* thus dirying the new lvl=0 blk=0.
*/
if (blkid == 0)
match_offset = TRUE;
break;
case THT_FREE:
/*
* We will dirty all the level 1 blocks in
* the free range and perhaps the first and
* last level 0 block.
*/
if (blkid >= beginblk && (blkid <= endblk ||
txh->txh_arg2 == DMU_OBJECT_END))
match_offset = TRUE;
break;
case THT_SPILL:
if (blkid == DMU_SPILL_BLKID)
match_offset = TRUE;
break;
case THT_BONUS:
if (blkid == DMU_BONUS_BLKID)
match_offset = TRUE;
break;
case THT_ZAP:
match_offset = TRUE;
break;
case THT_NEWOBJECT:
match_object = TRUE;
break;
default:
ASSERT(!"bad txh_type");
}
}
if (match_object && match_offset) {
DB_DNODE_EXIT(db);
return;
}
}
DB_DNODE_EXIT(db);
panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
(u_longlong_t)db->db.db_object, db->db_level,
(u_longlong_t)db->db_blkid);
}
#endif
/*
* If we can't do 10 iops, something is wrong. Let us go ahead
* and hit zfs_dirty_data_max.
*/
hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
/*
* We delay transactions when we've determined that the backend storage
* isn't able to accommodate the rate of incoming writes.
*
* If there is already a transaction waiting, we delay relative to when
* that transaction finishes waiting. This way the calculated min_time
* is independent of the number of threads concurrently executing
* transactions.
*
* If we are the only waiter, wait relative to when the transaction
* started, rather than the current time. This credits the transaction for
* "time already served", e.g. reading indirect blocks.
*
* The minimum time for a transaction to take is calculated as:
* min_time = scale * (dirty - min) / (max - dirty)
* min_time is then capped at zfs_delay_max_ns.
*
* The delay has two degrees of freedom that can be adjusted via tunables.
* The percentage of dirty data at which we start to delay is defined by
* zfs_delay_min_dirty_percent. This should typically be at or above
* zfs_vdev_async_write_active_max_dirty_percent so that we only start to
* delay after writing at full speed has failed to keep up with the incoming
* write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
* speaking, this variable determines the amount of delay at the midpoint of
* the curve.
*
* delay
* 10ms +-------------------------------------------------------------*+
* | *|
* 9ms + *+
* | *|
* 8ms + *+
* | * |
* 7ms + * +
* | * |
* 6ms + * +
* | * |
* 5ms + * +
* | * |
* 4ms + * +
* | * |
* 3ms + * +
* | * |
* 2ms + (midpoint) * +
* | | ** |
* 1ms + v *** +
* | zfs_delay_scale ----------> ******** |
* 0 +-------------------------------------*********----------------+
* 0% <- zfs_dirty_data_max -> 100%
*
* Note that since the delay is added to the outstanding time remaining on the
* most recent transaction, the delay is effectively the inverse of IOPS.
* Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
* was chosen such that small changes in the amount of accumulated dirty data
* in the first 3/4 of the curve yield relatively small differences in the
* amount of delay.
*
* The effects can be easier to understand when the amount of delay is
* represented on a log scale:
*
* delay
* 100ms +-------------------------------------------------------------++
* + +
* | |
* + *+
* 10ms + *+
* + ** +
* | (midpoint) ** |
* + | ** +
* 1ms + v **** +
* + zfs_delay_scale ----------> ***** +
* | **** |
* + **** +
* 100us + ** +
* + * +
* | * |
* + * +
* 10us + * +
* + +
* | |
* + +
* +--------------------------------------------------------------+
* 0% <- zfs_dirty_data_max -> 100%
*
* Note here that only as the amount of dirty data approaches its limit does
* the delay start to increase rapidly. The goal of a properly tuned system
* should be to keep the amount of dirty data out of that range by first
* ensuring that the appropriate limits are set for the I/O scheduler to reach
* optimal throughput on the backend storage, and then by changing the value
* of zfs_delay_scale to increase the steepness of the curve.
*/
static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
dsl_pool_t *dp = tx->tx_pool;
uint64_t delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
hrtime_t wakeup, min_tx_time, now;
if (dirty <= delay_min_bytes)
return;
/*
* The caller has already waited until we are under the max.
* We make them pass us the amount of dirty data so we don't
* have to handle the case of it being >= the max, which could
* cause a divide-by-zero if it's == the max.
*/
ASSERT3U(dirty, <, zfs_dirty_data_max);
now = gethrtime();
min_tx_time = zfs_delay_scale *
(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
if (now > tx->tx_start + min_tx_time)
return;
min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
uint64_t, min_tx_time);
mutex_enter(&dp->dp_lock);
wakeup = MAX(tx->tx_start + min_tx_time,
dp->dp_last_wakeup + min_tx_time);
dp->dp_last_wakeup = wakeup;
mutex_exit(&dp->dp_lock);
#ifdef _KERNEL
#ifdef illumos
mutex_enter(&curthread->t_delay_lock);
while (cv_timedwait_hires(&curthread->t_delay_cv,
&curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
continue;
mutex_exit(&curthread->t_delay_lock);
#else
pause_sbt("dmu_tx_delay", nstosbt(wakeup),
nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE);
#endif
#else
hrtime_t delta = wakeup - gethrtime();
struct timespec ts;
ts.tv_sec = delta / NANOSEC;
ts.tv_nsec = delta % NANOSEC;
(void) nanosleep(&ts, NULL);
#endif
}
/*
* This routine attempts to assign the transaction to a transaction group.
* To do so, we must determine if there is sufficient free space on disk.
*
* If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
* on it), then it is assumed that there is sufficient free space,
* unless there's insufficient slop space in the pool (see the comment
* above spa_slop_shift in spa_misc.c).
*
* If it is not a "netfree" transaction, then if the data already on disk
* is over the allowed usage (e.g. quota), this will fail with EDQUOT or
* ENOSPC. Otherwise, if the current rough estimate of pending changes,
* plus the rough estimate of this transaction's changes, may exceed the
* allowed usage, then this will fail with ERESTART, which will cause the
* caller to wait for the pending changes to be written to disk (by waiting
* for the next TXG to open), and then check the space usage again.
*
* The rough estimate of pending changes is comprised of the sum of:
*
* - this transaction's holds' txh_space_towrite
*
* - dd_tempreserved[], which is the sum of in-flight transactions'
* holds' txh_space_towrite (i.e. those transactions that have called
* dmu_tx_assign() but not yet called dmu_tx_commit()).
*
* - dd_space_towrite[], which is the amount of dirtied dbufs.
*
* Note that all of these values are inflated by spa_get_worst_case_asize(),
* which means that we may get ERESTART well before we are actually in danger
* of running out of space, but this also mitigates any small inaccuracies
* in the rough estimate (e.g. txh_space_towrite doesn't take into account
* indirect blocks, and dd_space_towrite[] doesn't take into account changes
* to the MOS).
*
* Note that due to this algorithm, it is possible to exceed the allowed
* usage by one transaction. Also, as we approach the allowed usage,
* we will allow a very limited amount of changes into each TXG, thus
* decreasing performance.
*/
static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
spa_t *spa = tx->tx_pool->dp_spa;
ASSERT0(tx->tx_txg);
if (tx->tx_err)
return (tx->tx_err);
if (spa_suspended(spa)) {
/*
* If the user has indicated a blocking failure mode
* then return ERESTART which will block in dmu_tx_wait().
* Otherwise, return EIO so that an error can get
* propagated back to the VOP calls.
*
* Note that we always honor the txg_how flag regardless
* of the failuremode setting.
*/
if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
!(txg_how & TXG_WAIT))
return (SET_ERROR(EIO));
return (SET_ERROR(ERESTART));
}
if (!tx->tx_dirty_delayed &&
dsl_pool_need_dirty_delay(tx->tx_pool)) {
tx->tx_wait_dirty = B_TRUE;
return (SET_ERROR(ERESTART));
}
tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
tx->tx_needassign_txh = NULL;
/*
* NB: No error returns are allowed after txg_hold_open, but
* before processing the dnode holds, due to the
* dmu_tx_unassign() logic.
*/
uint64_t towrite = 0;
uint64_t tohold = 0;
for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;
if (dn != NULL) {
mutex_enter(&dn->dn_mtx);
if (dn->dn_assigned_txg == tx->tx_txg - 1) {
mutex_exit(&dn->dn_mtx);
tx->tx_needassign_txh = txh;
return (SET_ERROR(ERESTART));
}
if (dn->dn_assigned_txg == 0)
dn->dn_assigned_txg = tx->tx_txg;
ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
(void) refcount_add(&dn->dn_tx_holds, tx);
mutex_exit(&dn->dn_mtx);
}
towrite += refcount_count(&txh->txh_space_towrite);
tohold += refcount_count(&txh->txh_memory_tohold);
}
/* needed allocation: worst-case estimate of write space */
uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
/* calculate memory footprint estimate */
uint64_t memory = towrite + tohold;
if (tx->tx_dir != NULL && asize != 0) {
int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
if (err != 0)
return (err);
}
return (0);
}
static void
dmu_tx_unassign(dmu_tx_t *tx)
{
if (tx->tx_txg == 0)
return;
txg_rele_to_quiesce(&tx->tx_txgh);
/*
* Walk the transaction's hold list, removing the hold on the
* associated dnode, and notifying waiters if the refcount drops to 0.
*/
for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
txh != tx->tx_needassign_txh;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;
if (dn == NULL)
continue;
mutex_enter(&dn->dn_mtx);
ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
dn->dn_assigned_txg = 0;
cv_broadcast(&dn->dn_notxholds);
}
mutex_exit(&dn->dn_mtx);
}
txg_rele_to_sync(&tx->tx_txgh);
tx->tx_lasttried_txg = tx->tx_txg;
tx->tx_txg = 0;
}
/*
* Assign tx to a transaction group; txg_how is a bitmask:
*
* If TXG_WAIT is set and the currently open txg is full, this function
* will wait until there's a new txg. This should be used when no locks
* are being held. With this bit set, this function will only fail if
* we're truly out of space (or over quota).
*
* If TXG_WAIT is *not* set and we can't assign into the currently open
* txg without blocking, this function will return immediately with
* ERESTART. This should be used whenever locks are being held. On an
* ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
* and try again.
*
* If TXG_NOTHROTTLE is set, this indicates that this tx should not be
* delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for
* details on the throttle). This is used by the VFS operations, after
* they have already called dmu_tx_wait() (though most likely on a
* different tx).
*/
int
dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
int err;
ASSERT(tx->tx_txg == 0);
ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
ASSERT(!dsl_pool_sync_context(tx->tx_pool));
/* If we might wait, we must not hold the config lock. */
IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
if ((txg_how & TXG_NOTHROTTLE))
tx->tx_dirty_delayed = B_TRUE;
while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
dmu_tx_unassign(tx);
if (err != ERESTART || !(txg_how & TXG_WAIT))
return (err);
dmu_tx_wait(tx);
}
txg_rele_to_quiesce(&tx->tx_txgh);
return (0);
}
void
dmu_tx_wait(dmu_tx_t *tx)
{
spa_t *spa = tx->tx_pool->dp_spa;
dsl_pool_t *dp = tx->tx_pool;
ASSERT(tx->tx_txg == 0);
ASSERT(!dsl_pool_config_held(tx->tx_pool));
if (tx->tx_wait_dirty) {
/*
* dmu_tx_try_assign() has determined that we need to wait
* because we've consumed much or all of the dirty buffer
* space.
*/
mutex_enter(&dp->dp_lock);
while (dp->dp_dirty_total >= zfs_dirty_data_max)
cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
uint64_t dirty = dp->dp_dirty_total;
mutex_exit(&dp->dp_lock);
dmu_tx_delay(tx, dirty);
tx->tx_wait_dirty = B_FALSE;
/*
* Note: setting tx_dirty_delayed only has effect if the
* caller used TX_WAIT. Otherwise they are going to
* destroy this tx and try again. The common case,
* zfs_write(), uses TX_WAIT.
*/
tx->tx_dirty_delayed = B_TRUE;
} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
/*
* If the pool is suspended we need to wait until it
* is resumed. Note that it's possible that the pool
* has become active after this thread has tried to
* obtain a tx. If that's the case then tx_lasttried_txg
* would not have been set.
*/
txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
} else if (tx->tx_needassign_txh) {
/*
* A dnode is assigned to the quiescing txg. Wait for its
* transaction to complete.
*/
dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
mutex_enter(&dn->dn_mtx);
while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
mutex_exit(&dn->dn_mtx);
tx->tx_needassign_txh = NULL;
} else {
txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
}
}
static void
dmu_tx_destroy(dmu_tx_t *tx)
{
dmu_tx_hold_t *txh;
while ((txh = list_head(&tx->tx_holds)) != NULL) {
dnode_t *dn = txh->txh_dnode;
list_remove(&tx->tx_holds, txh);
refcount_destroy_many(&txh->txh_space_towrite,
refcount_count(&txh->txh_space_towrite));
refcount_destroy_many(&txh->txh_memory_tohold,
refcount_count(&txh->txh_memory_tohold));
kmem_free(txh, sizeof (dmu_tx_hold_t));
if (dn != NULL)
dnode_rele(dn, tx);
}
list_destroy(&tx->tx_callbacks);
list_destroy(&tx->tx_holds);
kmem_free(tx, sizeof (dmu_tx_t));
}
void
dmu_tx_commit(dmu_tx_t *tx)
{
ASSERT(tx->tx_txg != 0);
/*
* Go through the transaction's hold list and remove holds on
* associated dnodes, notifying waiters if no holds remain.
*/
for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;
if (dn == NULL)
continue;
mutex_enter(&dn->dn_mtx);
ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
dn->dn_assigned_txg = 0;
cv_broadcast(&dn->dn_notxholds);
}
mutex_exit(&dn->dn_mtx);
}
if (tx->tx_tempreserve_cookie)
dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
if (!list_is_empty(&tx->tx_callbacks))
txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
if (tx->tx_anyobj == FALSE)
txg_rele_to_sync(&tx->tx_txgh);
dmu_tx_destroy(tx);
}
void
dmu_tx_abort(dmu_tx_t *tx)
{
ASSERT(tx->tx_txg == 0);
/*
* Call any registered callbacks with an error code.
*/
if (!list_is_empty(&tx->tx_callbacks))
dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
dmu_tx_destroy(tx);
}
uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
ASSERT(tx->tx_txg != 0);
return (tx->tx_txg);
}
dsl_pool_t *
dmu_tx_pool(dmu_tx_t *tx)
{
ASSERT(tx->tx_pool != NULL);
return (tx->tx_pool);
}
void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
dmu_tx_callback_t *dcb;
dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
dcb->dcb_func = func;
dcb->dcb_data = data;
list_insert_tail(&tx->tx_callbacks, dcb);
}
/*
* Call all the commit callbacks on a list, with a given error code.
*/
void
dmu_tx_do_callbacks(list_t *cb_list, int error)
{
dmu_tx_callback_t *dcb;
while ((dcb = list_head(cb_list)) != NULL) {
list_remove(cb_list, dcb);
dcb->dcb_func(dcb->dcb_data, error);
kmem_free(dcb, sizeof (dmu_tx_callback_t));
}
}
/*
* Interface to hold a bunch of attributes.
* used for creating new files.
* attrsize is the total size of all attributes
* to be added during object creation
*
* For updating/adding a single attribute dmu_tx_hold_sa() should be used.
*/
/*
* hold necessary attribute name for attribute registration.
* should be a very rare case where this is needed. If it does
* happen it would only happen on the first write to the file system.
*/
static void
dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
{
if (!sa->sa_need_attr_registration)
return;
for (int i = 0; i != sa->sa_num_attrs; i++) {
if (!sa->sa_attr_table[i].sa_registered) {
if (sa->sa_reg_attr_obj)
dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
B_TRUE, sa->sa_attr_table[i].sa_name);
else
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
B_TRUE, sa->sa_attr_table[i].sa_name);
}
}
}
void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
tx->tx_objset, object, THT_SPILL, 0, 0);
(void) refcount_add_many(&txh->txh_space_towrite,
SPA_OLD_MAXBLOCKSIZE, FTAG);
}
void
dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
{
sa_os_t *sa = tx->tx_objset->os_sa;
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
if (tx->tx_objset->os_sa->sa_master_obj == 0)
return;
if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
} else {
dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
}
dmu_tx_sa_registration_hold(sa, tx);
if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
return;
(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
THT_SPILL, 0, 0);
}
/*
* Hold SA attribute
*
* dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
*
* variable_size is the total size of all variable sized attributes
* passed to this function. It is not the total size of all
* variable size attributes that *may* exist on this object.
*/
void
dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
{
uint64_t object;
sa_os_t *sa = tx->tx_objset->os_sa;
ASSERT(hdl != NULL);
object = sa_handle_object(hdl);
dmu_tx_hold_bonus(tx, object);
if (tx->tx_objset->os_sa->sa_master_obj == 0)
return;
if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
}
dmu_tx_sa_registration_hold(sa, tx);
if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
ASSERT(tx->tx_txg == 0);
dmu_tx_hold_spill(tx, object);
} else {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
dnode_t *dn;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if (dn->dn_have_spill) {
ASSERT(tx->tx_txg == 0);
dmu_tx_hold_spill(tx, object);
}
DB_DNODE_EXIT(db);
}
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c (revision 332525)
@@ -1,362 +1,373 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
#include <sys/kstat.h>
/*
* This tunable disables predictive prefetch. Note that it leaves "prescient"
* prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
* prescient prefetch never issues i/os that end up not being needed,
* so it can't hurt performance.
*/
boolean_t zfs_prefetch_disable = B_FALSE;
/* max # of streams per zfetch */
uint32_t zfetch_max_streams = 8;
/* min time before stream reclaim */
uint32_t zfetch_min_sec_reap = 2;
/* max bytes to prefetch per stream (default 8MB) */
uint32_t zfetch_max_distance = 8 * 1024 * 1024;
/* max bytes to prefetch indirects for per stream (default 64MB) */
uint32_t zfetch_max_idistance = 64 * 1024 * 1024;
/* max number of bytes in an array_read in which we allow prefetching (1MB) */
uint64_t zfetch_array_rd_sz = 1024 * 1024;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
&zfs_prefetch_disable, 0, "Disable prefetch");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN,
&zfetch_max_streams, 0, "Max # of streams per zfetch");
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
&zfetch_min_sec_reap, 0, "Min time before stream reclaim");
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
&zfetch_max_distance, 0, "Max bytes to prefetch per stream");
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
&zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
&zfetch_array_rd_sz, 0,
"Number of bytes in a array_read at which we stop prefetching");
typedef struct zfetch_stats {
kstat_named_t zfetchstat_hits;
kstat_named_t zfetchstat_misses;
kstat_named_t zfetchstat_max_streams;
} zfetch_stats_t;
static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "max_streams", KSTAT_DATA_UINT64 },
};
#define ZFETCHSTAT_BUMP(stat) \
atomic_inc_64(&zfetch_stats.stat.value.ui64);
kstat_t *zfetch_ksp;
void
zfetch_init(void)
{
zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (zfetch_ksp != NULL) {
zfetch_ksp->ks_data = &zfetch_stats;
kstat_install(zfetch_ksp);
}
}
void
zfetch_fini(void)
{
if (zfetch_ksp != NULL) {
kstat_delete(zfetch_ksp);
zfetch_ksp = NULL;
}
}
/*
* This takes a pointer to a zfetch structure and a dnode. It performs the
* necessary setup for the zfetch structure, grokking data from the
* associated dnode.
*/
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
if (zf == NULL)
return;
zf->zf_dnode = dno;
list_create(&zf->zf_stream, sizeof (zstream_t),
offsetof(zstream_t, zs_node));
rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
}
static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
list_remove(&zf->zf_stream, zs);
mutex_destroy(&zs->zs_lock);
kmem_free(zs, sizeof (*zs));
}
/*
* Clean-up state associated with a zfetch structure (e.g. destroy the
* streams). This doesn't free the zfetch_t itself, that's left to the caller.
*/
void
dmu_zfetch_fini(zfetch_t *zf)
{
zstream_t *zs;
ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
rw_enter(&zf->zf_rwlock, RW_WRITER);
while ((zs = list_head(&zf->zf_stream)) != NULL)
dmu_zfetch_stream_remove(zf, zs);
rw_exit(&zf->zf_rwlock);
list_destroy(&zf->zf_stream);
rw_destroy(&zf->zf_rwlock);
zf->zf_dnode = NULL;
}
/*
* If there aren't too many streams already, create a new stream.
* The "blkid" argument is the next block that we expect this stream to access.
* While we're here, clean up old streams (which haven't been
* accessed for at least zfetch_min_sec_reap seconds).
*/
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
zstream_t *zs_next;
int numstreams = 0;
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
/*
* Clean up old streams.
*/
for (zstream_t *zs = list_head(&zf->zf_stream);
zs != NULL; zs = zs_next) {
zs_next = list_next(&zf->zf_stream, zs);
if (((gethrtime() - zs->zs_atime) / NANOSEC) >
zfetch_min_sec_reap)
dmu_zfetch_stream_remove(zf, zs);
else
numstreams++;
}
/*
* The maximum number of streams is normally zfetch_max_streams,
* but for small files we lower it such that it's at least possible
* for all the streams to be non-overlapping.
*
* If we are already at the maximum number of streams for this file,
* even after removing old streams, then don't create this stream.
*/
uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
zfetch_max_distance));
if (numstreams >= max_streams) {
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
return;
}
zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
zs->zs_blkid = blkid;
zs->zs_pf_blkid = blkid;
zs->zs_ipf_blkid = blkid;
zs->zs_atime = gethrtime();
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
list_insert_head(&zf->zf_stream, zs);
}
/*
* This is the predictive prefetch entry point. It associates dnode access
* specified with blkid and nblks arguments with prefetch stream, predicts
* further accesses based on that stats and initiates speculative prefetch.
* fetch_data argument specifies whether actual data blocks should be fetched:
* FALSE -- prefetch only indirect blocks for predicted data blocks;
* TRUE -- prefetch predicted data blocks plus following indirect blocks.
*/
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
{
zstream_t *zs;
int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
int64_t pf_ahead_blks, max_blks;
int epbs, max_dist_blks, pf_nblks, ipf_nblks;
uint64_t end_of_access_blkid = blkid + nblks;
+ spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
if (zfs_prefetch_disable)
+ return;
+
+ /*
+ * If we haven't yet loaded the indirect vdevs' mappings, we
+ * can only read from blocks that we carefully ensure are on
+ * concrete vdevs (or previously-loaded indirect vdevs). So we
+ * can't allow the predictive prefetcher to attempt reads of other
+ * blocks (e.g. of the MOS's dnode obejct).
+ */
+ if (!spa_indirect_vdevs_loaded(spa))
return;
/*
* As a fast path for small (single-block) files, ignore access
* to the first block.
*/
if (blkid == 0)
return;
rw_enter(&zf->zf_rwlock, RW_READER);
/*
* Find matching prefetch stream. Depending on whether the accesses
* are block-aligned, first block of the new access may either follow
* the last block of the previous access, or be equal to it.
*/
for (zs = list_head(&zf->zf_stream); zs != NULL;
zs = list_next(&zf->zf_stream, zs)) {
if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
mutex_enter(&zs->zs_lock);
/*
* zs_blkid could have changed before we
* acquired zs_lock; re-check them here.
*/
if (blkid == zs->zs_blkid) {
break;
} else if (blkid + 1 == zs->zs_blkid) {
blkid++;
nblks--;
if (nblks == 0) {
/* Already prefetched this before. */
mutex_exit(&zs->zs_lock);
rw_exit(&zf->zf_rwlock);
return;
}
break;
}
mutex_exit(&zs->zs_lock);
}
}
if (zs == NULL) {
/*
* This access is not part of any existing stream. Create
* a new stream for it.
*/
ZFETCHSTAT_BUMP(zfetchstat_misses);
if (rw_tryupgrade(&zf->zf_rwlock))
dmu_zfetch_stream_create(zf, end_of_access_blkid);
rw_exit(&zf->zf_rwlock);
return;
}
/*
* This access was to a block that we issued a prefetch for on
* behalf of this stream. Issue further prefetches for this stream.
*
* Normally, we start prefetching where we stopped
* prefetching last (zs_pf_blkid). But when we get our first
* hit on this stream, zs_pf_blkid == zs_blkid, we don't
* want to prefetch the block we just accessed. In this case,
* start just after the block we just accessed.
*/
pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
/*
* Double our amount of prefetched data, but don't let the
* prefetch get further ahead than zfetch_max_distance.
*/
if (fetch_data) {
max_dist_blks =
zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
/*
* Previously, we were (zs_pf_blkid - blkid) ahead. We
* want to now be double that, so read that amount again,
* plus the amount we are catching up by (i.e. the amount
* read just now).
*/
pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
pf_nblks = MIN(pf_ahead_blks, max_blks);
} else {
pf_nblks = 0;
}
zs->zs_pf_blkid = pf_start + pf_nblks;
/*
* Do the same for indirects, starting from where we stopped last,
* or where we will stop reading data blocks (and the indirects
* that point to them).
*/
ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
/*
* We want to double our distance ahead of the data prefetch
* (or reader, if we are not prefetching data). Previously, we
* were (zs_ipf_blkid - blkid) ahead. To double that, we read
* that amount again, plus the amount we are catching up by
* (i.e. the amount read now + the amount of data prefetched now).
*/
pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
ipf_nblks = MIN(pf_ahead_blks, max_blks);
zs->zs_ipf_blkid = ipf_start + ipf_nblks;
epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
zs->zs_atime = gethrtime();
zs->zs_blkid = end_of_access_blkid;
mutex_exit(&zs->zs_lock);
rw_exit(&zf->zf_rwlock);
/*
* dbuf_prefetch() is asynchronous (even when it needs to read
* indirect blocks), but we still prefer to drop our locks before
* calling it to reduce the time we hold them.
*/
for (int i = 0; i < pf_nblks; i++) {
dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
}
for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
dbuf_prefetch(zf->zf_dnode, 1, iblk,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
}
ZFETCHSTAT_BUMP(zfetchstat_hits);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 332525)
@@ -1,2004 +1,2003 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/range_tree.h>
static kmem_cache_t *dnode_cache;
/*
* Define DNODE_STATS to turn on statistic gathering. By default, it is only
* turned on when DEBUG is also defined.
*/
#ifdef DEBUG
#define DNODE_STATS
#endif /* DEBUG */
#ifdef DNODE_STATS
#define DNODE_STAT_ADD(stat) ((stat)++)
#else
#define DNODE_STAT_ADD(stat) /* nothing */
#endif /* DNODE_STATS */
static dnode_phys_t dnode_phys_zero;
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN,
&zfs_default_bs, 0, "Default dnode block shift");
SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN,
&zfs_default_ibs, 0, "Default dnode indirect block shift");
#ifdef illumos
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif
static int
dbuf_compare(const void *x1, const void *x2)
{
const dmu_buf_impl_t *d1 = x1;
const dmu_buf_impl_t *d2 = x2;
if (d1->db_level < d2->db_level) {
return (-1);
}
if (d1->db_level > d2->db_level) {
return (1);
}
if (d1->db_blkid < d2->db_blkid) {
return (-1);
}
if (d1->db_blkid > d2->db_blkid) {
return (1);
}
if (d1->db_state == DB_SEARCH) {
ASSERT3S(d2->db_state, !=, DB_SEARCH);
return (-1);
} else if (d2->db_state == DB_SEARCH) {
ASSERT3S(d1->db_state, !=, DB_SEARCH);
return (1);
}
if ((uintptr_t)d1 < (uintptr_t)d2) {
return (-1);
}
if ((uintptr_t)d1 > (uintptr_t)d2) {
return (1);
}
return (0);
}
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
dnode_t *dn = arg;
int i;
rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
/*
* Every dbuf has a reference, and dropping a tracked reference is
* O(number of references), so don't track dn_holds.
*/
refcount_create_untracked(&dn->dn_holds);
refcount_create(&dn->dn_tx_holds);
list_link_init(&dn->dn_link);
bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
list_link_init(&dn->dn_dirty_link[i]);
dn->dn_free_ranges[i] = NULL;
list_create(&dn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_assigned_txg = 0;
dn->dn_dirtyctx = 0;
dn->dn_dirtyctx_firstset = NULL;
dn->dn_bonus = NULL;
dn->dn_have_spill = B_FALSE;
dn->dn_zio = NULL;
dn->dn_oldused = 0;
dn->dn_oldflags = 0;
dn->dn_olduid = 0;
dn->dn_oldgid = 0;
dn->dn_newuid = 0;
dn->dn_newgid = 0;
dn->dn_id_flags = 0;
dn->dn_dbufs_count = 0;
avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
dn->dn_moved = 0;
POINTER_INVALIDATE(&dn->dn_objset);
return (0);
}
/* ARGSUSED */
static void
dnode_dest(void *arg, void *unused)
{
int i;
dnode_t *dn = arg;
rw_destroy(&dn->dn_struct_rwlock);
mutex_destroy(&dn->dn_mtx);
mutex_destroy(&dn->dn_dbufs_mtx);
cv_destroy(&dn->dn_notxholds);
refcount_destroy(&dn->dn_holds);
refcount_destroy(&dn->dn_tx_holds);
ASSERT(!list_link_active(&dn->dn_link));
for (i = 0; i < TXG_SIZE; i++) {
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
list_destroy(&dn->dn_dirty_records[i]);
ASSERT0(dn->dn_next_nblkptr[i]);
ASSERT0(dn->dn_next_nlevels[i]);
ASSERT0(dn->dn_next_indblkshift[i]);
ASSERT0(dn->dn_next_bonustype[i]);
ASSERT0(dn->dn_rm_spillblk[i]);
ASSERT0(dn->dn_next_bonuslen[i]);
ASSERT0(dn->dn_next_blksz[i]);
}
ASSERT0(dn->dn_allocated_txg);
ASSERT0(dn->dn_free_txg);
ASSERT0(dn->dn_assigned_txg);
ASSERT0(dn->dn_dirtyctx);
ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
ASSERT3P(dn->dn_bonus, ==, NULL);
ASSERT(!dn->dn_have_spill);
ASSERT3P(dn->dn_zio, ==, NULL);
ASSERT0(dn->dn_oldused);
ASSERT0(dn->dn_oldflags);
ASSERT0(dn->dn_olduid);
ASSERT0(dn->dn_oldgid);
ASSERT0(dn->dn_newuid);
ASSERT0(dn->dn_newgid);
ASSERT0(dn->dn_id_flags);
ASSERT0(dn->dn_dbufs_count);
avl_destroy(&dn->dn_dbufs);
}
void
dnode_init(void)
{
ASSERT(dnode_cache == NULL);
dnode_cache = kmem_cache_create("dnode_t",
sizeof (dnode_t),
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
kmem_cache_set_move(dnode_cache, dnode_move);
}
void
dnode_fini(void)
{
kmem_cache_destroy(dnode_cache);
dnode_cache = NULL;
}
#ifdef ZFS_DEBUG
void
dnode_verify(dnode_t *dn)
{
int drop_struct_lock = FALSE;
ASSERT(dn->dn_phys);
ASSERT(dn->dn_objset);
ASSERT(dn->dn_handle->dnh_dnode == dn);
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
return;
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
rw_enter(&dn->dn_struct_rwlock, RW_READER);
drop_struct_lock = TRUE;
}
if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
int i;
ASSERT3U(dn->dn_indblkshift, >=, 0);
ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
if (dn->dn_datablkshift) {
ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
}
ASSERT3U(dn->dn_nlevels, <=, 30);
ASSERT(DMU_OT_IS_VALID(dn->dn_type));
ASSERT3U(dn->dn_nblkptr, >=, 1);
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
ASSERT3U(dn->dn_datablksz, ==,
dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
}
}
if (dn->dn_phys->dn_type != DMU_OT_NONE)
ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
if (dn->dn_dbuf != NULL) {
ASSERT3P(dn->dn_phys, ==,
(dnode_phys_t *)dn->dn_dbuf->db.db_data +
(dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
}
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
}
#endif
void
dnode_byteswap(dnode_phys_t *dnp)
{
uint64_t *buf64 = (void*)&dnp->dn_blkptr;
int i;
if (dnp->dn_type == DMU_OT_NONE) {
bzero(dnp, sizeof (dnode_phys_t));
return;
}
dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
dnp->dn_used = BSWAP_64(dnp->dn_used);
/*
* dn_nblkptr is only one byte, so it's OK to read it in either
* byte order. We can't read dn_bouslen.
*/
ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
buf64[i] = BSWAP_64(buf64[i]);
/*
* OK to check dn_bonuslen for zero, because it won't matter if
* we have the wrong byte order. This is necessary because the
* dnode dnode is smaller than a regular dnode.
*/
if (dnp->dn_bonuslen != 0) {
/*
* Note that the bonus length calculated here may be
* longer than the actual bonus buffer. This is because
* we always put the bonus buffer after the last block
* pointer (instead of packing it against the end of the
* dnode buffer).
*/
int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
size_t len = DN_MAX_BONUSLEN - off;
ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(dnp->dn_bonustype);
dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
}
/* Swap SPILL block if we have one */
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
}
void
dnode_buf_byteswap(void *vbuf, size_t size)
{
dnode_phys_t *buf = vbuf;
int i;
ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
size >>= DNODE_SHIFT;
for (i = 0; i < size; i++) {
dnode_byteswap(buf);
buf++;
}
}
void
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
{
ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
dnode_setdirty(dn, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
(dn->dn_nblkptr-1) * sizeof (blkptr_t));
dn->dn_bonuslen = newsize;
if (newsize == 0)
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
else
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
rw_exit(&dn->dn_struct_rwlock);
}
void
dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
{
ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
dnode_setdirty(dn, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dn->dn_bonustype = newtype;
dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
rw_exit(&dn->dn_struct_rwlock);
}
void
dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
dnode_setdirty(dn, tx);
dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
dn->dn_have_spill = B_FALSE;
}
static void
dnode_setdblksz(dnode_t *dn, int size)
{
ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
dn->dn_datablksz = size;
dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
uint64_t object, dnode_handle_t *dnh)
{
dnode_t *dn;
dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
ASSERT(!POINTER_IS_VALID(dn->dn_objset));
dn->dn_moved = 0;
/*
* Defer setting dn_objset until the dnode is ready to be a candidate
* for the dnode_move() callback.
*/
dn->dn_object = object;
dn->dn_dbuf = db;
dn->dn_handle = dnh;
dn->dn_phys = dnp;
if (dnp->dn_datablkszsec) {
dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
} else {
dn->dn_datablksz = 0;
dn->dn_datablkszsec = 0;
dn->dn_datablkshift = 0;
}
dn->dn_indblkshift = dnp->dn_indblkshift;
dn->dn_nlevels = dnp->dn_nlevels;
dn->dn_type = dnp->dn_type;
dn->dn_nblkptr = dnp->dn_nblkptr;
dn->dn_checksum = dnp->dn_checksum;
dn->dn_compress = dnp->dn_compress;
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
dn->dn_maxblkid = dnp->dn_maxblkid;
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dn->dn_id_flags = 0;
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
mutex_enter(&os->os_lock);
if (dnh->dnh_dnode != NULL) {
/* Lost the allocation race. */
mutex_exit(&os->os_lock);
kmem_cache_free(dnode_cache, dn);
return (dnh->dnh_dnode);
}
/*
* Exclude special dnodes from os_dnodes so an empty os_dnodes
* signifies that the special dnodes have no references from
* their children (the entries in os_dnodes). This allows
* dnode_destroy() to easily determine if the last child has
* been removed and then complete eviction of the objset.
*/
if (!DMU_OBJECT_IS_SPECIAL(object))
list_insert_head(&os->os_dnodes, dn);
membar_producer();
/*
* Everything else must be valid before assigning dn_objset
* makes the dnode eligible for dnode_move().
*/
dn->dn_objset = os;
dnh->dnh_dnode = dn;
mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
return (dn);
}
/*
* Caller must be holding the dnode handle, which is released upon return.
*/
static void
dnode_destroy(dnode_t *dn)
{
objset_t *os = dn->dn_objset;
boolean_t complete_os_eviction = B_FALSE;
ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
mutex_enter(&os->os_lock);
POINTER_INVALIDATE(&dn->dn_objset);
if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
list_remove(&os->os_dnodes, dn);
complete_os_eviction =
list_is_empty(&os->os_dnodes) &&
list_link_active(&os->os_evicting_node);
}
mutex_exit(&os->os_lock);
/* the dnode can no longer move, so we can release the handle */
zrl_remove(&dn->dn_handle->dnh_zrlock);
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_assigned_txg = 0;
dn->dn_dirtyctx = 0;
if (dn->dn_dirtyctx_firstset != NULL) {
kmem_free(dn->dn_dirtyctx_firstset, 1);
dn->dn_dirtyctx_firstset = NULL;
}
if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
}
dn->dn_zio = NULL;
dn->dn_have_spill = B_FALSE;
dn->dn_oldused = 0;
dn->dn_oldflags = 0;
dn->dn_olduid = 0;
dn->dn_oldgid = 0;
dn->dn_newuid = 0;
dn->dn_newgid = 0;
dn->dn_id_flags = 0;
dmu_zfetch_fini(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
if (complete_os_eviction)
dmu_objset_evict_done(os);
}
void
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
int i;
ASSERT3U(blocksize, <=,
spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
blocksize = 1 << zfs_default_bs;
else
blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
if (ibs == 0)
ibs = zfs_default_ibs;
ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
dn->dn_object, tx->tx_txg, blocksize, ibs);
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
ASSERT(ot != DMU_OT_NONE);
ASSERT(DMU_OT_IS_VALID(ot));
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
(bonustype == DMU_OT_SA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT0(dn->dn_maxblkid);
ASSERT0(dn->dn_allocated_txg);
ASSERT0(dn->dn_assigned_txg);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
ASSERT(avl_is_empty(&dn->dn_dbufs));
for (i = 0; i < TXG_SIZE; i++) {
ASSERT0(dn->dn_next_nblkptr[i]);
ASSERT0(dn->dn_next_nlevels[i]);
ASSERT0(dn->dn_next_indblkshift[i]);
ASSERT0(dn->dn_next_bonuslen[i]);
ASSERT0(dn->dn_next_bonustype[i]);
ASSERT0(dn->dn_rm_spillblk[i]);
ASSERT0(dn->dn_next_blksz[i]);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
}
dn->dn_type = ot;
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
dn->dn_nlevels = 1;
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
dn->dn_nblkptr = 1;
else
dn->dn_nblkptr = 1 +
((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
dn->dn_compress = ZIO_COMPRESS_INHERIT;
dn->dn_dirtyctx = 0;
dn->dn_free_txg = 0;
if (dn->dn_dirtyctx_firstset) {
kmem_free(dn->dn_dirtyctx_firstset, 1);
dn->dn_dirtyctx_firstset = NULL;
}
dn->dn_allocated_txg = tx->tx_txg;
dn->dn_id_flags = 0;
dnode_setdirty(dn, tx);
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
int nblkptr;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
ASSERT3U(blocksize, <=,
spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
ASSERT0(blocksize % SPA_MINBLOCKSIZE);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0) ||
(bonustype == DMU_OT_SA && bonuslen == 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
dn->dn_id_flags = 0;
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dnode_setdirty(dn, tx);
if (dn->dn_datablksz != blocksize) {
/* change blocksize */
ASSERT(dn->dn_maxblkid == 0 &&
(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
dnode_block_freed(dn, 0)));
dnode_setdblksz(dn, blocksize);
dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
}
if (dn->dn_bonuslen != bonuslen)
dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
nblkptr = 1;
else
nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
if (dn->dn_bonustype != bonustype)
dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
dbuf_rm_spill(dn, tx);
dnode_rm_spill(dn, tx);
}
rw_exit(&dn->dn_struct_rwlock);
/* change type */
dn->dn_type = ot;
/* change bonus size and type */
mutex_enter(&dn->dn_mtx);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_nblkptr = nblkptr;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
dn->dn_compress = ZIO_COMPRESS_INHERIT;
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
/* fix up the bonus db_size */
if (dn->dn_bonus) {
dn->dn_bonus->db.db_size =
DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
}
dn->dn_allocated_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
}
#ifdef DNODE_STATS
static struct {
uint64_t dms_dnode_invalid;
uint64_t dms_dnode_recheck1;
uint64_t dms_dnode_recheck2;
uint64_t dms_dnode_special;
uint64_t dms_dnode_handle;
uint64_t dms_dnode_rwlock;
uint64_t dms_dnode_active;
} dnode_move_stats;
#endif /* DNODE_STATS */
static void
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
{
int i;
ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
/* Copy fields. */
ndn->dn_objset = odn->dn_objset;
ndn->dn_object = odn->dn_object;
ndn->dn_dbuf = odn->dn_dbuf;
ndn->dn_handle = odn->dn_handle;
ndn->dn_phys = odn->dn_phys;
ndn->dn_type = odn->dn_type;
ndn->dn_bonuslen = odn->dn_bonuslen;
ndn->dn_bonustype = odn->dn_bonustype;
ndn->dn_nblkptr = odn->dn_nblkptr;
ndn->dn_checksum = odn->dn_checksum;
ndn->dn_compress = odn->dn_compress;
ndn->dn_nlevels = odn->dn_nlevels;
ndn->dn_indblkshift = odn->dn_indblkshift;
ndn->dn_datablkshift = odn->dn_datablkshift;
ndn->dn_datablkszsec = odn->dn_datablkszsec;
ndn->dn_datablksz = odn->dn_datablksz;
ndn->dn_maxblkid = odn->dn_maxblkid;
bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
sizeof (odn->dn_next_nblkptr));
bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
sizeof (odn->dn_next_nlevels));
bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
sizeof (odn->dn_next_indblkshift));
bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
sizeof (odn->dn_next_bonustype));
bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
sizeof (odn->dn_rm_spillblk));
bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
sizeof (odn->dn_next_bonuslen));
bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
sizeof (odn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
list_move_tail(&ndn->dn_dirty_records[i],
&odn->dn_dirty_records[i]);
}
bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
sizeof (odn->dn_free_ranges));
ndn->dn_allocated_txg = odn->dn_allocated_txg;
ndn->dn_free_txg = odn->dn_free_txg;
ndn->dn_assigned_txg = odn->dn_assigned_txg;
ndn->dn_dirtyctx = odn->dn_dirtyctx;
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
ASSERT(avl_is_empty(&ndn->dn_dbufs));
avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
ndn->dn_dbufs_count = odn->dn_dbufs_count;
ndn->dn_bonus = odn->dn_bonus;
ndn->dn_have_spill = odn->dn_have_spill;
ndn->dn_zio = odn->dn_zio;
ndn->dn_oldused = odn->dn_oldused;
ndn->dn_oldflags = odn->dn_oldflags;
ndn->dn_olduid = odn->dn_olduid;
ndn->dn_oldgid = odn->dn_oldgid;
ndn->dn_newuid = odn->dn_newuid;
ndn->dn_newgid = odn->dn_newgid;
ndn->dn_id_flags = odn->dn_id_flags;
dmu_zfetch_init(&ndn->dn_zfetch, NULL);
list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
/*
* Update back pointers. Updating the handle fixes the back pointer of
* every descendant dbuf as well as the bonus dbuf.
*/
ASSERT(ndn->dn_handle->dnh_dnode == odn);
ndn->dn_handle->dnh_dnode = ndn;
if (ndn->dn_zfetch.zf_dnode == odn) {
ndn->dn_zfetch.zf_dnode = ndn;
}
/*
* Invalidate the original dnode by clearing all of its back pointers.
*/
odn->dn_dbuf = NULL;
odn->dn_handle = NULL;
avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
odn->dn_dbufs_count = 0;
odn->dn_bonus = NULL;
odn->dn_zfetch.zf_dnode = NULL;
/*
* Set the low bit of the objset pointer to ensure that dnode_move()
* recognizes the dnode as invalid in any subsequent callback.
*/
POINTER_INVALIDATE(&odn->dn_objset);
/*
* Satisfy the destructor.
*/
for (i = 0; i < TXG_SIZE; i++) {
list_create(&odn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
odn->dn_free_ranges[i] = NULL;
odn->dn_next_nlevels[i] = 0;
odn->dn_next_indblkshift[i] = 0;
odn->dn_next_bonustype[i] = 0;
odn->dn_rm_spillblk[i] = 0;
odn->dn_next_bonuslen[i] = 0;
odn->dn_next_blksz[i] = 0;
}
odn->dn_allocated_txg = 0;
odn->dn_free_txg = 0;
odn->dn_assigned_txg = 0;
odn->dn_dirtyctx = 0;
odn->dn_dirtyctx_firstset = NULL;
odn->dn_have_spill = B_FALSE;
odn->dn_zio = NULL;
odn->dn_oldused = 0;
odn->dn_oldflags = 0;
odn->dn_olduid = 0;
odn->dn_oldgid = 0;
odn->dn_newuid = 0;
odn->dn_newgid = 0;
odn->dn_id_flags = 0;
/*
* Mark the dnode.
*/
ndn->dn_moved = 1;
odn->dn_moved = (uint8_t)-1;
}
#ifdef illumos
#ifdef _KERNEL
/*ARGSUSED*/
static kmem_cbrc_t
dnode_move(void *buf, void *newbuf, size_t size, void *arg)
{
dnode_t *odn = buf, *ndn = newbuf;
objset_t *os;
int64_t refcount;
uint32_t dbufs;
/*
* The dnode is on the objset's list of known dnodes if the objset
* pointer is valid. We set the low bit of the objset pointer when
* freeing the dnode to invalidate it, and the memory patterns written
* by kmem (baddcafe and deadbeef) set at least one of the two low bits.
* A newly created dnode sets the objset pointer last of all to indicate
* that the dnode is known and in a valid state to be moved by this
* function.
*/
os = odn->dn_objset;
if (!POINTER_IS_VALID(os)) {
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
return (KMEM_CBRC_DONT_KNOW);
}
/*
* Ensure that the objset does not go away during the move.
*/
rw_enter(&os_lock, RW_WRITER);
if (os != odn->dn_objset) {
rw_exit(&os_lock);
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
return (KMEM_CBRC_DONT_KNOW);
}
/*
* If the dnode is still valid, then so is the objset. We know that no
* valid objset can be freed while we hold os_lock, so we can safely
* ensure that the objset remains in use.
*/
mutex_enter(&os->os_lock);
/*
* Recheck the objset pointer in case the dnode was removed just before
* acquiring the lock.
*/
if (os != odn->dn_objset) {
mutex_exit(&os->os_lock);
rw_exit(&os_lock);
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
return (KMEM_CBRC_DONT_KNOW);
}
/*
* At this point we know that as long as we hold os->os_lock, the dnode
* cannot be freed and fields within the dnode can be safely accessed.
* The objset listing this dnode cannot go away as long as this dnode is
* on its list.
*/
rw_exit(&os_lock);
if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
mutex_exit(&os->os_lock);
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
return (KMEM_CBRC_NO);
}
ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
/*
* Lock the dnode handle to prevent the dnode from obtaining any new
* holds. This also prevents the descendant dbufs and the bonus dbuf
* from accessing the dnode, so that we can discount their holds. The
* handle is safe to access because we know that while the dnode cannot
* go away, neither can its handle. Once we hold dnh_zrlock, we can
* safely move any dnode referenced only by dbufs.
*/
if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
mutex_exit(&os->os_lock);
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
return (KMEM_CBRC_LATER);
}
/*
* Ensure a consistent view of the dnode's holds and the dnode's dbufs.
* We need to guarantee that there is a hold for every dbuf in order to
* determine whether the dnode is actively referenced. Falsely matching
* a dbuf to an active hold would lead to an unsafe move. It's possible
* that a thread already having an active dnode hold is about to add a
* dbuf, and we can't compare hold and dbuf counts while the add is in
* progress.
*/
if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
zrl_exit(&odn->dn_handle->dnh_zrlock);
mutex_exit(&os->os_lock);
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
return (KMEM_CBRC_LATER);
}
/*
* A dbuf may be removed (evicted) without an active dnode hold. In that
* case, the dbuf count is decremented under the handle lock before the
* dbuf's hold is released. This order ensures that if we count the hold
* after the dbuf is removed but before its hold is released, we will
* treat the unmatched hold as active and exit safely. If we count the
* hold before the dbuf is removed, the hold is discounted, and the
* removal is blocked until the move completes.
*/
refcount = refcount_count(&odn->dn_holds);
ASSERT(refcount >= 0);
dbufs = odn->dn_dbufs_count;
/* We can't have more dbufs than dnode holds. */
ASSERT3U(dbufs, <=, refcount);
DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
uint32_t, dbufs);
if (refcount > dbufs) {
rw_exit(&odn->dn_struct_rwlock);
zrl_exit(&odn->dn_handle->dnh_zrlock);
mutex_exit(&os->os_lock);
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
return (KMEM_CBRC_LATER);
}
rw_exit(&odn->dn_struct_rwlock);
/*
* At this point we know that anyone with a hold on the dnode is not
* actively referencing it. The dnode is known and in a valid state to
* move. We're holding the locks needed to execute the critical section.
*/
dnode_move_impl(odn, ndn);
list_link_replace(&odn->dn_link, &ndn->dn_link);
/* If the dnode was safe to move, the refcount cannot have changed. */
ASSERT(refcount == refcount_count(&ndn->dn_holds));
ASSERT(dbufs == ndn->dn_dbufs_count);
zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
mutex_exit(&os->os_lock);
return (KMEM_CBRC_YES);
}
#endif /* _KERNEL */
#endif /* illumos */
void
dnode_special_close(dnode_handle_t *dnh)
{
dnode_t *dn = dnh->dnh_dnode;
/*
* Wait for final references to the dnode to clear. This can
* only happen if the arc is asyncronously evicting state that
* has a hold on this dnode while we are trying to evict this
* dnode.
*/
while (refcount_count(&dn->dn_holds) > 0)
delay(1);
ASSERT(dn->dn_dbuf == NULL ||
dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
zrl_add(&dnh->dnh_zrlock);
dnode_destroy(dn); /* implicit zrl_remove() */
zrl_destroy(&dnh->dnh_zrlock);
dnh->dnh_dnode = NULL;
}
void
dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
dnode_handle_t *dnh)
{
dnode_t *dn;
dn = dnode_create(os, dnp, NULL, object, dnh);
zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
}
static void
dnode_buf_evict_async(void *dbu)
{
dnode_children_t *children_dnodes = dbu;
int i;
for (i = 0; i < children_dnodes->dnc_count; i++) {
dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
dnode_t *dn;
/*
* The dnode handle lock guards against the dnode moving to
* another valid address, so there is no need here to guard
* against changes to or from NULL.
*/
if (dnh->dnh_dnode == NULL) {
zrl_destroy(&dnh->dnh_zrlock);
continue;
}
zrl_add(&dnh->dnh_zrlock);
dn = dnh->dnh_dnode;
/*
* If there are holds on this dnode, then there should
* be holds on the dnode's containing dbuf as well; thus
* it wouldn't be eligible for eviction and this function
* would not have been called.
*/
ASSERT(refcount_is_zero(&dn->dn_holds));
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
dnode_destroy(dn); /* implicit zrl_remove() */
zrl_destroy(&dnh->dnh_zrlock);
dnh->dnh_dnode = NULL;
}
kmem_free(children_dnodes, sizeof (dnode_children_t) +
children_dnodes->dnc_count * sizeof (dnode_handle_t));
}
/*
* errors:
* EINVAL - invalid object number.
* EIO - i/o error.
* succeeds even for free dnodes.
*/
int
dnode_hold_impl(objset_t *os, uint64_t object, int flag,
void *tag, dnode_t **dnp)
{
int epb, idx, err;
int drop_struct_lock = FALSE;
int type;
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
dnode_children_t *children_dnodes;
dnode_handle_t *dnh;
/*
* If you are holding the spa config lock as writer, you shouldn't
* be asking the DMU to do *anything* unless it's the root pool
* which may require us to read from the root filesystem while
* holding some (not all) of the locks as writer.
*/
ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
(spa_is_root(os->os_spa) &&
spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
dn = (object == DMU_USERUSED_OBJECT) ?
DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
if (dn == NULL)
return (SET_ERROR(ENOENT));
type = dn->dn_type;
if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
return (SET_ERROR(ENOENT));
if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
return (SET_ERROR(EEXIST));
DNODE_VERIFY(dn);
(void) refcount_add(&dn->dn_holds, tag);
*dnp = dn;
return (0);
}
if (object == 0 || object >= DN_MAX_OBJECT)
return (SET_ERROR(EINVAL));
mdn = DMU_META_DNODE(os);
ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
DNODE_VERIFY(mdn);
if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
rw_enter(&mdn->dn_struct_rwlock, RW_READER);
drop_struct_lock = TRUE;
}
blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
rw_exit(&mdn->dn_struct_rwlock);
if (db == NULL)
return (SET_ERROR(EIO));
err = dbuf_read(db, NULL, DB_RF_CANFAIL);
if (err) {
dbuf_rele(db, FTAG);
return (err);
}
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
epb = db->db.db_size >> DNODE_SHIFT;
idx = object & (epb-1);
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
children_dnodes = dmu_buf_get_user(&db->db);
if (children_dnodes == NULL) {
int i;
dnode_children_t *winner;
children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
epb * sizeof (dnode_handle_t), KM_SLEEP);
children_dnodes->dnc_count = epb;
dnh = &children_dnodes->dnc_children[0];
for (i = 0; i < epb; i++) {
zrl_init(&dnh[i].dnh_zrlock);
}
dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
dnode_buf_evict_async, NULL);
winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
if (winner != NULL) {
for (i = 0; i < epb; i++) {
zrl_destroy(&dnh[i].dnh_zrlock);
}
kmem_free(children_dnodes, sizeof (dnode_children_t) +
epb * sizeof (dnode_handle_t));
children_dnodes = winner;
}
}
ASSERT(children_dnodes->dnc_count == epb);
dnh = &children_dnodes->dnc_children[idx];
zrl_add(&dnh->dnh_zrlock);
dn = dnh->dnh_dnode;
if (dn == NULL) {
dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
dn = dnode_create(os, phys, db, object, dnh);
}
mutex_enter(&dn->dn_mtx);
type = dn->dn_type;
if (dn->dn_free_txg ||
((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
((flag & DNODE_MUST_BE_FREE) &&
(type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
mutex_exit(&dn->dn_mtx);
zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dnh);
mutex_exit(&dn->dn_mtx);
/* Now we can rely on the hold to prevent the dnode from moving. */
zrl_remove(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
ASSERT3U(dn->dn_object, ==, object);
dbuf_rele(db, FTAG);
*dnp = dn;
return (0);
}
/*
* Return held dnode if the object is allocated, NULL if not.
*/
int
dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
}
/*
* Can only add a reference if there is already at least one
* reference on the dnode. Returns FALSE if unable to add a
* new reference.
*/
boolean_t
dnode_add_ref(dnode_t *dn, void *tag)
{
mutex_enter(&dn->dn_mtx);
if (refcount_is_zero(&dn->dn_holds)) {
mutex_exit(&dn->dn_mtx);
return (FALSE);
}
VERIFY(1 < refcount_add(&dn->dn_holds, tag));
mutex_exit(&dn->dn_mtx);
return (TRUE);
}
void
dnode_rele(dnode_t *dn, void *tag)
{
mutex_enter(&dn->dn_mtx);
dnode_rele_and_unlock(dn, tag);
}
void
dnode_rele_and_unlock(dnode_t *dn, void *tag)
{
uint64_t refs;
/* Get while the hold prevents the dnode from moving. */
dmu_buf_impl_t *db = dn->dn_dbuf;
dnode_handle_t *dnh = dn->dn_handle;
refs = refcount_remove(&dn->dn_holds, tag);
mutex_exit(&dn->dn_mtx);
/*
* It's unsafe to release the last hold on a dnode by dnode_rele() or
* indirectly by dbuf_rele() while relying on the dnode handle to
* prevent the dnode from moving, since releasing the last hold could
* result in the dnode's parent dbuf evicting its dnode handles. For
* that reason anyone calling dnode_rele() or dbuf_rele() without some
* other direct or indirect hold on the dnode must first drop the dnode
* handle.
*/
ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
if (refs == 0 && db != NULL) {
/*
* Another thread could add a hold to the dnode handle in
* dnode_hold_impl() while holding the parent dbuf. Since the
* hold on the parent dbuf prevents the handle from being
* destroyed, the hold on the handle is OK. We can't yet assert
* that the handle has zero references, but that will be
* asserted anyway when the handle gets destroyed.
*/
dbuf_rele(db, dnh);
}
}
void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
dsl_dataset_dirty(os->os_dsl_dataset, tx);
return;
}
DNODE_VERIFY(dn);
#ifdef ZFS_DEBUG
mutex_enter(&dn->dn_mtx);
ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
mutex_exit(&dn->dn_mtx);
#endif
/*
* Determine old uid/gid when necessary
*/
dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
/*
* If we are already marked dirty, we're done.
*/
if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
multilist_sublist_unlock(mls);
return;
}
ASSERT(!refcount_is_zero(&dn->dn_holds) ||
!avl_is_empty(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
dn->dn_object, txg);
multilist_sublist_insert_head(mls, dn);
multilist_sublist_unlock(mls);
/*
* The dnode maintains a hold on its containing dbuf as
* long as there are holds on it. Each instantiated child
* dbuf maintains a hold on the dnode. When the last child
* drops its hold, the dnode will drop its hold on the
* containing dbuf. We add a "dirty hold" here so that the
* dnode will hang around after we finish processing its
* children.
*/
VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
(void) dbuf_dirty(dn->dn_dbuf, tx);
dsl_dataset_dirty(os->os_dsl_dataset, tx);
}
void
dnode_free(dnode_t *dn, dmu_tx_t *tx)
{
mutex_enter(&dn->dn_mtx);
if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
mutex_exit(&dn->dn_mtx);
return;
}
dn->dn_free_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
}
/*
* Try to change the block size for the indicated dnode. This can only
* succeed if there are no blocks allocated or dirty beyond first block
*/
int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
dmu_buf_impl_t *db;
int err;
ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (size == 0)
size = SPA_MINBLOCKSIZE;
else
size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
if (ibs == dn->dn_indblkshift)
ibs = 0;
if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
return (0);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
/* Check for any allocated blocks beyond the first */
if (dn->dn_maxblkid != 0)
goto fail;
mutex_enter(&dn->dn_dbufs_mtx);
for (db = avl_first(&dn->dn_dbufs); db != NULL;
db = AVL_NEXT(&dn->dn_dbufs, db)) {
if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
db->db_blkid != DMU_SPILL_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
goto fail;
}
}
mutex_exit(&dn->dn_dbufs_mtx);
if (ibs && dn->dn_nlevels != 1)
goto fail;
/* resize the old block */
err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0)
dbuf_new_size(db, size, tx);
else if (err != ENOENT)
goto fail;
dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
if (ibs) {
dn->dn_indblkshift = ibs;
dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
}
/* rele after we have fixed the blocksize in the dnode */
if (db)
dbuf_rele(db, FTAG);
rw_exit(&dn->dn_struct_rwlock);
return (0);
fail:
rw_exit(&dn->dn_struct_rwlock);
return (SET_ERROR(ENOTSUP));
}
/* read-holding callers must not rely on the lock being continuously held */
void
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
{
uint64_t txgoff = tx->tx_txg & TXG_MASK;
int epbs, new_nlevels;
uint64_t sz;
ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(have_read ?
RW_READ_HELD(&dn->dn_struct_rwlock) :
RW_WRITE_HELD(&dn->dn_struct_rwlock));
/*
* if we have a read-lock, check to see if we need to do any work
* before upgrading to a write-lock.
*/
if (have_read) {
if (blkid <= dn->dn_maxblkid)
return;
if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
rw_exit(&dn->dn_struct_rwlock);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
}
}
if (blkid <= dn->dn_maxblkid)
goto out;
dn->dn_maxblkid = blkid;
/*
* Compute the number of levels necessary to support the new maxblkid.
*/
new_nlevels = 1;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
for (sz = dn->dn_nblkptr;
sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
new_nlevels++;
if (new_nlevels > dn->dn_nlevels) {
int old_nlevels = dn->dn_nlevels;
dmu_buf_impl_t *db;
list_t *list;
dbuf_dirty_record_t *new, *dr, *dr_next;
dn->dn_nlevels = new_nlevels;
ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
dn->dn_next_nlevels[txgoff] = new_nlevels;
/* dirty the left indirects */
db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
ASSERT(db != NULL);
new = dbuf_dirty(db, tx);
dbuf_rele(db, FTAG);
/* transfer the dirty records to the new indirect */
mutex_enter(&dn->dn_mtx);
mutex_enter(&new->dt.di.dr_mtx);
list = &dn->dn_dirty_records[txgoff];
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
if (dr->dr_dbuf->db_level != new_nlevels-1 &&
dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
dr->dr_parent = new;
}
}
mutex_exit(&new->dt.di.dr_mtx);
mutex_exit(&dn->dn_mtx);
}
out:
if (have_read)
rw_downgrade(&dn->dn_struct_rwlock);
}
static void
dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
if (db != NULL) {
dmu_buf_will_dirty(&db->db, tx);
dbuf_rele(db, FTAG);
}
}
void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
dmu_buf_impl_t *db;
uint64_t blkoff, blkid, nblks;
int blksz, blkshift, head, tail;
int trunc = FALSE;
int epbs;
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
blksz = dn->dn_datablksz;
blkshift = dn->dn_datablkshift;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
if (len == DMU_OBJECT_END) {
len = UINT64_MAX - off;
trunc = TRUE;
}
/*
* First, block align the region to free:
*/
if (ISP2(blksz)) {
head = P2NPHASE(off, blksz);
blkoff = P2PHASE(off, blksz);
if ((off >> blkshift) > dn->dn_maxblkid)
goto out;
} else {
ASSERT(dn->dn_maxblkid == 0);
if (off == 0 && len >= blksz) {
/*
* Freeing the whole block; fast-track this request.
* Note that we won't dirty any indirect blocks,
* which is fine because we will be freeing the entire
* file and thus all indirect blocks will be freed
* by free_children().
*/
blkid = 0;
nblks = 1;
goto done;
} else if (off >= blksz) {
/* Freeing past end-of-data */
goto out;
} else {
/* Freeing part of the block. */
head = blksz - off;
ASSERT3U(head, >, 0);
}
blkoff = off;
}
/* zero out any partial block data at the start of the range */
if (head) {
ASSERT3U(blkoff + head, ==, blksz);
if (len < head)
head = len;
if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
TRUE, FALSE, FTAG, &db) == 0) {
caddr_t data;
/* don't dirty if it isn't on disk and isn't dirty */
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_will_dirty(&db->db, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
data = db->db.db_data;
bzero(data + blkoff, head);
}
dbuf_rele(db, FTAG);
}
off += head;
len -= head;
}
/* If the range was less than one block, we're done */
if (len == 0)
goto out;
/* If the remaining range is past end of file, we're done */
if ((off >> blkshift) > dn->dn_maxblkid)
goto out;
ASSERT(ISP2(blksz));
if (trunc)
tail = 0;
else
tail = P2PHASE(len, blksz);
ASSERT0(P2PHASE(off, blksz));
/* zero out any partial block data at the end of the range */
if (tail) {
if (len < tail)
tail = len;
if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
TRUE, FALSE, FTAG, &db) == 0) {
/* don't dirty if not on disk and not dirty */
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_will_dirty(&db->db, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
bzero(db->db.db_data, tail);
}
dbuf_rele(db, FTAG);
}
len -= tail;
}
/* If the range did not include a full block, we are done */
if (len == 0)
goto out;
ASSERT(IS_P2ALIGNED(off, blksz));
ASSERT(trunc || IS_P2ALIGNED(len, blksz));
blkid = off >> blkshift;
nblks = len >> blkshift;
if (trunc)
nblks += 1;
/*
* Dirty all the indirect blocks in this range. Note that only
* the first and last indirect blocks can actually be written
* (if they were partially freed) -- they must be dirtied, even if
* they do not exist on disk yet. The interior blocks will
* be freed by free_children(), so they will not actually be written.
* Even though these interior blocks will not be written, we
* dirty them for two reasons:
*
* - It ensures that the indirect blocks remain in memory until
* syncing context. (They have already been prefetched by
* dmu_tx_hold_free(), so we don't have to worry about reading
* them serially here.)
*
* - The dirty space accounting will put pressure on the txg sync
* mechanism to begin syncing, and to delay transactions if there
* is a large amount of freeing. Even though these indirect
* blocks will not be written, we could need to write the same
* amount of space if we copy the freed BPs into deadlists.
*/
if (dn->dn_nlevels > 1) {
uint64_t first, last;
first = blkid >> epbs;
dnode_dirty_l1(dn, first, tx);
if (trunc)
last = dn->dn_maxblkid >> epbs;
else
last = (blkid + nblks - 1) >> epbs;
if (last != first)
dnode_dirty_l1(dn, last, tx);
int shift = dn->dn_datablkshift + dn->dn_indblkshift -
SPA_BLKPTRSHIFT;
for (uint64_t i = first + 1; i < last; i++) {
/*
* Set i to the blockid of the next non-hole
* level-1 indirect block at or after i. Note
* that dnode_next_offset() operates in terms of
* level-0-equivalent bytes.
*/
uint64_t ibyte = i << shift;
int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
&ibyte, 2, 1, 0);
i = ibyte >> shift;
if (i >= last)
break;
/*
* Normally we should not see an error, either
* from dnode_next_offset() or dbuf_hold_level()
* (except for ESRCH from dnode_next_offset).
* If there is an i/o error, then when we read
* this block in syncing context, it will use
* ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
* to the "failmode" property. dnode_next_offset()
* doesn't have a flag to indicate MUSTSUCCEED.
*/
if (err != 0)
break;
dnode_dirty_l1(dn, i, tx);
}
}
done:
/*
* Add this range to the dnode range list.
* We will finish up this free operation in the syncing phase.
*/
mutex_enter(&dn->dn_mtx);
int txgoff = tx->tx_txg & TXG_MASK;
if (dn->dn_free_ranges[txgoff] == NULL) {
- dn->dn_free_ranges[txgoff] =
- range_tree_create(NULL, NULL, &dn->dn_mtx);
+ dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
}
range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
blkid, nblks, tx->tx_txg);
mutex_exit(&dn->dn_mtx);
dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
out:
rw_exit(&dn->dn_struct_rwlock);
}
static boolean_t
dnode_spill_freed(dnode_t *dn)
{
int i;
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
break;
}
mutex_exit(&dn->dn_mtx);
return (i < TXG_SIZE);
}
/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;
if (blkid == DMU_BONUS_BLKID)
return (FALSE);
/*
* If we're in the process of opening the pool, dp will not be
* set yet, but there shouldn't be anything dirty.
*/
if (dp == NULL)
return (FALSE);
if (dn->dn_free_txg)
return (TRUE);
if (blkid == DMU_SPILL_BLKID)
return (dnode_spill_freed(dn));
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
if (dn->dn_free_ranges[i] != NULL &&
range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
break;
}
mutex_exit(&dn->dn_mtx);
return (i < TXG_SIZE);
}
/* call from syncing context when we actually write/free space for this dnode */
void
dnode_diduse_space(dnode_t *dn, int64_t delta)
{
uint64_t space;
dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
dn, dn->dn_phys,
(u_longlong_t)dn->dn_phys->dn_used,
(longlong_t)delta);
mutex_enter(&dn->dn_mtx);
space = DN_USED_BYTES(dn->dn_phys);
if (delta > 0) {
ASSERT3U(space + delta, >=, space); /* no overflow */
} else {
ASSERT3U(space, >=, -delta); /* no underflow */
}
space += delta;
if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
dn->dn_phys->dn_used = space >> DEV_BSHIFT;
} else {
dn->dn_phys->dn_used = space;
dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
}
mutex_exit(&dn->dn_mtx);
}
/*
* Scans a block at the indicated "level" looking for a hole or data,
* depending on 'flags'.
*
* If level > 0, then we are scanning an indirect block looking at its
* pointers. If level == 0, then we are looking at a block of dnodes.
*
* If we don't find what we are looking for in the block, we return ESRCH.
* Otherwise, return with *offset pointing to the beginning (if searching
* forwards) or end (if searching backwards) of the range covered by the
* block pointer we matched on (or dnode).
*
* The basic search algorithm used below by dnode_next_offset() is to
* use this function to search up the block tree (widen the search) until
* we find something (i.e., we don't return ESRCH) and then search back
* down the tree (narrow the search) until we reach our original search
* level.
*/
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
int lvl, uint64_t blkfill, uint64_t txg)
{
dmu_buf_impl_t *db = NULL;
void *data = NULL;
uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
uint64_t epb = 1ULL << epbs;
uint64_t minfill, maxfill;
boolean_t hole;
int i, inc, error, span;
dprintf("probing object %llu offset %llx level %d of %u\n",
dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
hole = ((flags & DNODE_FIND_HOLE) != 0);
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
ASSERT(txg == 0 || !hole);
if (lvl == dn->dn_phys->dn_nlevels) {
error = 0;
epb = dn->dn_phys->dn_nblkptr;
data = dn->dn_phys->dn_blkptr;
} else {
uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
if (error) {
if (error != ENOENT)
return (error);
if (hole)
return (0);
/*
* This can only happen when we are searching up
* the block tree for data. We don't really need to
* adjust the offset, as we will just end up looking
* at the pointer to this block in its parent, and its
* going to be unallocated, so we will skip over it.
*/
return (SET_ERROR(ESRCH));
}
error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
if (error) {
dbuf_rele(db, FTAG);
return (error);
}
data = db->db.db_data;
}
if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
db->db_blkptr->blk_birth <= txg ||
BP_IS_HOLE(db->db_blkptr))) {
/*
* This can only happen when we are searching up the tree
* and these conditions mean that we need to keep climbing.
*/
error = SET_ERROR(ESRCH);
} else if (lvl == 0) {
dnode_phys_t *dnp = data;
span = DNODE_SHIFT;
ASSERT(dn->dn_type == DMU_OT_DNODE);
for (i = (*offset >> span) & (blkfill - 1);
i >= 0 && i < blkfill; i += inc) {
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
break;
*offset += (1ULL << span) * inc;
}
if (i < 0 || i == blkfill)
error = SET_ERROR(ESRCH);
} else {
blkptr_t *bp = data;
uint64_t start = *offset;
span = (lvl - 1) * epbs + dn->dn_datablkshift;
minfill = 0;
maxfill = blkfill << ((lvl - 1) * epbs);
if (hole)
maxfill--;
else
minfill++;
*offset = *offset >> span;
for (i = BF64_GET(*offset, 0, epbs);
i >= 0 && i < epb; i += inc) {
if (BP_GET_FILL(&bp[i]) >= minfill &&
BP_GET_FILL(&bp[i]) <= maxfill &&
(hole || bp[i].blk_birth > txg))
break;
if (inc > 0 || *offset > 0)
*offset += inc;
}
*offset = *offset << span;
if (inc < 0) {
/* traversing backwards; position offset at the end */
ASSERT3U(*offset, <=, start);
*offset = MIN(*offset + (1ULL << span) - 1, start);
} else if (*offset < start) {
*offset = start;
}
if (i < 0 || i >= epb)
error = SET_ERROR(ESRCH);
}
if (db)
dbuf_rele(db, FTAG);
return (error);
}
/*
* Find the next hole, data, or sparse region at or after *offset.
* The value 'blkfill' tells us how many items we expect to find
* in an L0 data block; this value is 1 for normal objects,
* DNODES_PER_BLOCK for the meta dnode, and some fraction of
* DNODES_PER_BLOCK when searching for sparse regions thereof.
*
* Examples:
*
* dnode_next_offset(dn, flags, offset, 1, 1, 0);
* Finds the next/previous hole/data in a file.
* Used in dmu_offset_next().
*
* dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
* Finds the next free/allocated dnode an objset's meta-dnode.
* Only finds objects that have new contents since txg (ie.
* bonus buffer changes and content removal are ignored).
* Used in dmu_object_next().
*
* dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
* Finds the next L2 meta-dnode bp that's at most 1/4 full.
* Used in dmu_object_alloc().
*/
int
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
int minlvl, uint64_t blkfill, uint64_t txg)
{
uint64_t initial_offset = *offset;
int lvl, maxlvl;
int error = 0;
if (!(flags & DNODE_FIND_HAVELOCK))
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_phys->dn_nlevels == 0) {
error = SET_ERROR(ESRCH);
goto out;
}
if (dn->dn_datablkshift == 0) {
if (*offset < dn->dn_datablksz) {
if (flags & DNODE_FIND_HOLE)
*offset = dn->dn_datablksz;
} else {
error = SET_ERROR(ESRCH);
}
goto out;
}
maxlvl = dn->dn_phys->dn_nlevels;
for (lvl = minlvl; lvl <= maxlvl; lvl++) {
error = dnode_next_offset_level(dn,
flags, offset, lvl, blkfill, txg);
if (error != ESRCH)
break;
}
while (error == 0 && --lvl >= minlvl) {
error = dnode_next_offset_level(dn,
flags, offset, lvl, blkfill, txg);
}
/*
* There's always a "virtual hole" at the end of the object, even
* if all BP's which physically exist are non-holes.
*/
if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
error = 0;
}
if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
initial_offset < *offset : initial_offset > *offset))
error = SET_ERROR(ESRCH);
out:
if (!(flags & DNODE_FIND_HAVELOCK))
rw_exit(&dn->dn_struct_rwlock);
return (error);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c (revision 332525)
@@ -1,4057 +1,4252 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 RackTop Systems.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
*/
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_send.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
+#include <sys/vdev.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/dsl_bookmark.h>
#include <sys/dmu_send.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <zfs_fletcher.h>
SYSCTL_DECL(_vfs_zfs);
/*
* The SPA supports block sizes up to 16MB. However, very large blocks
* can have an impact on i/o latency (e.g. tying up a spinning disk for
* ~300ms), and also potentially on the memory allocator. Therefore,
* we do not allow the recordsize to be set larger than zfs_max_recordsize
* (default 1MB). Larger blocks can be created by changing this tunable,
* and pools with larger blocks can always be imported and used, regardless
* of this setting.
*/
int zfs_max_recordsize = 1 * 1024 * 1024;
SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
&zfs_max_recordsize, 0,
"Maximum block size. Expect dragons when tuning this.");
#define SWITCH64(x, y) \
{ \
uint64_t __tmp = (x); \
(x) = (y); \
(y) = __tmp; \
}
#define DS_REF_MAX (1ULL << 62)
extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
+static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
+ uint64_t obj, dmu_tx_t *tx);
+static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
+ dmu_tx_t *tx);
+
extern int spa_asize_inflation;
static zil_header_t zero_zil;
/*
* Figure out how much of this delta should be propogated to the dsl_dir
* layer. If there's a refreservation, that space has already been
* partially accounted for in our ancestors.
*/
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
dsl_dataset_phys_t *ds_phys;
uint64_t old_bytes, new_bytes;
if (ds->ds_reserved == 0)
return (delta);
ds_phys = dsl_dataset_phys(ds);
old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
return (new_bytes - old_bytes);
}
void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
int64_t delta;
dprintf_bp(bp, "ds=%p", ds);
ASSERT(dmu_tx_is_syncing(tx));
/* It could have been compressed away to nothing */
if (BP_IS_HOLE(bp))
return;
ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
if (ds == NULL) {
dsl_pool_mos_diduse_space(tx->tx_pool,
used, compressed, uncompressed);
return;
}
ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
mutex_enter(&ds->ds_lock);
delta = parent_delta(ds, used);
dsl_dataset_phys(ds)->ds_referenced_bytes += used;
dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
dsl_dataset_phys(ds)->ds_unique_bytes += used;
if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
B_TRUE;
}
spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
if (f != SPA_FEATURE_NONE)
ds->ds_feature_activation_needed[f] = B_TRUE;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
dsl_dir_transfer_space(ds->ds_dir, used - delta,
DD_USED_REFRSRV, DD_USED_HEAD, NULL);
}
+/*
+ * Called when the specified segment has been remapped, and is thus no
+ * longer referenced in the head dataset. The vdev must be indirect.
+ *
+ * If the segment is referenced by a snapshot, put it on the remap deadlist.
+ * Otherwise, add this segment to the obsolete spacemap.
+ */
+void
+dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
+ uint64_t size, uint64_t birth, dmu_tx_t *tx)
+{
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(birth <= tx->tx_txg);
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+ } else {
+ blkptr_t fakebp;
+ dva_t *dva = &fakebp.blk_dva[0];
+
+ ASSERT(ds != NULL);
+
+ mutex_enter(&ds->ds_remap_deadlist_lock);
+ if (!dsl_dataset_remap_deadlist_exists(ds)) {
+ dsl_dataset_create_remap_deadlist(ds, tx);
+ }
+ mutex_exit(&ds->ds_remap_deadlist_lock);
+
+ BP_ZERO(&fakebp);
+ fakebp.blk_birth = birth;
+ DVA_SET_VDEV(dva, vdev);
+ DVA_SET_OFFSET(dva, offset);
+ DVA_SET_ASIZE(dva, size);
+
+ dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx);
+ }
+}
+
int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
boolean_t async)
{
int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
if (BP_IS_HOLE(bp))
return (0);
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(bp->blk_birth <= tx->tx_txg);
if (ds == NULL) {
dsl_free(tx->tx_pool, tx->tx_txg, bp);
dsl_pool_mos_diduse_space(tx->tx_pool,
-used, -compressed, -uncompressed);
return (used);
}
ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
ASSERT(!ds->ds_is_snapshot);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
int64_t delta;
dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
dsl_free(tx->tx_pool, tx->tx_txg, bp);
mutex_enter(&ds->ds_lock);
ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
!DS_UNIQUE_IS_ACCURATE(ds));
delta = parent_delta(ds, -used);
dsl_dataset_phys(ds)->ds_unique_bytes -= used;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
delta, -compressed, -uncompressed, tx);
dsl_dir_transfer_space(ds->ds_dir, -used - delta,
DD_USED_REFRSRV, DD_USED_HEAD, NULL);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
if (async) {
/*
* We are here as part of zio's write done callback,
* which means we're a zio interrupt thread. We can't
* call dsl_deadlist_insert() now because it may block
* waiting for I/O. Instead, put bp on the deferred
* queue and let dsl_pool_sync() finish the job.
*/
bplist_append(&ds->ds_pending_deadlist, bp);
} else {
dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
}
ASSERT3U(ds->ds_prev->ds_object, ==,
dsl_dataset_phys(ds)->ds_prev_snap_obj);
ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
ds->ds_object && bp->blk_birth >
dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
mutex_enter(&ds->ds_prev->ds_lock);
dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
mutex_exit(&ds->ds_prev->ds_lock);
}
if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
dsl_dir_transfer_space(ds->ds_dir, used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
}
mutex_enter(&ds->ds_lock);
ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
mutex_exit(&ds->ds_lock);
return (used);
}
/*
* We have to release the fsid syncronously or we risk that a subsequent
* mount of the same dataset will fail to unique_insert the fsid. This
* failure would manifest itself as the fsid of this dataset changing
* between mounts which makes NFS clients quite unhappy.
*/
static void
dsl_dataset_evict_sync(void *dbu)
{
dsl_dataset_t *ds = dbu;
ASSERT(ds->ds_owner == NULL);
unique_remove(ds->ds_fsid_guid);
}
static void
dsl_dataset_evict_async(void *dbu)
{
dsl_dataset_t *ds = dbu;
ASSERT(ds->ds_owner == NULL);
ds->ds_dbuf = NULL;
if (ds->ds_objset != NULL)
dmu_objset_evict(ds->ds_objset);
if (ds->ds_prev) {
dsl_dataset_rele(ds->ds_prev, ds);
ds->ds_prev = NULL;
}
bplist_destroy(&ds->ds_pending_deadlist);
- if (ds->ds_deadlist.dl_os != NULL)
+ if (dsl_deadlist_is_open(&ds->ds_deadlist))
dsl_deadlist_close(&ds->ds_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
if (ds->ds_dir)
dsl_dir_async_rele(ds->ds_dir, ds);
ASSERT(!list_link_active(&ds->ds_synced_link));
list_destroy(&ds->ds_prop_cbs);
if (mutex_owned(&ds->ds_lock))
mutex_exit(&ds->ds_lock);
mutex_destroy(&ds->ds_lock);
if (mutex_owned(&ds->ds_opening_lock))
mutex_exit(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_sendstream_lock);
+ mutex_destroy(&ds->ds_remap_deadlist_lock);
refcount_destroy(&ds->ds_longholds);
rrw_destroy(&ds->ds_bp_rwlock);
kmem_free(ds, sizeof (dsl_dataset_t));
}
int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
dsl_dataset_phys_t *headphys;
int err;
dmu_buf_t *headdbuf;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
objset_t *mos = dp->dp_meta_objset;
if (ds->ds_snapname[0])
return (0);
if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
return (0);
err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
FTAG, &headdbuf);
if (err != 0)
return (err);
headphys = headdbuf->db_data;
err = zap_value_search(dp->dp_meta_objset,
headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
dmu_buf_rele(headdbuf, FTAG);
return (err);
}
int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
matchtype_t mt = 0;
int err;
if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
mt = MT_NORMALIZE;
err = zap_lookup_norm(mos, snapobj, name, 8, 1,
value, mt, NULL, 0, NULL);
if (err == ENOTSUP && (mt & MT_NORMALIZE))
err = zap_lookup(mos, snapobj, name, 8, 1, value);
return (err);
}
int
dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
boolean_t adj_cnt)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
matchtype_t mt = 0;
int err;
dsl_dir_snap_cmtime_update(ds->ds_dir);
if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
mt = MT_NORMALIZE;
err = zap_remove_norm(mos, snapobj, name, mt, tx);
if (err == ENOTSUP && (mt & MT_NORMALIZE))
err = zap_remove(mos, snapobj, name, tx);
if (err == 0 && adj_cnt)
dsl_fs_ss_count_adjust(ds->ds_dir, -1,
DD_FIELD_SNAPSHOT_COUNT, tx);
return (err);
}
boolean_t
dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
{
dmu_buf_t *dbuf = ds->ds_dbuf;
boolean_t result = B_FALSE;
if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
ds->ds_object, DMU_BONUS_BLKID, tag)) {
if (ds == dmu_buf_get_user(dbuf))
result = B_TRUE;
else
dmu_buf_rele(dbuf, tag);
}
return (result);
}
int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dsl_dataset_t **dsp)
{
objset_t *mos = dp->dp_meta_objset;
dmu_buf_t *dbuf;
dsl_dataset_t *ds;
int err;
dmu_object_info_t doi;
ASSERT(dsl_pool_config_held(dp));
err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
if (err != 0)
return (err);
/* Make sure dsobj has the correct object type. */
dmu_object_info_from_db(dbuf, &doi);
if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
dmu_buf_rele(dbuf, tag);
return (SET_ERROR(EINVAL));
}
ds = dmu_buf_get_user(dbuf);
if (ds == NULL) {
dsl_dataset_t *winner = NULL;
ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
ds->ds_dbuf = dbuf;
ds->ds_object = dsobj;
ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
+ err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,
+ NULL, ds, &ds->ds_dir);
+ if (err != 0) {
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+
mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_remap_deadlist_lock,
+ NULL, MUTEX_DEFAULT, NULL);
rrw_init(&ds->ds_bp_rwlock, B_FALSE);
refcount_create(&ds->ds_longholds);
bplist_create(&ds->ds_pending_deadlist);
- dsl_deadlist_open(&ds->ds_deadlist,
- mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
offsetof(dmu_sendarg_t, dsa_link));
list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
offsetof(dsl_prop_cb_record_t, cbr_ds_node));
if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
if (!(spa_feature_table[f].fi_flags &
ZFEATURE_FLAG_PER_DATASET))
continue;
err = zap_contains(mos, dsobj,
spa_feature_table[f].fi_guid);
if (err == 0) {
ds->ds_feature_inuse[f] = B_TRUE;
} else {
ASSERT3U(err, ==, ENOENT);
err = 0;
}
}
}
- err = dsl_dir_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
- if (err != 0) {
- mutex_destroy(&ds->ds_lock);
- mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_sendstream_lock);
- refcount_destroy(&ds->ds_longholds);
- bplist_destroy(&ds->ds_pending_deadlist);
- dsl_deadlist_close(&ds->ds_deadlist);
- kmem_free(ds, sizeof (dsl_dataset_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
-
if (!ds->ds_is_snapshot) {
ds->ds_snapname[0] = '\0';
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj,
ds, &ds->ds_prev);
}
if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
int zaperr = zap_lookup(mos, ds->ds_object,
DS_FIELD_BOOKMARK_NAMES,
sizeof (ds->ds_bookmarks), 1,
&ds->ds_bookmarks);
if (zaperr != ENOENT)
VERIFY0(zaperr);
}
} else {
if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
err = dsl_dataset_get_snapname(ds);
if (err == 0 &&
dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
err = zap_count(
ds->ds_dir->dd_pool->dp_meta_objset,
dsl_dataset_phys(ds)->ds_userrefs_obj,
&ds->ds_userrefs);
}
}
if (err == 0 && !ds->ds_is_snapshot) {
err = dsl_prop_get_int_ds(ds,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
&ds->ds_reserved);
if (err == 0) {
err = dsl_prop_get_int_ds(ds,
zfs_prop_to_name(ZFS_PROP_REFQUOTA),
&ds->ds_quota);
}
} else {
ds->ds_reserved = ds->ds_quota = 0;
}
+ dsl_deadlist_open(&ds->ds_deadlist,
+ mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
+ uint64_t remap_deadlist_obj =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ if (remap_deadlist_obj != 0) {
+ dsl_deadlist_open(&ds->ds_remap_deadlist, mos,
+ remap_deadlist_obj);
+ }
+
dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
dsl_dataset_evict_async, &ds->ds_dbuf);
if (err == 0)
winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
if (err != 0 || winner != NULL) {
bplist_destroy(&ds->ds_pending_deadlist);
dsl_deadlist_close(&ds->ds_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
if (ds->ds_prev)
dsl_dataset_rele(ds->ds_prev, ds);
dsl_dir_rele(ds->ds_dir, ds);
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_sendstream_lock);
refcount_destroy(&ds->ds_longholds);
kmem_free(ds, sizeof (dsl_dataset_t));
if (err != 0) {
dmu_buf_rele(dbuf, tag);
return (err);
}
ds = winner;
} else {
ds->ds_fsid_guid =
unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
if (ds->ds_fsid_guid !=
dsl_dataset_phys(ds)->ds_fsid_guid) {
zfs_dbgmsg("ds_fsid_guid changed from "
"%llx to %llx for pool %s dataset id %llu",
(long long)
dsl_dataset_phys(ds)->ds_fsid_guid,
(long long)ds->ds_fsid_guid,
spa_name(dp->dp_spa),
dsobj);
}
}
}
ASSERT3P(ds->ds_dbuf, ==, dbuf);
ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
*dsp = ds;
return (0);
}
int
dsl_dataset_hold(dsl_pool_t *dp, const char *name,
void *tag, dsl_dataset_t **dsp)
{
dsl_dir_t *dd;
const char *snapname;
uint64_t obj;
int err = 0;
dsl_dataset_t *ds;
err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
if (err != 0)
return (err);
ASSERT(dsl_pool_config_held(dp));
obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
if (obj != 0)
err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
else
err = SET_ERROR(ENOENT);
/* we may be looking for a snapshot */
if (err == 0 && snapname != NULL) {
dsl_dataset_t *snap_ds;
if (*snapname++ != '@') {
dsl_dataset_rele(ds, tag);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(ENOENT));
}
dprintf("looking for snapshot '%s'\n", snapname);
err = dsl_dataset_snap_lookup(ds, snapname, &obj);
if (err == 0)
err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
dsl_dataset_rele(ds, tag);
if (err == 0) {
mutex_enter(&snap_ds->ds_lock);
if (snap_ds->ds_snapname[0] == 0)
(void) strlcpy(snap_ds->ds_snapname, snapname,
sizeof (snap_ds->ds_snapname));
mutex_exit(&snap_ds->ds_lock);
ds = snap_ds;
}
}
if (err == 0)
*dsp = ds;
dsl_dir_rele(dd, FTAG);
return (err);
}
int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
void *tag, dsl_dataset_t **dsp)
{
int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
if (err != 0)
return (err);
if (!dsl_dataset_tryown(*dsp, tag)) {
dsl_dataset_rele(*dsp, tag);
*dsp = NULL;
return (SET_ERROR(EBUSY));
}
return (0);
}
int
dsl_dataset_own(dsl_pool_t *dp, const char *name,
void *tag, dsl_dataset_t **dsp)
{
int err = dsl_dataset_hold(dp, name, tag, dsp);
if (err != 0)
return (err);
if (!dsl_dataset_tryown(*dsp, tag)) {
dsl_dataset_rele(*dsp, tag);
return (SET_ERROR(EBUSY));
}
return (0);
}
/*
* See the comment above dsl_pool_hold() for details. In summary, a long
* hold is used to prevent destruction of a dataset while the pool hold
* is dropped, allowing other concurrent operations (e.g. spa_sync()).
*
* The dataset and pool must be held when this function is called. After it
* is called, the pool hold may be released while the dataset is still held
* and accessed.
*/
void
dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
{
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
(void) refcount_add(&ds->ds_longholds, tag);
}
void
dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
{
(void) refcount_remove(&ds->ds_longholds, tag);
}
/* Return B_TRUE if there are any long holds on this dataset. */
boolean_t
dsl_dataset_long_held(dsl_dataset_t *ds)
{
return (!refcount_is_zero(&ds->ds_longholds));
}
void
dsl_dataset_name(dsl_dataset_t *ds, char *name)
{
if (ds == NULL) {
(void) strcpy(name, "mos");
} else {
dsl_dir_name(ds->ds_dir, name);
VERIFY0(dsl_dataset_get_snapname(ds));
if (ds->ds_snapname[0]) {
VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
<, ZFS_MAX_DATASET_NAME_LEN);
/*
* We use a "recursive" mutex so that we
* can call dprintf_ds() with ds_lock held.
*/
if (!MUTEX_HELD(&ds->ds_lock)) {
mutex_enter(&ds->ds_lock);
VERIFY3U(strlcat(name, ds->ds_snapname,
ZFS_MAX_DATASET_NAME_LEN), <,
ZFS_MAX_DATASET_NAME_LEN);
mutex_exit(&ds->ds_lock);
} else {
VERIFY3U(strlcat(name, ds->ds_snapname,
ZFS_MAX_DATASET_NAME_LEN), <,
ZFS_MAX_DATASET_NAME_LEN);
}
}
}
}
int
dsl_dataset_namelen(dsl_dataset_t *ds)
{
VERIFY0(dsl_dataset_get_snapname(ds));
mutex_enter(&ds->ds_lock);
int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname);
mutex_exit(&ds->ds_lock);
return (len);
}
void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
dmu_buf_rele(ds->ds_dbuf, tag);
}
void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
ASSERT3P(ds->ds_owner, ==, tag);
ASSERT(ds->ds_dbuf != NULL);
mutex_enter(&ds->ds_lock);
ds->ds_owner = NULL;
mutex_exit(&ds->ds_lock);
dsl_dataset_long_rele(ds, tag);
dsl_dataset_rele(ds, tag);
}
boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
{
boolean_t gotit = FALSE;
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
mutex_enter(&ds->ds_lock);
if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
ds->ds_owner = tag;
dsl_dataset_long_hold(ds, tag);
gotit = TRUE;
}
mutex_exit(&ds->ds_lock);
return (gotit);
}
boolean_t
dsl_dataset_has_owner(dsl_dataset_t *ds)
{
boolean_t rv;
mutex_enter(&ds->ds_lock);
rv = (ds->ds_owner != NULL);
mutex_exit(&ds->ds_lock);
return (rv);
}
static void
dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
uint64_t zero = 0;
VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
spa_feature_incr(spa, f, tx);
dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
sizeof (zero), 1, &zero, tx));
}
void
dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
spa_feature_decr(spa, f, tx);
}
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx)
{
dsl_pool_t *dp = dd->dd_pool;
dmu_buf_t *dbuf;
dsl_dataset_phys_t *dsphys;
uint64_t dsobj;
objset_t *mos = dp->dp_meta_objset;
if (origin == NULL)
origin = dp->dp_origin_snap;
ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
bzero(dsphys, sizeof (dsl_dataset_phys_t));
dsphys->ds_dir_obj = dd->dd_object;
dsphys->ds_flags = flags;
dsphys->ds_fsid_guid = unique_create();
do {
(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
sizeof (dsphys->ds_guid));
} while (dsphys->ds_guid == 0);
dsphys->ds_snapnames_zapobj =
zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
DMU_OT_NONE, 0, tx);
dsphys->ds_creation_time = gethrestime_sec();
dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
if (origin == NULL) {
dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
} else {
dsl_dataset_t *ohds; /* head of the origin snapshot */
dsphys->ds_prev_snap_obj = origin->ds_object;
dsphys->ds_prev_snap_txg =
dsl_dataset_phys(origin)->ds_creation_txg;
dsphys->ds_referenced_bytes =
dsl_dataset_phys(origin)->ds_referenced_bytes;
dsphys->ds_compressed_bytes =
dsl_dataset_phys(origin)->ds_compressed_bytes;
dsphys->ds_uncompressed_bytes =
dsl_dataset_phys(origin)->ds_uncompressed_bytes;
rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
rrw_exit(&origin->ds_bp_rwlock, FTAG);
/*
* Inherit flags that describe the dataset's contents
* (INCONSISTENT) or properties (Case Insensitive).
*/
dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
if (origin->ds_feature_inuse[f])
dsl_dataset_activate_feature(dsobj, f, tx);
}
dmu_buf_will_dirty(origin->ds_dbuf, tx);
dsl_dataset_phys(origin)->ds_num_children++;
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
FTAG, &ohds));
dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
dsl_dataset_rele(ohds, FTAG);
if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
dsl_dataset_phys(origin)->ds_next_clones_obj =
zap_create(mos,
DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
}
VERIFY0(zap_add_int(mos,
dsl_dataset_phys(origin)->ds_next_clones_obj,
dsobj, tx));
}
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
dsl_dir_phys(origin->ds_dir)->dd_clones =
zap_create(mos,
DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
}
VERIFY0(zap_add_int(mos,
dsl_dir_phys(origin->ds_dir)->dd_clones,
dsobj, tx));
}
}
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
dmu_buf_rele(dbuf, FTAG);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
return (dsobj);
}
static void
dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
{
objset_t *os;
VERIFY0(dmu_objset_from_ds(ds, &os));
if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
dsl_pool_t *dp = ds->ds_dir->dd_pool;
zio_t *zio;
bzero(&os->os_zil_header, sizeof (os->os_zil_header));
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
dsl_dataset_sync(ds, zio, tx);
VERIFY0(zio_wait(zio));
/* dsl_dataset_sync_done will drop this reference. */
dmu_buf_add_ref(ds->ds_dbuf, ds);
dsl_dataset_sync_done(ds, tx);
}
}
uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
dsl_pool_t *dp = pdd->dd_pool;
uint64_t dsobj, ddobj;
dsl_dir_t *dd;
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(lastname[0] != '@');
ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
dsobj = dsl_dataset_create_sync_dd(dd, origin,
flags & ~DS_CREATE_FLAG_NODIRTY, tx);
dsl_deleg_set_create_perms(dd, tx, cr);
/*
* Since we're creating a new node we know it's a leaf, so we can
* initialize the counts if the limit feature is active.
*/
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
uint64_t cnt = 0;
objset_t *os = dd->dd_pool->dp_meta_objset;
dsl_dir_zapify(dd, tx);
VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
sizeof (cnt), 1, &cnt, tx));
VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
sizeof (cnt), 1, &cnt, tx));
}
dsl_dir_rele(dd, FTAG);
/*
* If we are creating a clone, make sure we zero out any stale
* data from the origin snapshots zil header.
*/
if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
dsl_dataset_t *ds;
VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
dsl_dataset_zero_zil(ds, tx);
dsl_dataset_rele(ds, FTAG);
}
return (dsobj);
}
#ifdef __FreeBSD__
/* FreeBSD ioctl compat begin */
struct destroyarg {
nvlist_t *nvl;
const char *snapname;
};
static int
dsl_check_snap_cb(const char *name, void *arg)
{
struct destroyarg *da = arg;
dsl_dataset_t *ds;
char *dsname;
dsname = kmem_asprintf("%s@%s", name, da->snapname);
fnvlist_add_boolean(da->nvl, dsname);
kmem_free(dsname, strlen(dsname) + 1);
return (0);
}
int
dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
nvlist_t *snaps)
{
struct destroyarg *da;
int err;
da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
da->nvl = snaps;
da->snapname = snapname;
err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
DS_FIND_CHILDREN);
kmem_free(da, sizeof (struct destroyarg));
return (err);
}
/* FreeBSD ioctl compat end */
#endif /* __FreeBSD__ */
/*
* The unique space in the head dataset can be calculated by subtracting
* the space used in the most recent snapshot, that is still being used
* in this file system, from the space currently in use. To figure out
* the space in the most recent snapshot still in use, we need to take
* the total space used in the snapshot and subtract out the space that
* has been freed up since the snapshot was taken.
*/
void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
uint64_t mrs_used;
uint64_t dlused, dlcomp, dluncomp;
ASSERT(!ds->ds_is_snapshot);
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
else
mrs_used = 0;
dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
ASSERT3U(dlused, <=, mrs_used);
dsl_dataset_phys(ds)->ds_unique_bytes =
dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
SPA_VERSION_UNIQUE_ACCURATE)
dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
void
dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
dmu_tx_t *tx)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
uint64_t count;
int err;
ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
obj, tx);
/*
* The err should not be ENOENT, but a bug in a previous version
* of the code could cause upgrade_clones_cb() to not set
* ds_next_snap_obj when it should, leading to a missing entry.
* If we knew that the pool was created after
* SPA_VERSION_NEXT_CLONES, we could assert that it isn't
* ENOENT. However, at least we can check that we don't have
* too many entries in the next_clones_obj even after failing to
* remove this one.
*/
if (err != ENOENT)
VERIFY0(err);
ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
&count));
ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
}
blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
return (&dsl_dataset_phys(ds)->ds_bp);
}
spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
return (ds->ds_dir->dd_pool->dp_spa);
}
void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
dsl_pool_t *dp;
if (ds == NULL) /* this is the meta-objset */
return;
ASSERT(ds->ds_objset != NULL);
if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
panic("dirtying snapshot!");
/* Must not dirty a dataset in the same txg where it got snapshotted. */
ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
dp = ds->ds_dir->dd_pool;
if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
/* up the hold count until we can be written out */
dmu_buf_add_ref(ds->ds_dbuf, ds);
}
}
boolean_t
dsl_dataset_is_dirty(dsl_dataset_t *ds)
{
for (int t = 0; t < TXG_SIZE; t++) {
if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
ds, t))
return (B_TRUE);
}
return (B_FALSE);
}
static int
dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
{
uint64_t asize;
if (!dmu_tx_is_syncing(tx))
return (0);
/*
* If there's an fs-only reservation, any blocks that might become
* owned by the snapshot dataset must be accommodated by space
* outside of the reservation.
*/
ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
return (SET_ERROR(ENOSPC));
/*
* Propagate any reserved space for this snapshot to other
* snapshot checks in this sync group.
*/
if (asize > 0)
dsl_dir_willuse_space(ds->ds_dir, asize, tx);
return (0);
}
int
dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
{
int error;
uint64_t value;
ds->ds_trysnap_txg = tx->tx_txg;
if (!dmu_tx_is_syncing(tx))
return (0);
/*
* We don't allow multiple snapshots of the same txg. If there
* is already one, try again.
*/
if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
return (SET_ERROR(EAGAIN));
/*
* Check for conflicting snapshot name.
*/
error = dsl_dataset_snap_lookup(ds, snapname, &value);
if (error == 0)
return (SET_ERROR(EEXIST));
if (error != ENOENT)
return (error);
/*
* We don't allow taking snapshots of inconsistent datasets, such as
* those into which we are currently receiving. However, if we are
* creating this snapshot as part of a receive, this check will be
* executed atomically with respect to the completion of the receive
* itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
* case we ignore this, knowing it will be fixed up for us shortly in
* dmu_recv_end_sync().
*/
if (!recv && DS_IS_INCONSISTENT(ds))
return (SET_ERROR(EBUSY));
/*
* Skip the check for temporary snapshots or if we have already checked
* the counts in dsl_dataset_snapshot_check. This means we really only
* check the count here when we're receiving a stream.
*/
if (cnt != 0 && cr != NULL) {
error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
if (error != 0)
return (error);
}
error = dsl_dataset_snapshot_reserve_space(ds, tx);
if (error != 0)
return (error);
return (0);
}
int
dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
{
dsl_dataset_snapshot_arg_t *ddsa = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
nvpair_t *pair;
int rv = 0;
/*
* Pre-compute how many total new snapshots will be created for each
* level in the tree and below. This is needed for validating the
* snapshot limit when either taking a recursive snapshot or when
* taking multiple snapshots.
*
* The problem is that the counts are not actually adjusted when
* we are checking, only when we finally sync. For a single snapshot,
* this is easy, the count will increase by 1 at each node up the tree,
* but its more complicated for the recursive/multiple snapshot case.
*
* The dsl_fs_ss_limit_check function does recursively check the count
* at each level up the tree but since it is validating each snapshot
* independently we need to be sure that we are validating the complete
* count for the entire set of snapshots. We do this by rolling up the
* counts for each component of the name into an nvlist and then
* checking each of those cases with the aggregated count.
*
* This approach properly handles not only the recursive snapshot
* case (where we get all of those on the ddsa_snaps list) but also
* the sibling case (e.g. snapshot a/b and a/c so that we will also
* validate the limit on 'a' using a count of 2).
*
* We validate the snapshot names in the third loop and only report
* name errors once.
*/
if (dmu_tx_is_syncing(tx)) {
nvlist_t *cnt_track = NULL;
cnt_track = fnvlist_alloc();
/* Rollup aggregated counts into the cnt_track list */
for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
pair != NULL;
pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
char *pdelim;
uint64_t val;
char nm[MAXPATHLEN];
(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
pdelim = strchr(nm, '@');
if (pdelim == NULL)
continue;
*pdelim = '\0';
do {
if (nvlist_lookup_uint64(cnt_track, nm,
&val) == 0) {
/* update existing entry */
fnvlist_add_uint64(cnt_track, nm,
val + 1);
} else {
/* add to list */
fnvlist_add_uint64(cnt_track, nm, 1);
}
pdelim = strrchr(nm, '/');
if (pdelim != NULL)
*pdelim = '\0';
} while (pdelim != NULL);
}
/* Check aggregated counts at each level */
for (pair = nvlist_next_nvpair(cnt_track, NULL);
pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
int error = 0;
char *name;
uint64_t cnt = 0;
dsl_dataset_t *ds;
name = nvpair_name(pair);
cnt = fnvpair_value_uint64(pair);
ASSERT(cnt > 0);
error = dsl_dataset_hold(dp, name, FTAG, &ds);
if (error == 0) {
error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
ZFS_PROP_SNAPSHOT_LIMIT, NULL,
ddsa->ddsa_cr);
dsl_dataset_rele(ds, FTAG);
}
if (error != 0) {
if (ddsa->ddsa_errors != NULL)
fnvlist_add_int32(ddsa->ddsa_errors,
name, error);
rv = error;
/* only report one error for this check */
break;
}
}
nvlist_free(cnt_track);
}
for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
int error = 0;
dsl_dataset_t *ds;
char *name, *atp;
char dsname[ZFS_MAX_DATASET_NAME_LEN];
name = nvpair_name(pair);
if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
error = SET_ERROR(ENAMETOOLONG);
if (error == 0) {
atp = strchr(name, '@');
if (atp == NULL)
error = SET_ERROR(EINVAL);
if (error == 0)
(void) strlcpy(dsname, name, atp - name + 1);
}
if (error == 0)
error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
if (error == 0) {
/* passing 0/NULL skips dsl_fs_ss_limit_check */
error = dsl_dataset_snapshot_check_impl(ds,
atp + 1, tx, B_FALSE, 0, NULL);
dsl_dataset_rele(ds, FTAG);
}
if (error != 0) {
if (ddsa->ddsa_errors != NULL) {
fnvlist_add_int32(ddsa->ddsa_errors,
name, error);
}
rv = error;
}
}
return (rv);
}
void
dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dmu_tx_t *tx)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dmu_buf_t *dbuf;
dsl_dataset_phys_t *dsphys;
uint64_t dsobj, crtxg;
objset_t *mos = dp->dp_meta_objset;
objset_t *os;
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
/*
* If we are on an old pool, the zil must not be active, in which
* case it will be zeroed. Usually zil_suspend() accomplishes this.
*/
ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
dmu_objset_from_ds(ds, &os) != 0 ||
bcmp(&os->os_phys->os_zil_header, &zero_zil,
sizeof (zero_zil)) == 0);
/* Should not snapshot a dirty dataset. */
ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
ds, tx->tx_txg));
dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
/*
* The origin's ds_creation_txg has to be < TXG_INITIAL
*/
if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
crtxg = 1;
else
crtxg = tx->tx_txg;
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
bzero(dsphys, sizeof (dsl_dataset_phys_t));
dsphys->ds_dir_obj = ds->ds_dir->dd_object;
dsphys->ds_fsid_guid = unique_create();
do {
(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
sizeof (dsphys->ds_guid));
} while (dsphys->ds_guid == 0);
dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
dsphys->ds_next_snap_obj = ds->ds_object;
dsphys->ds_num_children = 1;
dsphys->ds_creation_time = gethrestime_sec();
dsphys->ds_creation_txg = crtxg;
dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
dsphys->ds_uncompressed_bytes =
dsl_dataset_phys(ds)->ds_uncompressed_bytes;
dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
rrw_exit(&ds->ds_bp_rwlock, FTAG);
dmu_buf_rele(dbuf, FTAG);
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
if (ds->ds_feature_inuse[f])
dsl_dataset_activate_feature(dsobj, f, tx);
}
ASSERT3U(ds->ds_prev != 0, ==,
dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
if (ds->ds_prev) {
uint64_t next_clones_obj =
dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
ds->ds_object ||
dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
ds->ds_object) {
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
} else if (next_clones_obj != 0) {
dsl_dataset_remove_from_next_clones(ds->ds_prev,
dsphys->ds_next_snap_obj, tx);
VERIFY0(zap_add_int(mos,
next_clones_obj, dsobj, tx));
}
}
/*
* If we have a reference-reservation on this dataset, we will
* need to increase the amount of refreservation being charged
* since our unique space is going to zero.
*/
if (ds->ds_reserved) {
int64_t delta;
ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
ds->ds_reserved);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
delta, 0, 0, tx);
}
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_deadlist_obj =
dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
dsl_deadlist_close(&ds->ds_deadlist);
dsl_deadlist_open(&ds->ds_deadlist, mos,
dsl_dataset_phys(ds)->ds_deadlist_obj);
dsl_deadlist_add_key(&ds->ds_deadlist,
dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ uint64_t remap_deadlist_obj =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ /*
+ * Move the remap_deadlist to the snapshot. The head
+ * will create a new remap deadlist on demand, from
+ * dsl_dataset_block_remapped().
+ */
+ dsl_dataset_unset_remap_deadlist_object(ds, tx);
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,
+ sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));
+ }
+
ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
dsl_dataset_phys(ds)->ds_unique_bytes = 0;
+
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
snapname, 8, 1, &dsobj, tx));
if (ds->ds_prev)
dsl_dataset_rele(ds->ds_prev, ds);
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
dsl_scan_ds_snapshotted(ds, tx);
dsl_dir_snap_cmtime_update(ds->ds_dir);
spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
}
void
dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
{
dsl_dataset_snapshot_arg_t *ddsa = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
nvpair_t *pair;
for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
dsl_dataset_t *ds;
char *name, *atp;
char dsname[ZFS_MAX_DATASET_NAME_LEN];
name = nvpair_name(pair);
atp = strchr(name, '@');
(void) strlcpy(dsname, name, atp - name + 1);
VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
if (ddsa->ddsa_props != NULL) {
dsl_props_set_sync_impl(ds->ds_prev,
ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
}
dsl_dataset_rele(ds, FTAG);
}
}
/*
* The snapshots must all be in the same pool.
* All-or-nothing: if there are any failures, nothing will be modified.
*/
int
dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
{
dsl_dataset_snapshot_arg_t ddsa;
nvpair_t *pair;
boolean_t needsuspend;
int error;
spa_t *spa;
char *firstname;
nvlist_t *suspended = NULL;
pair = nvlist_next_nvpair(snaps, NULL);
if (pair == NULL)
return (0);
firstname = nvpair_name(pair);
error = spa_open(firstname, &spa, FTAG);
if (error != 0)
return (error);
needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
spa_close(spa, FTAG);
if (needsuspend) {
suspended = fnvlist_alloc();
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
char fsname[ZFS_MAX_DATASET_NAME_LEN];
char *snapname = nvpair_name(pair);
char *atp;
void *cookie;
atp = strchr(snapname, '@');
if (atp == NULL) {
error = SET_ERROR(EINVAL);
break;
}
(void) strlcpy(fsname, snapname, atp - snapname + 1);
error = zil_suspend(fsname, &cookie);
if (error != 0)
break;
fnvlist_add_uint64(suspended, fsname,
(uintptr_t)cookie);
}
}
ddsa.ddsa_snaps = snaps;
ddsa.ddsa_props = props;
ddsa.ddsa_errors = errors;
ddsa.ddsa_cr = CRED();
if (error == 0) {
error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
dsl_dataset_snapshot_sync, &ddsa,
fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
}
if (suspended != NULL) {
for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
pair = nvlist_next_nvpair(suspended, pair)) {
zil_resume((void *)(uintptr_t)
fnvpair_value_uint64(pair));
}
fnvlist_free(suspended);
}
#ifdef __FreeBSD__
#ifdef _KERNEL
if (error == 0) {
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
char *snapname = nvpair_name(pair);
zvol_create_minors(snapname);
}
}
#endif
#endif
return (error);
}
typedef struct dsl_dataset_snapshot_tmp_arg {
const char *ddsta_fsname;
const char *ddsta_snapname;
minor_t ddsta_cleanup_minor;
const char *ddsta_htag;
} dsl_dataset_snapshot_tmp_arg_t;
static int
dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
{
dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
int error;
error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
if (error != 0)
return (error);
/* NULL cred means no limit check for tmp snapshot */
error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
tx, B_FALSE, 0, NULL);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
}
if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENOTSUP));
}
error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
B_TRUE, tx);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
}
dsl_dataset_rele(ds, FTAG);
return (0);
}
static void
dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
{
dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
dsl_dataset_rele(ds, FTAG);
}
int
dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
minor_t cleanup_minor, const char *htag)
{
dsl_dataset_snapshot_tmp_arg_t ddsta;
int error;
spa_t *spa;
boolean_t needsuspend;
void *cookie;
ddsta.ddsta_fsname = fsname;
ddsta.ddsta_snapname = snapname;
ddsta.ddsta_cleanup_minor = cleanup_minor;
ddsta.ddsta_htag = htag;
error = spa_open(fsname, &spa, FTAG);
if (error != 0)
return (error);
needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
spa_close(spa, FTAG);
if (needsuspend) {
error = zil_suspend(fsname, &cookie);
if (error != 0)
return (error);
}
error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
if (needsuspend)
zil_resume(cookie);
return (error);
}
void
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(ds->ds_objset != NULL);
ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
/*
* in case we had to change ds_fsid_guid when we opened it,
* sync it out now.
*/
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
&ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
&ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
&ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
}
dmu_objset_sync(ds->ds_objset, zio, tx);
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
if (ds->ds_feature_activation_needed[f]) {
if (ds->ds_feature_inuse[f])
continue;
dsl_dataset_activate_feature(ds->ds_object, f, tx);
ds->ds_feature_inuse[f] = B_TRUE;
}
}
}
static int
deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
dsl_deadlist_t *dl = arg;
dsl_deadlist_insert(dl, bp, tx);
return (0);
}
void
dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
{
objset_t *os = ds->ds_objset;
bplist_iterate(&ds->ds_pending_deadlist,
deadlist_enqueue_cb, &ds->ds_deadlist, tx);
if (os->os_synced_dnodes != NULL) {
multilist_destroy(os->os_synced_dnodes);
os->os_synced_dnodes = NULL;
}
ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
dmu_buf_rele(ds->ds_dbuf, ds);
}
int
get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
{
uint64_t count = 0;
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
zap_cursor_t zc;
zap_attribute_t za;
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
/*
* There may be missing entries in ds_next_clones_obj
* due to a bug in a previous version of the code.
* Only trust it if it has the right number of entries.
*/
if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
&count));
}
if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
return (ENOENT);
}
for (zap_cursor_init(&zc, mos,
dsl_dataset_phys(ds)->ds_next_clones_obj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
dsl_dataset_t *clone;
char buf[ZFS_MAX_DATASET_NAME_LEN];
VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
za.za_first_integer, FTAG, &clone));
dsl_dir_name(clone->ds_dir, buf);
fnvlist_add_boolean(val, buf);
dsl_dataset_rele(clone, FTAG);
}
zap_cursor_fini(&zc);
return (0);
}
void
get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
{
nvlist_t *propval = fnvlist_alloc();
nvlist_t *val;
/*
* We use nvlist_alloc() instead of fnvlist_alloc() because the
* latter would allocate the list with NV_UNIQUE_NAME flag.
* As a result, every time a clone name is appended to the list
* it would be (linearly) searched for for a duplicate name.
* We already know that all clone names must be unique and we
* want avoid the quadratic complexity of double-checking that
* because we can have a large number of clones.
*/
VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));
if (get_clones_stat_impl(ds, val) == 0) {
fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
propval);
}
nvlist_free(val);
nvlist_free(propval);
}
/*
* Returns a string that represents the receive resume stats token. It should
* be freed with strfree().
*/
char *
get_receive_resume_stats_impl(dsl_dataset_t *ds)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
if (dsl_dataset_has_resume_receive_state(ds)) {
char *str;
void *packed;
uint8_t *compressed;
uint64_t val;
nvlist_t *token_nv = fnvlist_alloc();
size_t packed_size, compressed_size;
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "fromguid", val);
}
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "object", val);
}
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "offset", val);
}
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "bytes", val);
}
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "toguid", val);
}
char buf[256];
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
fnvlist_add_string(token_nv, "toname", buf);
}
if (zap_contains(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_LARGEBLOCK) == 0) {
fnvlist_add_boolean(token_nv, "largeblockok");
}
if (zap_contains(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_EMBEDOK) == 0) {
fnvlist_add_boolean(token_nv, "embedok");
}
if (zap_contains(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_COMPRESSOK) == 0) {
fnvlist_add_boolean(token_nv, "compressok");
}
packed = fnvlist_pack(token_nv, &packed_size);
fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP);
compressed_size = gzip_compress(packed, compressed,
packed_size, packed_size, 6);
zio_cksum_t cksum;
fletcher_4_native(compressed, compressed_size, NULL, &cksum);
str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
for (int i = 0; i < compressed_size; i++) {
(void) sprintf(str + i * 2, "%02x", compressed[i]);
}
str[compressed_size * 2] = '\0';
char *propval = kmem_asprintf("%u-%llx-%llx-%s",
ZFS_SEND_RESUME_TOKEN_VERSION,
(longlong_t)cksum.zc_word[0],
(longlong_t)packed_size, str);
kmem_free(packed, packed_size);
kmem_free(str, compressed_size * 2 + 1);
kmem_free(compressed, packed_size);
return (propval);
}
return (spa_strdup(""));
}
/*
* Returns a string that represents the receive resume stats token of the
* dataset's child. It should be freed with strfree().
*/
char *
get_child_receive_stats(dsl_dataset_t *ds)
{
char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
dsl_dataset_t *recv_ds;
dsl_dataset_name(ds, recvname);
if (strlcat(recvname, "/", sizeof (recvname)) <
sizeof (recvname) &&
strlcat(recvname, recv_clone_name, sizeof (recvname)) <
sizeof (recvname) &&
dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG,
&recv_ds) == 0) {
char *propval = get_receive_resume_stats_impl(recv_ds);
dsl_dataset_rele(recv_ds, FTAG);
return (propval);
}
return (spa_strdup(""));
}
static void
get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
char *propval = get_receive_resume_stats_impl(ds);
if (strcmp(propval, "") != 0) {
dsl_prop_nvlist_add_string(nv,
ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
} else {
char *childval = get_child_receive_stats(ds);
if (strcmp(childval, "") != 0) {
dsl_prop_nvlist_add_string(nv,
ZFS_PROP_RECEIVE_RESUME_TOKEN, childval);
}
strfree(childval);
}
strfree(propval);
}
uint64_t
dsl_get_refratio(dsl_dataset_t *ds)
{
uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
(dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
dsl_dataset_phys(ds)->ds_compressed_bytes);
return (ratio);
}
uint64_t
dsl_get_logicalreferenced(dsl_dataset_t *ds)
{
return (dsl_dataset_phys(ds)->ds_uncompressed_bytes);
}
uint64_t
dsl_get_compressratio(dsl_dataset_t *ds)
{
if (ds->ds_is_snapshot) {
return (dsl_get_refratio(ds));
} else {
dsl_dir_t *dd = ds->ds_dir;
mutex_enter(&dd->dd_lock);
uint64_t val = dsl_dir_get_compressratio(dd);
mutex_exit(&dd->dd_lock);
return (val);
}
}
uint64_t
dsl_get_used(dsl_dataset_t *ds)
{
if (ds->ds_is_snapshot) {
return (dsl_dataset_phys(ds)->ds_unique_bytes);
} else {
dsl_dir_t *dd = ds->ds_dir;
mutex_enter(&dd->dd_lock);
uint64_t val = dsl_dir_get_used(dd);
mutex_exit(&dd->dd_lock);
return (val);
}
}
uint64_t
dsl_get_creation(dsl_dataset_t *ds)
{
return (dsl_dataset_phys(ds)->ds_creation_time);
}
uint64_t
dsl_get_creationtxg(dsl_dataset_t *ds)
{
return (dsl_dataset_phys(ds)->ds_creation_txg);
}
uint64_t
dsl_get_refquota(dsl_dataset_t *ds)
{
return (ds->ds_quota);
}
uint64_t
dsl_get_refreservation(dsl_dataset_t *ds)
{
return (ds->ds_reserved);
}
uint64_t
dsl_get_guid(dsl_dataset_t *ds)
{
return (dsl_dataset_phys(ds)->ds_guid);
}
uint64_t
dsl_get_unique(dsl_dataset_t *ds)
{
return (dsl_dataset_phys(ds)->ds_unique_bytes);
}
uint64_t
dsl_get_objsetid(dsl_dataset_t *ds)
{
return (ds->ds_object);
}
uint64_t
dsl_get_userrefs(dsl_dataset_t *ds)
{
return (ds->ds_userrefs);
}
uint64_t
dsl_get_defer_destroy(dsl_dataset_t *ds)
{
return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
}
uint64_t
dsl_get_referenced(dsl_dataset_t *ds)
{
return (dsl_dataset_phys(ds)->ds_referenced_bytes);
}
uint64_t
dsl_get_numclones(dsl_dataset_t *ds)
{
ASSERT(ds->ds_is_snapshot);
return (dsl_dataset_phys(ds)->ds_num_children - 1);
}
uint64_t
dsl_get_inconsistent(dsl_dataset_t *ds)
{
return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?
1 : 0);
}
uint64_t
dsl_get_available(dsl_dataset_t *ds)
{
uint64_t refdbytes = dsl_get_referenced(ds);
uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,
NULL, 0, TRUE);
if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
availbytes +=
ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
}
if (ds->ds_quota != 0) {
/*
* Adjust available bytes according to refquota
*/
if (refdbytes < ds->ds_quota) {
availbytes = MIN(availbytes,
ds->ds_quota - refdbytes);
} else {
availbytes = 0;
}
}
return (availbytes);
}
int
dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dsl_dataset_t *prev;
int err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
if (err == 0) {
uint64_t comp, uncomp;
err = dsl_dataset_space_written(prev, ds, written,
&comp, &uncomp);
dsl_dataset_rele(prev, FTAG);
}
return (err);
}
/*
* 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
*/
int
dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
dsl_dataset_name(ds->ds_prev, snap);
return (0);
} else {
return (ENOENT);
}
}
/*
* Returns the mountpoint property and source for the given dataset in the value
* and source buffers. The value buffer must be at least as large as MAXPATHLEN
* and the source buffer as least as large a ZFS_MAX_DATASET_NAME_LEN.
* Returns 0 on success and an error on failure.
*/
int
dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
char *source)
{
int error;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
/* Retrieve the mountpoint value stored in the zap opbject */
error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
ZAP_MAXVALUELEN, value, source);
if (error != 0) {
return (error);
}
/* Process the dsname and source to find the full mountpoint string */
if (value[0] == '/') {
char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
char *root = buf;
const char *relpath;
/*
* If we inherit the mountpoint, even from a dataset
* with a received value, the source will be the path of
* the dataset we inherit from. If source is
* ZPROP_SOURCE_VAL_RECVD, the received value is not
* inherited.
*/
if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
relpath = "";
} else {
ASSERT0(strncmp(dsname, source, strlen(source)));
relpath = dsname + strlen(source);
if (relpath[0] == '/')
relpath++;
}
spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);
/*
* Special case an alternate root of '/'. This will
* avoid having multiple leading slashes in the
* mountpoint path.
*/
if (strcmp(root, "/") == 0)
root++;
/*
* If the mountpoint is '/' then skip over this
* if we are obtaining either an alternate root or
* an inherited mountpoint.
*/
char *mnt = value;
if (value[1] == '\0' && (root[0] != '\0' ||
relpath[0] != '\0'))
mnt = value + 1;
if (relpath[0] == '\0') {
(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
root, mnt);
} else {
(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
root, mnt, relpath[0] == '@' ? "" : "/",
relpath);
}
kmem_free(buf, ZAP_MAXVALUELEN);
} else {
/* 'legacy' or 'none' */
(void) snprintf(value, ZAP_MAXVALUELEN, "%s", value);
}
return (0);
}
void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
ASSERT(dsl_pool_config_held(dp));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
dsl_get_refratio(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
dsl_get_logicalreferenced(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
dsl_get_compressratio(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
dsl_get_used(ds));
if (ds->ds_is_snapshot) {
get_clones_stat(ds, nv);
} else {
char buf[ZFS_MAX_DATASET_NAME_LEN];
if (dsl_get_prev_snap(ds, buf) == 0)
dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
buf);
dsl_dir_stats(ds->ds_dir, nv);
}
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
dsl_get_available(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
dsl_get_referenced(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
dsl_get_creation(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
dsl_get_creationtxg(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
dsl_get_refquota(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
dsl_get_refreservation(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
dsl_get_guid(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
dsl_get_unique(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
dsl_get_objsetid(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
dsl_get_userrefs(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
dsl_get_defer_destroy(ds));
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
uint64_t written;
if (dsl_get_written(ds, &written) == 0) {
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
written);
}
}
if (!dsl_dataset_is_snapshot(ds)) {
/*
* A failed "newfs" (e.g. full) resumable receive leaves
* the stats set on this dataset. Check here for the prop.
*/
get_receive_resume_stats(ds, nv);
/*
* A failed incremental resumable receive leaves the
* stats set on our child named "%recv". Check the child
* for the prop.
*/
/* 6 extra bytes for /%recv */
char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
dsl_dataset_t *recv_ds;
dsl_dataset_name(ds, recvname);
if (strlcat(recvname, "/", sizeof (recvname)) <
sizeof (recvname) &&
strlcat(recvname, recv_clone_name, sizeof (recvname)) <
sizeof (recvname) &&
dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
get_receive_resume_stats(recv_ds, nv);
dsl_dataset_rele(recv_ds, FTAG);
}
}
}
void
dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
ASSERT(dsl_pool_config_held(dp));
stat->dds_creation_txg = dsl_get_creationtxg(ds);
stat->dds_inconsistent = dsl_get_inconsistent(ds);
stat->dds_guid = dsl_get_guid(ds);
stat->dds_origin[0] = '\0';
if (ds->ds_is_snapshot) {
stat->dds_is_snapshot = B_TRUE;
stat->dds_num_clones = dsl_get_numclones(ds);
} else {
stat->dds_is_snapshot = B_FALSE;
stat->dds_num_clones = 0;
if (dsl_dir_is_clone(ds->ds_dir)) {
dsl_dir_get_origin(ds->ds_dir, stat->dds_origin);
}
}
}
uint64_t
dsl_dataset_fsid_guid(dsl_dataset_t *ds)
{
return (ds->ds_fsid_guid);
}
void
dsl_dataset_space(dsl_dataset_t *ds,
uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp)
{
*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
*availbytesp +=
ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
if (ds->ds_quota != 0) {
/*
* Adjust available bytes according to refquota
*/
if (*refdbytesp < ds->ds_quota)
*availbytesp = MIN(*availbytesp,
ds->ds_quota - *refdbytesp);
else
*availbytesp = 0;
}
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}
boolean_t
dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
uint64_t birth;
ASSERT(dsl_pool_config_held(dp));
if (snap == NULL)
return (B_FALSE);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
birth = dsl_dataset_get_blkptr(ds)->blk_birth;
rrw_exit(&ds->ds_bp_rwlock, FTAG);
if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
objset_t *os, *os_snap;
/*
* It may be that only the ZIL differs, because it was
* reset in the head. Don't count that as being
* modified.
*/
if (dmu_objset_from_ds(ds, &os) != 0)
return (B_TRUE);
if (dmu_objset_from_ds(snap, &os_snap) != 0)
return (B_TRUE);
return (bcmp(&os->os_phys->os_meta_dnode,
&os_snap->os_phys->os_meta_dnode,
sizeof (os->os_phys->os_meta_dnode)) != 0);
}
return (B_FALSE);
}
typedef struct dsl_dataset_rename_snapshot_arg {
const char *ddrsa_fsname;
const char *ddrsa_oldsnapname;
const char *ddrsa_newsnapname;
boolean_t ddrsa_recursive;
dmu_tx_t *ddrsa_tx;
} dsl_dataset_rename_snapshot_arg_t;
/* ARGSUSED */
static int
dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
dsl_dataset_t *hds, void *arg)
{
dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
int error;
uint64_t val;
error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
if (error != 0) {
/* ignore nonexistent snapshots */
return (error == ENOENT ? 0 : error);
}
/* new name should not exist */
error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
if (error == 0)
error = SET_ERROR(EEXIST);
else if (error == ENOENT)
error = 0;
/* dataset name + 1 for the "@" + the new snapshot name must fit */
if (dsl_dir_namelen(hds->ds_dir) + 1 +
strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
error = SET_ERROR(ENAMETOOLONG);
return (error);
}
static int
dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
{
dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *hds;
int error;
error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
if (error != 0)
return (error);
if (ddrsa->ddrsa_recursive) {
error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
dsl_dataset_rename_snapshot_check_impl, ddrsa,
DS_FIND_CHILDREN);
} else {
error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
}
dsl_dataset_rele(hds, FTAG);
return (error);
}
static int
dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
dsl_dataset_t *hds, void *arg)
{
#ifdef __FreeBSD__
#ifdef _KERNEL
char *oldname, *newname;
#endif
#endif
dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
dsl_dataset_t *ds;
uint64_t val;
dmu_tx_t *tx = ddrsa->ddrsa_tx;
int error;
error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
ASSERT(error == 0 || error == ENOENT);
if (error == ENOENT) {
/* ignore nonexistent snapshots */
return (0);
}
VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
/* log before we change the name */
spa_history_log_internal_ds(ds, "rename", tx,
"-> @%s", ddrsa->ddrsa_newsnapname);
VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
B_FALSE));
mutex_enter(&ds->ds_lock);
(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
mutex_exit(&ds->ds_lock);
VERIFY0(zap_add(dp->dp_meta_objset,
dsl_dataset_phys(hds)->ds_snapnames_zapobj,
ds->ds_snapname, 8, 1, &ds->ds_object, tx));
#ifdef __FreeBSD__
#ifdef _KERNEL
oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
ddrsa->ddrsa_oldsnapname);
snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
ddrsa->ddrsa_newsnapname);
zfsvfs_update_fromname(oldname, newname);
zvol_rename_minors(oldname, newname);
kmem_free(newname, MAXPATHLEN);
kmem_free(oldname, MAXPATHLEN);
#endif
#endif
dsl_dataset_rele(ds, FTAG);
return (0);
}
static void
dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
{
dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *hds;
VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
ddrsa->ddrsa_tx = tx;
if (ddrsa->ddrsa_recursive) {
VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
dsl_dataset_rename_snapshot_sync_impl, ddrsa,
DS_FIND_CHILDREN));
} else {
VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
}
dsl_dataset_rele(hds, FTAG);
}
int
dsl_dataset_rename_snapshot(const char *fsname,
const char *oldsnapname, const char *newsnapname, boolean_t recursive)
{
dsl_dataset_rename_snapshot_arg_t ddrsa;
ddrsa.ddrsa_fsname = fsname;
ddrsa.ddrsa_oldsnapname = oldsnapname;
ddrsa.ddrsa_newsnapname = newsnapname;
ddrsa.ddrsa_recursive = recursive;
return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
dsl_dataset_rename_snapshot_sync, &ddrsa,
1, ZFS_SPACE_CHECK_RESERVED));
}
/*
* If we're doing an ownership handoff, we need to make sure that there is
* only one long hold on the dataset. We're not allowed to change anything here
* so we don't permanently release the long hold or regular hold here. We want
* to do this only when syncing to avoid the dataset unexpectedly going away
* when we release the long hold.
*/
static int
dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
{
boolean_t held;
if (!dmu_tx_is_syncing(tx))
return (0);
if (owner != NULL) {
VERIFY3P(ds->ds_owner, ==, owner);
dsl_dataset_long_rele(ds, owner);
}
held = dsl_dataset_long_held(ds);
if (owner != NULL)
dsl_dataset_long_hold(ds, owner);
if (held)
return (SET_ERROR(EBUSY));
return (0);
}
int
dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
{
dsl_dataset_rollback_arg_t *ddra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
int64_t unused_refres_delta;
int error;
error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
if (error != 0)
return (error);
/* must not be a snapshot */
if (ds->ds_is_snapshot) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
/* must have a most recent snapshot */
if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ESRCH));
}
/*
* No rollback to a snapshot created in the current txg, because
* the rollback may dirty the dataset and create blocks that are
* not reachable from the rootbp while having a birth txg that
* falls into the snapshot's range.
*/
if (dmu_tx_is_syncing(tx) &&
dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EAGAIN));
}
/*
* If the expected target snapshot is specified, then check that
* the latest snapshot is it.
*/
if (ddra->ddra_tosnap != NULL) {
dsl_dataset_t *snapds;
/* Check if the target snapshot exists at all. */
error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
if (error != 0) {
/*
* ESRCH is used to signal that the target snapshot does
* not exist, while ENOENT is used to report that
* the rolled back dataset does not exist.
* ESRCH is also used to cover other cases where the
* target snapshot is not related to the dataset being
* rolled back such as being in a different pool.
*/
if (error == ENOENT || error == EXDEV)
error = SET_ERROR(ESRCH);
dsl_dataset_rele(ds, FTAG);
return (error);
}
ASSERT(snapds->ds_is_snapshot);
/* Check if the snapshot is the latest snapshot indeed. */
if (snapds != ds->ds_prev) {
/*
* Distinguish between the case where the only problem
* is intervening snapshots (EEXIST) vs the snapshot
* not being a valid target for rollback (ESRCH).
*/
if (snapds->ds_dir == ds->ds_dir ||
(dsl_dir_is_clone(ds->ds_dir) &&
dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==
snapds->ds_object)) {
error = SET_ERROR(EEXIST);
} else {
error = SET_ERROR(ESRCH);
}
dsl_dataset_rele(snapds, FTAG);
dsl_dataset_rele(ds, FTAG);
return (error);
}
dsl_dataset_rele(snapds, FTAG);
}
/* must not have any bookmarks after the most recent snapshot */
nvlist_t *proprequest = fnvlist_alloc();
fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
nvlist_t *bookmarks = fnvlist_alloc();
error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
fnvlist_free(proprequest);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
}
for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
nvlist_t *valuenv =
fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
zfs_prop_to_name(ZFS_PROP_CREATETXG));
uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
fnvlist_free(bookmarks);
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EEXIST));
}
}
fnvlist_free(bookmarks);
error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
}
/*
* Check if the snap we are rolling back to uses more than
* the refquota.
*/
if (ds->ds_quota != 0 &&
dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EDQUOT));
}
/*
* When we do the clone swap, we will temporarily use more space
* due to the refreservation (the head will no longer have any
* unique space, so the entire amount of the refreservation will need
* to be free). We will immediately destroy the clone, freeing
* this space, but the freeing happens over many txg's.
*/
unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
dsl_dataset_phys(ds)->ds_unique_bytes);
if (unused_refres_delta > 0 &&
unused_refres_delta >
dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENOSPC));
}
dsl_dataset_rele(ds, FTAG);
return (0);
}
void
dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
{
dsl_dataset_rollback_arg_t *ddra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds, *clone;
uint64_t cloneobj;
char namebuf[ZFS_MAX_DATASET_NAME_LEN];
VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
dsl_dataset_name(ds->ds_prev, namebuf);
fnvlist_add_string(ddra->ddra_result, "target", namebuf);
cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
dsl_dataset_zero_zil(ds, tx);
dsl_destroy_head_sync_impl(clone, tx);
dsl_dataset_rele(clone, FTAG);
dsl_dataset_rele(ds, FTAG);
}
/*
* Rolls back the given filesystem or volume to the most recent snapshot.
* The name of the most recent snapshot will be returned under key "target"
* in the result nvlist.
*
* If owner != NULL:
* - The existing dataset MUST be owned by the specified owner at entry
* - Upon return, dataset will still be held by the same owner, whether we
* succeed or not.
*
* This mode is required any time the existing filesystem is mounted. See
* notes above zfs_suspend_fs() for further details.
*/
int
dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
nvlist_t *result)
{
dsl_dataset_rollback_arg_t ddra;
ddra.ddra_fsname = fsname;
ddra.ddra_tosnap = tosnap;
ddra.ddra_owner = owner;
ddra.ddra_result = result;
return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
dsl_dataset_rollback_sync, &ddra,
1, ZFS_SPACE_CHECK_RESERVED));
}
struct promotenode {
list_node_t link;
dsl_dataset_t *ds;
};
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
void *tag);
static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
int
dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
{
dsl_dataset_promote_arg_t *ddpa = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *hds;
struct promotenode *snap;
dsl_dataset_t *origin_ds;
int err;
uint64_t unused;
uint64_t ss_mv_cnt;
size_t max_snap_len;
boolean_t conflicting_snaps;
err = promote_hold(ddpa, dp, FTAG);
if (err != 0)
return (err);
hds = ddpa->ddpa_clone;
snap = list_head(&ddpa->shared_snaps);
origin_ds = snap->ds;
max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
snap = list_head(&ddpa->origin_snaps);
if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
promote_rele(ddpa, FTAG);
return (SET_ERROR(EXDEV));
}
/*
* Compute and check the amount of space to transfer. Since this is
* so expensive, don't do the preliminary check.
*/
if (!dmu_tx_is_syncing(tx)) {
promote_rele(ddpa, FTAG);
return (0);
}
/* compute origin's new unique space */
snap = list_tail(&ddpa->clone_snaps);
ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
origin_ds->ds_object);
dsl_deadlist_space_range(&snap->ds->ds_deadlist,
dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
&ddpa->unique, &unused, &unused);
/*
* Walk the snapshots that we are moving
*
* Compute space to transfer. Consider the incremental changes
* to used by each snapshot:
* (my used) = (prev's used) + (blocks born) - (blocks killed)
* So each snapshot gave birth to:
* (blocks born) = (my used) - (prev's used) + (blocks killed)
* So a sequence would look like:
* (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
* Which simplifies to:
* uN + kN + kN-1 + ... + k1 + k0
* Note however, if we stop before we reach the ORIGIN we get:
* uN + kN + kN-1 + ... + kM - uM-1
*/
conflicting_snaps = B_FALSE;
ss_mv_cnt = 0;
ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
for (snap = list_head(&ddpa->shared_snaps); snap;
snap = list_next(&ddpa->shared_snaps, snap)) {
uint64_t val, dlused, dlcomp, dluncomp;
dsl_dataset_t *ds = snap->ds;
ss_mv_cnt++;
/*
* If there are long holds, we won't be able to evict
* the objset.
*/
if (dsl_dataset_long_held(ds)) {
err = SET_ERROR(EBUSY);
goto out;
}
/* Check that the snapshot name does not conflict */
VERIFY0(dsl_dataset_get_snapname(ds));
if (strlen(ds->ds_snapname) >= max_snap_len) {
err = SET_ERROR(ENAMETOOLONG);
goto out;
}
err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
if (err == 0) {
fnvlist_add_boolean(ddpa->err_ds,
snap->ds->ds_snapname);
conflicting_snaps = B_TRUE;
} else if (err != ENOENT) {
goto out;
}
/* The very first snapshot does not have a deadlist */
if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
continue;
dsl_deadlist_space(&ds->ds_deadlist,
&dlused, &dlcomp, &dluncomp);
ddpa->used += dlused;
ddpa->comp += dlcomp;
ddpa->uncomp += dluncomp;
}
/*
* In order to return the full list of conflicting snapshots, we check
* whether there was a conflict after traversing all of them.
*/
if (conflicting_snaps) {
err = SET_ERROR(EEXIST);
goto out;
}
/*
* If we are a clone of a clone then we never reached ORIGIN,
* so we need to subtract out the clone origin's used space.
*/
if (ddpa->origin_origin) {
ddpa->used -=
dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
ddpa->comp -=
dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
ddpa->uncomp -=
dsl_dataset_phys(ddpa->origin_origin)->
ds_uncompressed_bytes;
}
/* Check that there is enough space and limit headroom here */
err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
0, ss_mv_cnt, ddpa->used, ddpa->cr);
if (err != 0)
goto out;
/*
* Compute the amounts of space that will be used by snapshots
* after the promotion (for both origin and clone). For each,
* it is the amount of space that will be on all of their
* deadlists (that was not born before their new origin).
*/
if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
uint64_t space;
/*
* Note, typically this will not be a clone of a clone,
* so dd_origin_txg will be < TXG_INITIAL, so
* these snaplist_space() -> dsl_deadlist_space_range()
* calls will be fast because they do not have to
* iterate over all bps.
*/
snap = list_head(&ddpa->origin_snaps);
err = snaplist_space(&ddpa->shared_snaps,
snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
if (err != 0)
goto out;
err = snaplist_space(&ddpa->clone_snaps,
snap->ds->ds_dir->dd_origin_txg, &space);
if (err != 0)
goto out;
ddpa->cloneusedsnap += space;
}
if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
DD_FLAG_USED_BREAKDOWN) {
err = snaplist_space(&ddpa->origin_snaps,
dsl_dataset_phys(origin_ds)->ds_creation_txg,
&ddpa->originusedsnap);
if (err != 0)
goto out;
}
out:
promote_rele(ddpa, FTAG);
return (err);
}
void
dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
{
dsl_dataset_promote_arg_t *ddpa = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *hds;
struct promotenode *snap;
dsl_dataset_t *origin_ds;
dsl_dataset_t *origin_head;
dsl_dir_t *dd;
dsl_dir_t *odd = NULL;
uint64_t oldnext_obj;
int64_t delta;
#if defined(__FreeBSD__) && defined(_KERNEL)
char *oldname, *newname;
#endif
VERIFY0(promote_hold(ddpa, dp, FTAG));
hds = ddpa->ddpa_clone;
ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
snap = list_head(&ddpa->shared_snaps);
origin_ds = snap->ds;
dd = hds->ds_dir;
snap = list_head(&ddpa->origin_snaps);
origin_head = snap->ds;
/*
* We need to explicitly open odd, since origin_ds's dd will be
* changing.
*/
VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
NULL, FTAG, &odd));
/* change origin's next snap */
dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
snap = list_tail(&ddpa->clone_snaps);
ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
origin_ds->ds_object);
dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
/* change the origin's next clone */
if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
dsl_dataset_remove_from_next_clones(origin_ds,
snap->ds->ds_object, tx);
VERIFY0(zap_add_int(dp->dp_meta_objset,
dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
oldnext_obj, tx));
}
/* change origin */
dmu_buf_will_dirty(dd->dd_dbuf, tx);
ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
dmu_buf_will_dirty(odd->dd_dbuf, tx);
dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
origin_head->ds_dir->dd_origin_txg =
dsl_dataset_phys(origin_ds)->ds_creation_txg;
/* change dd_clone entries */
if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
VERIFY0(zap_remove_int(dp->dp_meta_objset,
dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
VERIFY0(zap_add_int(dp->dp_meta_objset,
dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
hds->ds_object, tx));
VERIFY0(zap_remove_int(dp->dp_meta_objset,
dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
origin_head->ds_object, tx));
if (dsl_dir_phys(dd)->dd_clones == 0) {
dsl_dir_phys(dd)->dd_clones =
zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
DMU_OT_NONE, 0, tx);
}
VERIFY0(zap_add_int(dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
}
#if defined(__FreeBSD__) && defined(_KERNEL)
/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
mutex_enter(&spa_namespace_lock);
oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
#endif
/* move snapshots to this dir */
for (snap = list_head(&ddpa->shared_snaps); snap;
snap = list_next(&ddpa->shared_snaps, snap)) {
dsl_dataset_t *ds = snap->ds;
/*
* Property callbacks are registered to a particular
* dsl_dir. Since ours is changing, evict the objset
* so that they will be unregistered from the old dsl_dir.
*/
if (ds->ds_objset) {
dmu_objset_evict(ds->ds_objset);
ds->ds_objset = NULL;
}
/* move snap name entry */
VERIFY0(dsl_dataset_get_snapname(ds));
VERIFY0(dsl_dataset_snap_remove(origin_head,
ds->ds_snapname, tx, B_TRUE));
VERIFY0(zap_add(dp->dp_meta_objset,
dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
8, 1, &ds->ds_object, tx));
dsl_fs_ss_count_adjust(hds->ds_dir, 1,
DD_FIELD_SNAPSHOT_COUNT, tx);
/* change containing dsl_dir */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
ASSERT3P(ds->ds_dir, ==, odd);
dsl_dir_rele(ds->ds_dir, ds);
VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
NULL, ds, &ds->ds_dir));
#if defined(__FreeBSD__) && defined(_KERNEL)
dsl_dataset_name(ds, newname);
zfsvfs_update_fromname(oldname, newname);
zvol_rename_minors(oldname, newname);
#endif
/* move any clone references */
if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
zap_cursor_t zc;
zap_attribute_t za;
for (zap_cursor_init(&zc, dp->dp_meta_objset,
dsl_dataset_phys(ds)->ds_next_clones_obj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
dsl_dataset_t *cnds;
uint64_t o;
if (za.za_first_integer == oldnext_obj) {
/*
* We've already moved the
* origin's reference.
*/
continue;
}
VERIFY0(dsl_dataset_hold_obj(dp,
za.za_first_integer, FTAG, &cnds));
o = dsl_dir_phys(cnds->ds_dir)->
dd_head_dataset_obj;
VERIFY0(zap_remove_int(dp->dp_meta_objset,
dsl_dir_phys(odd)->dd_clones, o, tx));
VERIFY0(zap_add_int(dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_clones, o, tx));
dsl_dataset_rele(cnds, FTAG);
}
zap_cursor_fini(&zc);
}
ASSERT(!dsl_prop_hascb(ds));
}
#if defined(__FreeBSD__) && defined(_KERNEL)
mutex_exit(&spa_namespace_lock);
kmem_free(newname, MAXPATHLEN);
kmem_free(oldname, MAXPATHLEN);
#endif
/*
* Change space accounting.
* Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
* both be valid, or both be 0 (resulting in delta == 0). This
* is true for each of {clone,origin} independently.
*/
delta = ddpa->cloneusedsnap -
dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
ASSERT3S(delta, >=, 0);
ASSERT3U(ddpa->used, >=, delta);
dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
dsl_dir_diduse_space(dd, DD_USED_HEAD,
ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
delta = ddpa->originusedsnap -
dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
ASSERT3S(delta, <=, 0);
ASSERT3U(ddpa->used, >=, -delta);
dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
dsl_dir_diduse_space(odd, DD_USED_HEAD,
-ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
/* log history record */
spa_history_log_internal_ds(hds, "promote", tx, "");
dsl_dir_rele(odd, FTAG);
promote_rele(ddpa, FTAG);
}
/*
* Make a list of dsl_dataset_t's for the snapshots between first_obj
* (exclusive) and last_obj (inclusive). The list will be in reverse
* order (last_obj will be the list_head()). If first_obj == 0, do all
* snapshots back to this dataset's origin.
*/
static int
snaplist_make(dsl_pool_t *dp,
uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
{
uint64_t obj = last_obj;
list_create(l, sizeof (struct promotenode),
offsetof(struct promotenode, link));
while (obj != first_obj) {
dsl_dataset_t *ds;
struct promotenode *snap;
int err;
err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
ASSERT(err != ENOENT);
if (err != 0)
return (err);
if (first_obj == 0)
first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
snap->ds = ds;
list_insert_tail(l, snap);
obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
}
return (0);
}
static int
snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
{
struct promotenode *snap;
*spacep = 0;
for (snap = list_head(l); snap; snap = list_next(l, snap)) {
uint64_t used, comp, uncomp;
dsl_deadlist_space_range(&snap->ds->ds_deadlist,
mintxg, UINT64_MAX, &used, &comp, &uncomp);
*spacep += used;
}
return (0);
}
static void
snaplist_destroy(list_t *l, void *tag)
{
struct promotenode *snap;
if (l == NULL || !list_link_active(&l->list_head))
return;
while ((snap = list_tail(l)) != NULL) {
list_remove(l, snap);
dsl_dataset_rele(snap->ds, tag);
kmem_free(snap, sizeof (*snap));
}
list_destroy(l);
}
static int
promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
{
int error;
dsl_dir_t *dd;
struct promotenode *snap;
error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
&ddpa->ddpa_clone);
if (error != 0)
return (error);
dd = ddpa->ddpa_clone->ds_dir;
if (ddpa->ddpa_clone->ds_is_snapshot ||
!dsl_dir_is_clone(dd)) {
dsl_dataset_rele(ddpa->ddpa_clone, tag);
return (SET_ERROR(EINVAL));
}
error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
&ddpa->shared_snaps, tag);
if (error != 0)
goto out;
error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
&ddpa->clone_snaps, tag);
if (error != 0)
goto out;
snap = list_head(&ddpa->shared_snaps);
ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
&ddpa->origin_snaps, tag);
if (error != 0)
goto out;
if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
error = dsl_dataset_hold_obj(dp,
dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
tag, &ddpa->origin_origin);
if (error != 0)
goto out;
}
out:
if (error != 0)
promote_rele(ddpa, tag);
return (error);
}
static void
promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
{
snaplist_destroy(&ddpa->shared_snaps, tag);
snaplist_destroy(&ddpa->clone_snaps, tag);
snaplist_destroy(&ddpa->origin_snaps, tag);
if (ddpa->origin_origin != NULL)
dsl_dataset_rele(ddpa->origin_origin, tag);
dsl_dataset_rele(ddpa->ddpa_clone, tag);
}
/*
* Promote a clone.
*
* If it fails due to a conflicting snapshot name, "conflsnap" will be filled
* in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
*/
int
dsl_dataset_promote(const char *name, char *conflsnap)
{
dsl_dataset_promote_arg_t ddpa = { 0 };
uint64_t numsnaps;
int error;
nvpair_t *snap_pair;
objset_t *os;
/*
* We will modify space proportional to the number of
* snapshots. Compute numsnaps.
*/
error = dmu_objset_hold(name, FTAG, &os);
if (error != 0)
return (error);
error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
&numsnaps);
dmu_objset_rele(os, FTAG);
if (error != 0)
return (error);
ddpa.ddpa_clonename = name;
ddpa.err_ds = fnvlist_alloc();
ddpa.cr = CRED();
error = dsl_sync_task(name, dsl_dataset_promote_check,
dsl_dataset_promote_sync, &ddpa,
2 + numsnaps, ZFS_SPACE_CHECK_RESERVED);
/*
* Return the first conflicting snapshot found.
*/
snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);
if (snap_pair != NULL && conflsnap != NULL)
(void) strcpy(conflsnap, nvpair_name(snap_pair));
fnvlist_free(ddpa.err_ds);
return (error);
}
int
dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
{
/*
* "slack" factor for received datasets with refquota set on them.
* See the bottom of this function for details on its use.
*/
uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation;
int64_t unused_refres_delta;
/* they should both be heads */
if (clone->ds_is_snapshot ||
origin_head->ds_is_snapshot)
return (SET_ERROR(EINVAL));
/* if we are not forcing, the branch point should be just before them */
if (!force && clone->ds_prev != origin_head->ds_prev)
return (SET_ERROR(EINVAL));
/* clone should be the clone (unless they are unrelated) */
if (clone->ds_prev != NULL &&
clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
origin_head->ds_dir != clone->ds_prev->ds_dir)
return (SET_ERROR(EINVAL));
/* the clone should be a child of the origin */
if (clone->ds_dir->dd_parent != origin_head->ds_dir)
return (SET_ERROR(EINVAL));
/* origin_head shouldn't be modified unless 'force' */
if (!force &&
dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
return (SET_ERROR(ETXTBSY));
/* origin_head should have no long holds (e.g. is not mounted) */
if (dsl_dataset_handoff_check(origin_head, owner, tx))
return (SET_ERROR(EBUSY));
/* check amount of any unconsumed refreservation */
unused_refres_delta =
(int64_t)MIN(origin_head->ds_reserved,
dsl_dataset_phys(origin_head)->ds_unique_bytes) -
(int64_t)MIN(origin_head->ds_reserved,
dsl_dataset_phys(clone)->ds_unique_bytes);
if (unused_refres_delta > 0 &&
unused_refres_delta >
dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
return (SET_ERROR(ENOSPC));
/*
* The clone can't be too much over the head's refquota.
*
* To ensure that the entire refquota can be used, we allow one
* transaction to exceed the the refquota. Therefore, this check
* needs to also allow for the space referenced to be more than the
* refquota. The maximum amount of space that one transaction can use
* on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this
* overage ensures that we are able to receive a filesystem that
* exceeds the refquota on the source system.
*
* So that overage is the refquota_slack we use below.
*/
if (origin_head->ds_quota != 0 &&
dsl_dataset_phys(clone)->ds_referenced_bytes >
origin_head->ds_quota + refquota_slack)
return (SET_ERROR(EDQUOT));
return (0);
}
+static void
+dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,
+ dsl_dataset_t *origin, dmu_tx_t *tx)
+{
+ uint64_t clone_remap_dl_obj, origin_remap_dl_obj;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ ASSERT(dsl_pool_sync_context(dp));
+
+ clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);
+ origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);
+
+ if (clone_remap_dl_obj != 0) {
+ dsl_deadlist_close(&clone->ds_remap_deadlist);
+ dsl_dataset_unset_remap_deadlist_object(clone, tx);
+ }
+ if (origin_remap_dl_obj != 0) {
+ dsl_deadlist_close(&origin->ds_remap_deadlist);
+ dsl_dataset_unset_remap_deadlist_object(origin, tx);
+ }
+
+ if (clone_remap_dl_obj != 0) {
+ dsl_dataset_set_remap_deadlist_object(origin,
+ clone_remap_dl_obj, tx);
+ dsl_deadlist_open(&origin->ds_remap_deadlist,
+ dp->dp_meta_objset, clone_remap_dl_obj);
+ }
+ if (origin_remap_dl_obj != 0) {
+ dsl_dataset_set_remap_deadlist_object(clone,
+ origin_remap_dl_obj, tx);
+ dsl_deadlist_open(&clone->ds_remap_deadlist,
+ dp->dp_meta_objset, origin_remap_dl_obj);
+ }
+}
+
void
dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
dsl_dataset_t *origin_head, dmu_tx_t *tx)
{
dsl_pool_t *dp = dmu_tx_pool(tx);
int64_t unused_refres_delta;
ASSERT(clone->ds_reserved == 0);
/*
* NOTE: On DEBUG kernels there could be a race between this and
* the check function if spa_asize_inflation is adjusted...
*/
ASSERT(origin_head->ds_quota == 0 ||
dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
DMU_MAX_ACCESS * spa_asize_inflation);
ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
/*
* Swap per-dataset feature flags.
*/
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
if (!(spa_feature_table[f].fi_flags &
ZFEATURE_FLAG_PER_DATASET)) {
ASSERT(!clone->ds_feature_inuse[f]);
ASSERT(!origin_head->ds_feature_inuse[f]);
continue;
}
boolean_t clone_inuse = clone->ds_feature_inuse[f];
boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
if (clone_inuse) {
dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
clone->ds_feature_inuse[f] = B_FALSE;
}
if (origin_head_inuse) {
dsl_dataset_deactivate_feature(origin_head->ds_object,
f, tx);
origin_head->ds_feature_inuse[f] = B_FALSE;
}
if (clone_inuse) {
dsl_dataset_activate_feature(origin_head->ds_object,
f, tx);
origin_head->ds_feature_inuse[f] = B_TRUE;
}
if (origin_head_inuse) {
dsl_dataset_activate_feature(clone->ds_object, f, tx);
clone->ds_feature_inuse[f] = B_TRUE;
}
}
dmu_buf_will_dirty(clone->ds_dbuf, tx);
dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
if (clone->ds_objset != NULL) {
dmu_objset_evict(clone->ds_objset);
clone->ds_objset = NULL;
}
if (origin_head->ds_objset != NULL) {
dmu_objset_evict(origin_head->ds_objset);
origin_head->ds_objset = NULL;
}
unused_refres_delta =
(int64_t)MIN(origin_head->ds_reserved,
dsl_dataset_phys(origin_head)->ds_unique_bytes) -
(int64_t)MIN(origin_head->ds_reserved,
dsl_dataset_phys(clone)->ds_unique_bytes);
/*
* Reset origin's unique bytes, if it exists.
*/
if (clone->ds_prev) {
dsl_dataset_t *origin = clone->ds_prev;
uint64_t comp, uncomp;
dmu_buf_will_dirty(origin->ds_dbuf, tx);
dsl_deadlist_space_range(&clone->ds_deadlist,
dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
&dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
}
/* swap blkptrs */
{
rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
blkptr_t tmp;
tmp = dsl_dataset_phys(origin_head)->ds_bp;
dsl_dataset_phys(origin_head)->ds_bp =
dsl_dataset_phys(clone)->ds_bp;
dsl_dataset_phys(clone)->ds_bp = tmp;
rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
rrw_exit(&clone->ds_bp_rwlock, FTAG);
}
/* set dd_*_bytes */
{
int64_t dused, dcomp, duncomp;
uint64_t cdl_used, cdl_comp, cdl_uncomp;
uint64_t odl_used, odl_comp, odl_uncomp;
ASSERT3U(dsl_dir_phys(clone->ds_dir)->
dd_used_breakdown[DD_USED_SNAP], ==, 0);
dsl_deadlist_space(&clone->ds_deadlist,
&cdl_used, &cdl_comp, &cdl_uncomp);
dsl_deadlist_space(&origin_head->ds_deadlist,
&odl_used, &odl_comp, &odl_uncomp);
dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
cdl_used -
(dsl_dataset_phys(origin_head)->ds_referenced_bytes +
odl_used);
dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
cdl_comp -
(dsl_dataset_phys(origin_head)->ds_compressed_bytes +
odl_comp);
duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
cdl_uncomp -
(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
odl_uncomp);
dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
dused, dcomp, duncomp, tx);
dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
-dused, -dcomp, -duncomp, tx);
/*
* The difference in the space used by snapshots is the
* difference in snapshot space due to the head's
* deadlist (since that's the only thing that's
* changing that affects the snapused).
*/
dsl_deadlist_space_range(&clone->ds_deadlist,
origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
&cdl_used, &cdl_comp, &cdl_uncomp);
dsl_deadlist_space_range(&origin_head->ds_deadlist,
origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
&odl_used, &odl_comp, &odl_uncomp);
dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
DD_USED_HEAD, DD_USED_SNAP, NULL);
}
/* swap ds_*_bytes */
SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
dsl_dataset_phys(clone)->ds_referenced_bytes);
SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
dsl_dataset_phys(clone)->ds_compressed_bytes);
SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
dsl_dataset_phys(clone)->ds_uncompressed_bytes);
SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
dsl_dataset_phys(clone)->ds_unique_bytes);
/* apply any parent delta for change in unconsumed refreservation */
dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
unused_refres_delta, 0, 0, tx);
/*
* Swap deadlists.
*/
dsl_deadlist_close(&clone->ds_deadlist);
dsl_deadlist_close(&origin_head->ds_deadlist);
SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
dsl_dataset_phys(clone)->ds_deadlist_obj);
dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
dsl_dataset_phys(clone)->ds_deadlist_obj);
dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
dsl_dataset_phys(origin_head)->ds_deadlist_obj);
+ dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);
dsl_scan_ds_clone_swapped(origin_head, clone, tx);
spa_history_log_internal_ds(clone, "clone swap", tx,
"parent=%s", origin_head->ds_dir->dd_myname);
}
/*
* Given a pool name and a dataset object number in that pool,
* return the name of that dataset.
*/
int
dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
int error;
error = dsl_pool_hold(pname, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
if (error == 0) {
dsl_dataset_name(ds, buf);
dsl_dataset_rele(ds, FTAG);
}
dsl_pool_rele(dp, FTAG);
return (error);
}
int
dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
{
int error = 0;
ASSERT3S(asize, >, 0);
/*
* *ref_rsrv is the portion of asize that will come from any
* unconsumed refreservation space.
*/
*ref_rsrv = 0;
mutex_enter(&ds->ds_lock);
/*
* Make a space adjustment for reserved bytes.
*/
if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
ASSERT3U(*used, >=,
ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
*used -=
(ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
*ref_rsrv =
asize - MIN(asize, parent_delta(ds, asize + inflight));
}
if (!check_quota || ds->ds_quota == 0) {
mutex_exit(&ds->ds_lock);
return (0);
}
/*
* If they are requesting more space, and our current estimate
* is over quota, they get to try again unless the actual
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
ds->ds_quota) {
if (inflight > 0 ||
dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
error = SET_ERROR(ERESTART);
else
error = SET_ERROR(EDQUOT);
}
mutex_exit(&ds->ds_lock);
return (error);
}
typedef struct dsl_dataset_set_qr_arg {
const char *ddsqra_name;
zprop_source_t ddsqra_source;
uint64_t ddsqra_value;
} dsl_dataset_set_qr_arg_t;
/* ARGSUSED */
static int
dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
{
dsl_dataset_set_qr_arg_t *ddsqra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
int error;
uint64_t newval;
if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
return (SET_ERROR(ENOTSUP));
error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
if (error != 0)
return (error);
if (ds->ds_is_snapshot) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
error = dsl_prop_predict(ds->ds_dir,
zfs_prop_to_name(ZFS_PROP_REFQUOTA),
ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
}
if (newval == 0) {
dsl_dataset_rele(ds, FTAG);
return (0);
}
if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
newval < ds->ds_reserved) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENOSPC));
}
dsl_dataset_rele(ds, FTAG);
return (0);
}
static void
dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
{
dsl_dataset_set_qr_arg_t *ddsqra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
uint64_t newval;
VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
dsl_prop_set_sync_impl(ds,
zfs_prop_to_name(ZFS_PROP_REFQUOTA),
ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
&ddsqra->ddsqra_value, tx);
VERIFY0(dsl_prop_get_int_ds(ds,
zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
if (ds->ds_quota != newval) {
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_quota = newval;
}
dsl_dataset_rele(ds, FTAG);
}
int
dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
uint64_t refquota)
{
dsl_dataset_set_qr_arg_t ddsqra;
ddsqra.ddsqra_name = dsname;
ddsqra.ddsqra_source = source;
ddsqra.ddsqra_value = refquota;
return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
}
static int
dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
{
dsl_dataset_set_qr_arg_t *ddsqra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
int error;
uint64_t newval, unique;
if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
return (SET_ERROR(ENOTSUP));
error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
if (error != 0)
return (error);
if (ds->ds_is_snapshot) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
error = dsl_prop_predict(ds->ds_dir,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
}
/*
* If we are doing the preliminary check in open context, the
* space estimates may be inaccurate.
*/
if (!dmu_tx_is_syncing(tx)) {
dsl_dataset_rele(ds, FTAG);
return (0);
}
mutex_enter(&ds->ds_lock);
if (!DS_UNIQUE_IS_ACCURATE(ds))
dsl_dataset_recalc_head_uniq(ds);
unique = dsl_dataset_phys(ds)->ds_unique_bytes;
mutex_exit(&ds->ds_lock);
if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
uint64_t delta = MAX(unique, newval) -
MAX(unique, ds->ds_reserved);
if (delta >
dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
(ds->ds_quota > 0 && newval > ds->ds_quota)) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENOSPC));
}
}
dsl_dataset_rele(ds, FTAG);
return (0);
}
void
dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
zprop_source_t source, uint64_t value, dmu_tx_t *tx)
{
uint64_t newval;
uint64_t unique;
int64_t delta;
dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
source, sizeof (value), 1, &value, tx);
VERIFY0(dsl_prop_get_int_ds(ds,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
unique = dsl_dataset_phys(ds)->ds_unique_bytes;
delta = MAX(0, (int64_t)(newval - unique)) -
MAX(0, (int64_t)(ds->ds_reserved - unique));
ds->ds_reserved = newval;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
mutex_exit(&ds->ds_dir->dd_lock);
}
static void
dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
{
dsl_dataset_set_qr_arg_t *ddsqra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
dsl_dataset_set_refreservation_sync_impl(ds,
ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
dsl_dataset_rele(ds, FTAG);
}
int
dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
uint64_t refreservation)
{
dsl_dataset_set_qr_arg_t ddsqra;
ddsqra.ddsqra_name = dsname;
ddsqra.ddsqra_source = source;
ddsqra.ddsqra_value = refreservation;
return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
dsl_dataset_set_refreservation_sync, &ddsqra,
0, ZFS_SPACE_CHECK_NONE));
}
/*
* Return (in *usedp) the amount of space written in new that is not
* present in oldsnap. New may be a snapshot or the head. Old must be
* a snapshot before new, in new's filesystem (or its origin). If not then
* fail and return EINVAL.
*
* The written space is calculated by considering two components: First, we
* ignore any freed space, and calculate the written as new's used space
* minus old's used space. Next, we add in the amount of space that was freed
* between the two snapshots, thus reducing new's used space relative to old's.
* Specifically, this is the space that was born before old->ds_creation_txg,
* and freed before new (ie. on new's deadlist or a previous deadlist).
*
* space freed [---------------------]
* snapshots ---O-------O--------O-------O------
* oldsnap new
*/
int
dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
int err = 0;
uint64_t snapobj;
dsl_pool_t *dp = new->ds_dir->dd_pool;
ASSERT(dsl_pool_config_held(dp));
*usedp = 0;
*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
*compp = 0;
*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
*uncompp = 0;
*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
snapobj = new->ds_object;
while (snapobj != oldsnap->ds_object) {
dsl_dataset_t *snap;
uint64_t used, comp, uncomp;
if (snapobj == new->ds_object) {
snap = new;
} else {
err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
if (err != 0)
break;
}
if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
dsl_dataset_phys(oldsnap)->ds_creation_txg) {
/*
* The blocks in the deadlist can not be born after
* ds_prev_snap_txg, so get the whole deadlist space,
* which is more efficient (especially for old-format
* deadlists). Unfortunately the deadlist code
* doesn't have enough information to make this
* optimization itself.
*/
dsl_deadlist_space(&snap->ds_deadlist,
&used, &comp, &uncomp);
} else {
dsl_deadlist_space_range(&snap->ds_deadlist,
0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
&used, &comp, &uncomp);
}
*usedp += used;
*compp += comp;
*uncompp += uncomp;
/*
* If we get to the beginning of the chain of snapshots
* (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
* was not a snapshot of/before new.
*/
snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
if (snap != new)
dsl_dataset_rele(snap, FTAG);
if (snapobj == 0) {
err = SET_ERROR(EINVAL);
break;
}
}
return (err);
}
/*
* Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
* lastsnap, and all snapshots in between are deleted.
*
* blocks that would be freed [---------------------------]
* snapshots ---O-------O--------O-------O--------O
* firstsnap lastsnap
*
* This is the set of blocks that were born after the snap before firstsnap,
* (birth > firstsnap->prev_snap_txg) and died before the snap after the
* last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
* We calculate this by iterating over the relevant deadlists (from the snap
* after lastsnap, backward to the snap after firstsnap), summing up the
* space on the deadlist that was born after the snap before firstsnap.
*/
int
dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
dsl_dataset_t *lastsnap,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
int err = 0;
uint64_t snapobj;
dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
ASSERT(firstsnap->ds_is_snapshot);
ASSERT(lastsnap->ds_is_snapshot);
/*
* Check that the snapshots are in the same dsl_dir, and firstsnap
* is before lastsnap.
*/
if (firstsnap->ds_dir != lastsnap->ds_dir ||
dsl_dataset_phys(firstsnap)->ds_creation_txg >
dsl_dataset_phys(lastsnap)->ds_creation_txg)
return (SET_ERROR(EINVAL));
*usedp = *compp = *uncompp = 0;
snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
while (snapobj != firstsnap->ds_object) {
dsl_dataset_t *ds;
uint64_t used, comp, uncomp;
err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
if (err != 0)
break;
dsl_deadlist_space_range(&ds->ds_deadlist,
dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
&used, &comp, &uncomp);
*usedp += used;
*compp += comp;
*uncompp += uncomp;
snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
ASSERT3U(snapobj, !=, 0);
dsl_dataset_rele(ds, FTAG);
}
return (err);
}
/*
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
* For example, they could both be snapshots of the same filesystem, and
* 'earlier' is before 'later'. Or 'earlier' could be the origin of
* 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's
* filesystem. Or 'earlier' could be the origin's origin.
*
* If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
*/
boolean_t
dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
uint64_t earlier_txg)
{
dsl_pool_t *dp = later->ds_dir->dd_pool;
int error;
boolean_t ret;
ASSERT(dsl_pool_config_held(dp));
ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
if (earlier_txg == 0)
earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
if (later->ds_is_snapshot &&
earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
return (B_FALSE);
if (later->ds_dir == earlier->ds_dir)
return (B_TRUE);
if (!dsl_dir_is_clone(later->ds_dir))
return (B_FALSE);
if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
return (B_TRUE);
dsl_dataset_t *origin;
error = dsl_dataset_hold_obj(dp,
dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
if (error != 0)
return (B_FALSE);
ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
dsl_dataset_rele(origin, FTAG);
return (ret);
}
void
dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
}
boolean_t
dsl_dataset_is_zapified(dsl_dataset_t *ds)
{
dmu_object_info_t doi;
dmu_object_info_from_db(ds->ds_dbuf, &doi);
return (doi.doi_type == DMU_OTN_ZAP_METADATA);
}
boolean_t
dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
{
return (dsl_dataset_is_zapified(ds) &&
zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
+}
+
+uint64_t
+dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds)
+{
+ uint64_t remap_deadlist_obj;
+ int err;
+
+ if (!dsl_dataset_is_zapified(ds))
+ return (0);
+
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,
+ &remap_deadlist_obj);
+
+ if (err != 0) {
+ VERIFY3S(err, ==, ENOENT);
+ return (0);
+ }
+
+ ASSERT(remap_deadlist_obj != 0);
+ return (remap_deadlist_obj);
+}
+
+boolean_t
+dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)
+{
+ EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),
+ dsl_dataset_get_remap_deadlist_object(ds) != 0);
+ return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));
+}
+
+static void
+dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,
+ dmu_tx_t *tx)
+{
+ ASSERT(obj != 0);
+ dsl_dataset_zapify(ds, tx);
+ VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx));
+}
+
+static void
+dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx));
+}
+
+void
+dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t remap_deadlist_object;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_dataset_remap_deadlist_exists(ds));
+
+ remap_deadlist_object = ds->ds_remap_deadlist.dl_object;
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx);
+ dsl_dataset_unset_remap_deadlist_object(ds, tx);
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t remap_deadlist_obj;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock));
+ /*
+ * Currently we only create remap deadlists when there are indirect
+ * vdevs with referenced mappings.
+ */
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ remap_deadlist_obj = dsl_deadlist_clone(
+ &ds->ds_deadlist, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+ dsl_dataset_set_remap_deadlist_object(ds,
+ remap_deadlist_obj, tx);
+ dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa),
+ remap_deadlist_obj);
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c (revision 332525)
@@ -1,553 +1,566 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/dsl_dataset.h>
#include <sys/dmu.h>
#include <sys/refcount.h>
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/dsl_pool.h>
/*
* Deadlist concurrency:
*
* Deadlists can only be modified from the syncing thread.
*
* Except for dsl_deadlist_insert(), it can only be modified with the
* dp_config_rwlock held with RW_WRITER.
*
* The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
* be called concurrently, from open context, with the dl_config_rwlock held
* with RW_READER.
*
* Therefore, we only need to provide locking between dsl_deadlist_insert() and
* the accessors, protecting:
* dl_phys->dl_used,comp,uncomp
* and protecting the dl_tree from being loaded.
* The locking is provided by dl_lock. Note that locking on the bpobj_t
* provides its own locking, and dl_oldfmt is immutable.
*/
static int
dsl_deadlist_compare(const void *arg1, const void *arg2)
{
const dsl_deadlist_entry_t *dle1 = arg1;
const dsl_deadlist_entry_t *dle2 = arg2;
if (dle1->dle_mintxg < dle2->dle_mintxg)
return (-1);
else if (dle1->dle_mintxg > dle2->dle_mintxg)
return (+1);
else
return (0);
}
static void
dsl_deadlist_load_tree(dsl_deadlist_t *dl)
{
zap_cursor_t zc;
zap_attribute_t za;
ASSERT(MUTEX_HELD(&dl->dl_lock));
ASSERT(!dl->dl_oldfmt);
if (dl->dl_havetree)
return;
avl_create(&dl->dl_tree, dsl_deadlist_compare,
sizeof (dsl_deadlist_entry_t),
offsetof(dsl_deadlist_entry_t, dle_node));
for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
za.za_first_integer));
avl_add(&dl->dl_tree, dle);
}
zap_cursor_fini(&zc);
dl->dl_havetree = B_TRUE;
}
void
dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
{
dmu_object_info_t doi;
+ ASSERT(!dsl_deadlist_is_open(dl));
+
mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
dl->dl_os = os;
dl->dl_object = object;
VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
dmu_object_info_from_db(dl->dl_dbuf, &doi);
if (doi.doi_type == DMU_OT_BPOBJ) {
dmu_buf_rele(dl->dl_dbuf, dl);
dl->dl_dbuf = NULL;
dl->dl_oldfmt = B_TRUE;
VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
return;
}
dl->dl_oldfmt = B_FALSE;
dl->dl_phys = dl->dl_dbuf->db_data;
dl->dl_havetree = B_FALSE;
}
+boolean_t
+dsl_deadlist_is_open(dsl_deadlist_t *dl)
+{
+ return (dl->dl_os != NULL);
+}
+
void
dsl_deadlist_close(dsl_deadlist_t *dl)
{
void *cookie = NULL;
dsl_deadlist_entry_t *dle;
- dl->dl_os = NULL;
+ ASSERT(dsl_deadlist_is_open(dl));
if (dl->dl_oldfmt) {
dl->dl_oldfmt = B_FALSE;
bpobj_close(&dl->dl_bpobj);
+ dl->dl_os = NULL;
+ dl->dl_object = 0;
return;
}
if (dl->dl_havetree) {
while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
!= NULL) {
bpobj_close(&dle->dle_bpobj);
kmem_free(dle, sizeof (*dle));
}
avl_destroy(&dl->dl_tree);
}
dmu_buf_rele(dl->dl_dbuf, dl);
mutex_destroy(&dl->dl_lock);
dl->dl_dbuf = NULL;
dl->dl_phys = NULL;
+ dl->dl_os = NULL;
+ dl->dl_object = 0;
}
uint64_t
dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
{
if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
sizeof (dsl_deadlist_phys_t), tx));
}
void
dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
{
dmu_object_info_t doi;
zap_cursor_t zc;
zap_attribute_t za;
VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
if (doi.doi_type == DMU_OT_BPOBJ) {
bpobj_free(os, dlobj, tx);
return;
}
for (zap_cursor_init(&zc, os, dlobj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
uint64_t obj = za.za_first_integer;
if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
bpobj_decr_empty(os, tx);
else
bpobj_free(os, obj, tx);
}
zap_cursor_fini(&zc);
VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
}
static void
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
const blkptr_t *bp, dmu_tx_t *tx)
{
ASSERT(MUTEX_HELD(&dl->dl_lock));
if (dle->dle_bpobj.bpo_object ==
dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
bpobj_close(&dle->dle_bpobj);
bpobj_decr_empty(dl->dl_os, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
dle->dle_mintxg, obj, tx));
}
bpobj_enqueue(&dle->dle_bpobj, bp, tx);
}
static void
dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
uint64_t obj, dmu_tx_t *tx)
{
ASSERT(MUTEX_HELD(&dl->dl_lock));
if (dle->dle_bpobj.bpo_object !=
dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
} else {
bpobj_close(&dle->dle_bpobj);
bpobj_decr_empty(dl->dl_os, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
dle->dle_mintxg, obj, tx));
}
}
void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
{
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_entry_t *dle;
avl_index_t where;
if (dl->dl_oldfmt) {
bpobj_enqueue(&dl->dl_bpobj, bp, tx);
return;
}
mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
dmu_buf_will_dirty(dl->dl_dbuf, tx);
dl->dl_phys->dl_used +=
bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
dle_tofind.dle_mintxg = bp->blk_birth;
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
else
dle = AVL_PREV(&dl->dl_tree, dle);
dle_enqueue(dl, dle, bp, tx);
mutex_exit(&dl->dl_lock);
}
/*
* Insert new key in deadlist, which must be > all current entries.
* mintxg is not inclusive.
*/
void
dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
uint64_t obj;
dsl_deadlist_entry_t *dle;
if (dl->dl_oldfmt)
return;
dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
dle->dle_mintxg = mintxg;
mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
avl_add(&dl->dl_tree, dle);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
mintxg, obj, tx));
mutex_exit(&dl->dl_lock);
}
/*
* Remove this key, merging its entries into the previous key.
*/
void
dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_entry_t *dle, *dle_prev;
if (dl->dl_oldfmt)
return;
mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
dle_tofind.dle_mintxg = mintxg;
dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
dle_prev = AVL_PREV(&dl->dl_tree, dle);
dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
avl_remove(&dl->dl_tree, dle);
bpobj_close(&dle->dle_bpobj);
kmem_free(dle, sizeof (*dle));
VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
mutex_exit(&dl->dl_lock);
}
/*
* Walk ds's snapshots to regenerate generate ZAP & AVL.
*/
static void
dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
uint64_t mrs_obj, dmu_tx_t *tx)
{
- dsl_deadlist_t dl;
+ dsl_deadlist_t dl = { 0 };
dsl_pool_t *dp = dmu_objset_pool(os);
dsl_deadlist_open(&dl, os, dlobj);
if (dl.dl_oldfmt) {
dsl_deadlist_close(&dl);
return;
}
while (mrs_obj != 0) {
dsl_dataset_t *ds;
VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
dsl_deadlist_add_key(&dl,
dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
dsl_dataset_rele(ds, FTAG);
}
dsl_deadlist_close(&dl);
}
uint64_t
dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
uint64_t mrs_obj, dmu_tx_t *tx)
{
dsl_deadlist_entry_t *dle;
uint64_t newobj;
newobj = dsl_deadlist_alloc(dl->dl_os, tx);
if (dl->dl_oldfmt) {
dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
return (newobj);
}
mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
for (dle = avl_first(&dl->dl_tree); dle;
dle = AVL_NEXT(&dl->dl_tree, dle)) {
uint64_t obj;
if (dle->dle_mintxg >= maxtxg)
break;
obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
dle->dle_mintxg, obj, tx));
}
mutex_exit(&dl->dl_lock);
return (newobj);
}
void
dsl_deadlist_space(dsl_deadlist_t *dl,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
+ ASSERT(dsl_deadlist_is_open(dl));
if (dl->dl_oldfmt) {
VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
usedp, compp, uncompp));
return;
}
mutex_enter(&dl->dl_lock);
*usedp = dl->dl_phys->dl_used;
*compp = dl->dl_phys->dl_comp;
*uncompp = dl->dl_phys->dl_uncomp;
mutex_exit(&dl->dl_lock);
}
/*
* return space used in the range (mintxg, maxtxg].
* Includes maxtxg, does not include mintxg.
* mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
* larger than any bp in the deadlist (eg. UINT64_MAX)).
*/
void
dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
dsl_deadlist_entry_t *dle;
dsl_deadlist_entry_t dle_tofind;
avl_index_t where;
if (dl->dl_oldfmt) {
VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
mintxg, maxtxg, usedp, compp, uncompp));
return;
}
*usedp = *compp = *uncompp = 0;
mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
dle_tofind.dle_mintxg = mintxg;
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
/*
* If we don't find this mintxg, there shouldn't be anything
* after it either.
*/
ASSERT(dle != NULL ||
avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
for (; dle && dle->dle_mintxg < maxtxg;
dle = AVL_NEXT(&dl->dl_tree, dle)) {
uint64_t used, comp, uncomp;
VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
&used, &comp, &uncomp));
*usedp += used;
*compp += comp;
*uncompp += uncomp;
}
mutex_exit(&dl->dl_lock);
}
static void
dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
dmu_tx_t *tx)
{
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_entry_t *dle;
avl_index_t where;
uint64_t used, comp, uncomp;
bpobj_t bpo;
ASSERT(MUTEX_HELD(&dl->dl_lock));
VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
bpobj_close(&bpo);
dsl_deadlist_load_tree(dl);
dmu_buf_will_dirty(dl->dl_dbuf, tx);
dl->dl_phys->dl_used += used;
dl->dl_phys->dl_comp += comp;
dl->dl_phys->dl_uncomp += uncomp;
dle_tofind.dle_mintxg = birth;
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
dle_enqueue_subobj(dl, dle, obj, tx);
}
static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
dsl_deadlist_t *dl = arg;
dsl_deadlist_insert(dl, bp, tx);
return (0);
}
/*
* Merge the deadlist pointed to by 'obj' into dl. obj will be left as
* an empty deadlist.
*/
void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
zap_cursor_t zc;
zap_attribute_t za;
dmu_buf_t *bonus;
dsl_deadlist_phys_t *dlp;
dmu_object_info_t doi;
VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
if (doi.doi_type == DMU_OT_BPOBJ) {
bpobj_t bpo;
VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
VERIFY3U(0, ==, bpobj_iterate(&bpo,
dsl_deadlist_insert_cb, dl, tx));
bpobj_close(&bpo);
return;
}
mutex_enter(&dl->dl_lock);
for (zap_cursor_init(&zc, dl->dl_os, obj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
}
zap_cursor_fini(&zc);
VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
dlp = bonus->db_data;
dmu_buf_will_dirty(bonus, tx);
bzero(dlp, sizeof (*dlp));
dmu_buf_rele(bonus, FTAG);
mutex_exit(&dl->dl_lock);
}
/*
* Remove entries on dl that are >= mintxg, and put them on the bpobj.
*/
void
dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx)
{
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_entry_t *dle;
avl_index_t where;
ASSERT(!dl->dl_oldfmt);
mutex_enter(&dl->dl_lock);
dmu_buf_will_dirty(dl->dl_dbuf, tx);
dsl_deadlist_load_tree(dl);
dle_tofind.dle_mintxg = mintxg;
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
while (dle) {
uint64_t used, comp, uncomp;
dsl_deadlist_entry_t *dle_next;
bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
&used, &comp, &uncomp));
ASSERT3U(dl->dl_phys->dl_used, >=, used);
ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
dl->dl_phys->dl_used -= used;
dl->dl_phys->dl_comp -= comp;
dl->dl_phys->dl_uncomp -= uncomp;
VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
dle->dle_mintxg, tx));
dle_next = AVL_NEXT(&dl->dl_tree, dle);
avl_remove(&dl->dl_tree, dle);
bpobj_close(&dle->dle_bpobj);
kmem_free(dle, sizeof (*dle));
dle = dle_next;
}
mutex_exit(&dl->dl_lock);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c (revision 332525)
@@ -1,1032 +1,1080 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2013 by Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/dsl_userhold.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_destroy.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_objset.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_deleg.h>
#include <sys/dmu_impl.h>
#include <sys/zcp.h>
int
dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
{
if (!ds->ds_is_snapshot)
return (SET_ERROR(EINVAL));
if (dsl_dataset_long_held(ds))
return (SET_ERROR(EBUSY));
/*
* Only allow deferred destroy on pools that support it.
* NOTE: deferred destroy is only supported on snapshots.
*/
if (defer) {
if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
SPA_VERSION_USERREFS)
return (SET_ERROR(ENOTSUP));
return (0);
}
/*
* If this snapshot has an elevated user reference count,
* we can't destroy it yet.
*/
if (ds->ds_userrefs > 0)
return (SET_ERROR(EBUSY));
/*
* Can't delete a branch point.
*/
if (dsl_dataset_phys(ds)->ds_num_children > 1)
return (SET_ERROR(EEXIST));
return (0);
}
int
dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
{
dsl_destroy_snapshot_arg_t *ddsa = arg;
const char *dsname = ddsa->ddsa_name;
boolean_t defer = ddsa->ddsa_defer;
dsl_pool_t *dp = dmu_tx_pool(tx);
int error = 0;
dsl_dataset_t *ds;
error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
/*
* If the snapshot does not exist, silently ignore it, and
* dsl_destroy_snapshot_sync() will be a no-op
* (it's "already destroyed").
*/
if (error == ENOENT)
return (0);
if (error == 0) {
error = dsl_destroy_snapshot_check_impl(ds, defer);
dsl_dataset_rele(ds, FTAG);
}
return (error);
}
struct process_old_arg {
dsl_dataset_t *ds;
dsl_dataset_t *ds_prev;
boolean_t after_branch_point;
zio_t *pio;
uint64_t used, comp, uncomp;
};
static int
process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
struct process_old_arg *poa = arg;
dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
ASSERT(!BP_IS_HOLE(bp));
if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
if (poa->ds_prev && !poa->after_branch_point &&
bp->blk_birth >
dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
bp_get_dsize_sync(dp->dp_spa, bp);
}
} else {
poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
poa->comp += BP_GET_PSIZE(bp);
poa->uncomp += BP_GET_UCSIZE(bp);
dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
}
return (0);
}
static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
struct process_old_arg poa = { 0 };
dsl_pool_t *dp = ds->ds_dir->dd_pool;
objset_t *mos = dp->dp_meta_objset;
uint64_t deadlist_obj;
ASSERT(ds->ds_deadlist.dl_oldfmt);
ASSERT(ds_next->ds_deadlist.dl_oldfmt);
poa.ds = ds;
poa.ds_prev = ds_prev;
poa.after_branch_point = after_branch_point;
poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
process_old_cb, &poa, tx));
VERIFY0(zio_wait(poa.pio));
ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
/* change snapused */
dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-poa.used, -poa.comp, -poa.uncomp, tx);
/* swap next's deadlist to our deadlist */
dsl_deadlist_close(&ds->ds_deadlist);
dsl_deadlist_close(&ds_next->ds_deadlist);
deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
dsl_dataset_phys(ds)->ds_deadlist_obj =
dsl_dataset_phys(ds_next)->ds_deadlist_obj;
dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
dsl_deadlist_open(&ds->ds_deadlist, mos,
dsl_dataset_phys(ds)->ds_deadlist_obj);
dsl_deadlist_open(&ds_next->ds_deadlist, mos,
dsl_dataset_phys(ds_next)->ds_deadlist_obj);
}
static void
dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
zap_cursor_t zc;
zap_attribute_t za;
/*
* If it is the old version, dd_clones doesn't exist so we can't
* find the clones, but dsl_deadlist_remove_key() is a no-op so it
* doesn't matter.
*/
if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
return;
for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
dsl_dataset_t *clone;
VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
za.za_first_integer, FTAG, &clone));
if (clone->ds_dir->dd_origin_txg > mintxg) {
dsl_deadlist_remove_key(&clone->ds_deadlist,
mintxg, tx);
+ if (dsl_dataset_remap_deadlist_exists(clone)) {
+ dsl_deadlist_remove_key(
+ &clone->ds_remap_deadlist, mintxg, tx);
+ }
dsl_dataset_remove_clones_key(clone, mintxg, tx);
}
dsl_dataset_rele(clone, FTAG);
}
zap_cursor_fini(&zc);
}
+static void
+dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* Move blocks to be obsoleted to pool's obsolete list. */
+ if (dsl_dataset_remap_deadlist_exists(ds_next)) {
+ if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
+ dsl_pool_create_obsolete_bpobj(dp, tx);
+
+ dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
+ &dp->dp_obsolete_bpobj,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ }
+
+ /* Merge our deadlist into next's and free it. */
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ uint64_t remap_deadlist_object =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ ASSERT(remap_deadlist_object != 0);
+
+ mutex_enter(&ds_next->ds_remap_deadlist_lock);
+ if (!dsl_dataset_remap_deadlist_exists(ds_next))
+ dsl_dataset_create_remap_deadlist(ds_next, tx);
+ mutex_exit(&ds_next->ds_remap_deadlist_lock);
+
+ dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
+ remap_deadlist_object, tx);
+ dsl_dataset_destroy_remap_deadlist(ds, tx);
+ }
+}
+
void
dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
{
int err;
int after_branch_point = FALSE;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
objset_t *mos = dp->dp_meta_objset;
dsl_dataset_t *ds_prev = NULL;
uint64_t obj;
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(refcount_is_zero(&ds->ds_longholds));
if (defer &&
(ds->ds_userrefs > 0 ||
dsl_dataset_phys(ds)->ds_num_children > 1)) {
ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
return;
}
ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
/* We need to log before removing it from the namespace. */
spa_history_log_internal_ds(ds, "destroy", tx, "");
dsl_scan_ds_destroyed(ds, tx);
obj = ds->ds_object;
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
if (ds->ds_feature_inuse[f]) {
dsl_dataset_deactivate_feature(obj, f, tx);
ds->ds_feature_inuse[f] = B_FALSE;
}
}
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
ASSERT3P(ds->ds_prev, ==, NULL);
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
after_branch_point =
(dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
if (after_branch_point &&
dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
VERIFY0(zap_add_int(mos,
dsl_dataset_phys(ds_prev)->
ds_next_clones_obj,
dsl_dataset_phys(ds)->ds_next_snap_obj,
tx));
}
}
if (!after_branch_point) {
dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
dsl_dataset_phys(ds)->ds_next_snap_obj;
}
}
dsl_dataset_t *ds_next;
uint64_t old_unique;
uint64_t used = 0, comp = 0, uncomp = 0;
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
dsl_dataset_phys(ds)->ds_prev_snap_obj;
dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
dsl_dataset_phys(ds)->ds_prev_snap_txg;
ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
if (ds_next->ds_deadlist.dl_oldfmt) {
process_old_deadlist(ds, ds_prev, ds_next,
after_branch_point, tx);
} else {
/* Adjust prev's unique space. */
if (ds_prev && !after_branch_point) {
dsl_deadlist_space_range(&ds_next->ds_deadlist,
dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
dsl_dataset_phys(ds)->ds_prev_snap_txg,
&used, &comp, &uncomp);
dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
}
/* Adjust snapused. */
dsl_deadlist_space_range(&ds_next->ds_deadlist,
dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
&used, &comp, &uncomp);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-used, -comp, -uncomp, tx);
/* Move blocks to be freed to pool's free list. */
dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
&dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
tx);
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
DD_USED_HEAD, used, comp, uncomp, tx);
/* Merge our deadlist into next's and free it. */
dsl_deadlist_merge(&ds_next->ds_deadlist,
dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
}
+
dsl_deadlist_close(&ds->ds_deadlist);
dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+ dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);
+
/* Collapse range in clone heads */
dsl_dataset_remove_clones_key(ds,
dsl_dataset_phys(ds)->ds_creation_txg, tx);
if (ds_next->ds_is_snapshot) {
dsl_dataset_t *ds_nextnext;
/*
* Update next's unique to include blocks which
* were previously shared by only this snapshot
* and it. Those blocks will be born after the
* prev snap and before this snap, and will have
* died after the next snap and before the one
* after that (ie. be on the snap after next's
* deadlist).
*/
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds_next)->ds_next_snap_obj,
FTAG, &ds_nextnext));
dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
dsl_dataset_phys(ds)->ds_prev_snap_txg,
dsl_dataset_phys(ds)->ds_creation_txg,
&used, &comp, &uncomp);
dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
dsl_dataset_rele(ds_nextnext, FTAG);
ASSERT3P(ds_next->ds_prev, ==, NULL);
/* Collapse range in this head. */
dsl_dataset_t *hds;
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
dsl_deadlist_remove_key(&hds->ds_deadlist,
dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ if (dsl_dataset_remap_deadlist_exists(hds)) {
+ dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ }
dsl_dataset_rele(hds, FTAG);
} else {
ASSERT3P(ds_next->ds_prev, ==, ds);
dsl_dataset_rele(ds_next->ds_prev, ds_next);
ds_next->ds_prev = NULL;
if (ds_prev) {
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj,
ds_next, &ds_next->ds_prev));
}
dsl_dataset_recalc_head_uniq(ds_next);
/*
* Reduce the amount of our unconsumed refreservation
* being charged to our parent by the amount of
* new unique data we have gained.
*/
if (old_unique < ds_next->ds_reserved) {
int64_t mrsdelta;
uint64_t new_unique =
dsl_dataset_phys(ds_next)->ds_unique_bytes;
ASSERT(old_unique <= new_unique);
mrsdelta = MIN(new_unique - old_unique,
ds_next->ds_reserved - old_unique);
dsl_dir_diduse_space(ds->ds_dir,
DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
}
}
dsl_dataset_rele(ds_next, FTAG);
/*
* This must be done after the dsl_traverse(), because it will
* re-open the objset.
*/
if (ds->ds_objset) {
dmu_objset_evict(ds->ds_objset);
ds->ds_objset = NULL;
}
/* remove from snapshot namespace */
dsl_dataset_t *ds_head;
ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
VERIFY0(dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
{
uint64_t val;
err = dsl_dataset_snap_lookup(ds_head,
ds->ds_snapname, &val);
ASSERT0(err);
ASSERT3U(val, ==, obj);
}
#endif
VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
dsl_dataset_rele(ds_head, FTAG);
if (ds_prev != NULL)
dsl_dataset_rele(ds_prev, FTAG);
spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
uint64_t count;
ASSERT0(zap_count(mos,
dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
count == 0);
VERIFY0(dmu_object_free(mos,
dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
}
if (dsl_dataset_phys(ds)->ds_props_obj != 0)
VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
tx));
if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
tx));
dsl_dir_rele(ds->ds_dir, ds);
ds->ds_dir = NULL;
dmu_object_free_zapified(mos, obj, tx);
}
void
dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
{
dsl_destroy_snapshot_arg_t *ddsa = arg;
const char *dsname = ddsa->ddsa_name;
boolean_t defer = ddsa->ddsa_defer;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
if (error == ENOENT)
return;
ASSERT0(error);
dsl_destroy_snapshot_sync_impl(ds, defer, tx);
dsl_dataset_rele(ds, FTAG);
}
/*
* The semantics of this function are described in the comment above
* lzc_destroy_snaps(). To summarize:
*
* The snapshots must all be in the same pool.
*
* Snapshots that don't exist will be silently ignored (considered to be
* "already deleted").
*
* On success, all snaps will be destroyed and this will return 0.
* On failure, no snaps will be destroyed, the errlist will be filled in,
* and this will return an errno.
*/
int
dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
nvlist_t *errlist)
{
if (nvlist_next_nvpair(snaps, NULL) == NULL)
return (0);
/*
* lzc_destroy_snaps() is documented to take an nvlist whose
* values "don't matter". We need to convert that nvlist to
* one that we know can be converted to LUA. We also don't
* care about any duplicate entries because the nvlist will
* be converted to a LUA table which should take care of this.
*/
nvlist_t *snaps_normalized;
VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP));
for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL);
pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) {
fnvlist_add_boolean_value(snaps_normalized,
nvpair_name(pair), B_TRUE);
}
nvlist_t *arg;
VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP));
fnvlist_add_nvlist(arg, "snaps", snaps_normalized);
fnvlist_free(snaps_normalized);
fnvlist_add_boolean_value(arg, "defer", defer);
nvlist_t *wrapper;
VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP));
fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg);
fnvlist_free(arg);
const char *program =
"arg = ...\n"
"snaps = arg['snaps']\n"
"defer = arg['defer']\n"
"errors = { }\n"
"has_errors = false\n"
"for snap, v in pairs(snaps) do\n"
" errno = zfs.check.destroy{snap, defer=defer}\n"
" zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n"
" if errno == ENOENT then\n"
" snaps[snap] = nil\n"
" elseif errno ~= 0 then\n"
" errors[snap] = errno\n"
" has_errors = true\n"
" end\n"
"end\n"
"if has_errors then\n"
" return errors\n"
"end\n"
"for snap, v in pairs(snaps) do\n"
" errno = zfs.sync.destroy{snap, defer=defer}\n"
" assert(errno == 0)\n"
"end\n"
"return { }\n";
nvlist_t *result = fnvlist_alloc();
int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)),
program,
B_TRUE,
0,
zfs_lua_max_memlimit,
nvlist_next_nvpair(wrapper, NULL), result);
if (error != 0) {
char *errorstr = NULL;
(void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr);
if (errorstr != NULL) {
zfs_dbgmsg(errorstr);
}
return (error);
}
fnvlist_free(wrapper);
/*
* lzc_destroy_snaps() is documented to fill the errlist with
* int32 values, so we need to covert the int64 values that are
* returned from LUA.
*/
int rv = 0;
nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN);
for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL);
pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) {
int32_t val = (int32_t)fnvpair_value_int64(pair);
if (rv == 0)
rv = val;
fnvlist_add_int32(errlist, nvpair_name(pair), val);
}
fnvlist_free(result);
return (rv);
}
int
dsl_destroy_snapshot(const char *name, boolean_t defer)
{
int error;
nvlist_t *nvl = fnvlist_alloc();
nvlist_t *errlist = fnvlist_alloc();
fnvlist_add_boolean(nvl, name);
error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
fnvlist_free(errlist);
fnvlist_free(nvl);
return (error);
}
struct killarg {
dsl_dataset_t *ds;
dmu_tx_t *tx;
};
/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx;
if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) {
ASSERT(zilog != NULL);
/*
* It's a block in the intent log. It has no
* accounting, so just free it.
*/
dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
} else {
ASSERT(zilog == NULL);
ASSERT3U(bp->blk_birth, >,
dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
}
return (0);
}
static void
old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
struct killarg ka;
/*
* Free everything that we point to (that's born after
* the previous snapshot, if we are a clone)
*
* NB: this should be very quick, because we already
* freed all the objects in open context.
*/
ka.ds = ds;
ka.tx = tx;
VERIFY0(traverse_dataset(ds,
dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
kill_blkptr, &ka));
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
dsl_dataset_phys(ds)->ds_unique_bytes == 0);
}
int
dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
{
int error;
uint64_t count;
objset_t *mos;
ASSERT(!ds->ds_is_snapshot);
if (ds->ds_is_snapshot)
return (SET_ERROR(EINVAL));
if (refcount_count(&ds->ds_longholds) != expected_holds)
return (SET_ERROR(EBUSY));
mos = ds->ds_dir->dd_pool->dp_meta_objset;
/*
* Can't delete a head dataset if there are snapshots of it.
* (Except if the only snapshots are from the branch we cloned
* from.)
*/
if (ds->ds_prev != NULL &&
dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
return (SET_ERROR(EBUSY));
/*
* Can't delete if there are children of this fs.
*/
error = zap_count(mos,
dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
if (error != 0)
return (error);
if (count != 0)
return (SET_ERROR(EEXIST));
if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
ds->ds_prev->ds_userrefs == 0) {
/* We need to remove the origin snapshot as well. */
if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
return (SET_ERROR(EBUSY));
}
return (0);
}
int
dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
{
dsl_destroy_head_arg_t *ddha = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
int error;
error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
if (error != 0)
return (error);
error = dsl_destroy_head_check_impl(ds, 0);
dsl_dataset_rele(ds, FTAG);
return (error);
}
static void
dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
{
dsl_dir_t *dd;
dsl_pool_t *dp = dmu_tx_pool(tx);
objset_t *mos = dp->dp_meta_objset;
dd_used_t t;
ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
/*
* Decrement the filesystem count for all parent filesystems.
*
* When we receive an incremental stream into a filesystem that already
* exists, a temporary clone is created. We never count this temporary
* clone, whose name begins with a '%'.
*/
if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
dsl_fs_ss_count_adjust(dd->dd_parent, -1,
DD_FIELD_FILESYSTEM_COUNT, tx);
/*
* Remove our reservation. The impl() routine avoids setting the
* actual property, which would require the (already destroyed) ds.
*/
dsl_dir_set_reservation_sync_impl(dd, 0, tx);
ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
ASSERT0(dsl_dir_phys(dd)->dd_reserved);
for (t = 0; t < DD_USED_NUM; t++)
ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
VERIFY0(zap_remove(mos,
dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
dd->dd_myname, tx));
dsl_dir_rele(dd, FTAG);
dmu_object_free_zapified(mos, ddobj, tx);
}
void
dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
{
dsl_pool_t *dp = dmu_tx_pool(tx);
objset_t *mos = dp->dp_meta_objset;
uint64_t obj, ddobj, prevobj = 0;
boolean_t rmorigin;
ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
ASSERT(ds->ds_prev == NULL ||
dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
/* We need to log before removing it from the namespace. */
spa_history_log_internal_ds(ds, "destroy", tx, "");
rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
DS_IS_DEFER_DESTROY(ds->ds_prev) &&
dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
ds->ds_prev->ds_userrefs == 0);
/* Remove our reservation. */
if (ds->ds_reserved != 0) {
dsl_dataset_set_refreservation_sync_impl(ds,
(ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
0, tx);
ASSERT0(ds->ds_reserved);
}
obj = ds->ds_object;
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
if (ds->ds_feature_inuse[f]) {
dsl_dataset_deactivate_feature(obj, f, tx);
ds->ds_feature_inuse[f] = B_FALSE;
}
}
dsl_scan_ds_destroyed(ds, tx);
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
/* This is a clone */
ASSERT(ds->ds_prev != NULL);
ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
obj);
ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
dsl_dataset_remove_from_next_clones(ds->ds_prev,
obj, tx);
}
ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
}
/*
* Destroy the deadlist. Unless it's a clone, the
- * deadlist should be empty. (If it's a clone, it's
- * safe to ignore the deadlist contents.)
+ * deadlist should be empty since the dataset has no snapshots.
+ * (If it's a clone, it's safe to ignore the deadlist contents
+ * since they are still referenced by the origin snapshot.)
*/
dsl_deadlist_close(&ds->ds_deadlist);
dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+ if (dsl_dataset_remap_deadlist_exists(ds))
+ dsl_dataset_destroy_remap_deadlist(ds, tx);
objset_t *os;
VERIFY0(dmu_objset_from_ds(ds, &os));
if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
old_synchronous_dataset_destroy(ds, tx);
} else {
/*
* Move the bptree into the pool's list of trees to
* clean up and update space accounting information.
*/
uint64_t used, comp, uncomp;
zil_destroy_sync(dmu_objset_zil(os), tx);
if (!spa_feature_is_active(dp->dp_spa,
SPA_FEATURE_ASYNC_DESTROY)) {
dsl_scan_t *scn = dp->dp_scan;
spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
tx);
dp->dp_bptree_obj = bptree_alloc(mos, tx);
VERIFY0(zap_add(mos,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
&dp->dp_bptree_obj, tx));
ASSERT(!scn->scn_async_destroying);
scn->scn_async_destroying = B_TRUE;
}
used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
dsl_dataset_phys(ds)->ds_unique_bytes == used);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
bptree_add(mos, dp->dp_bptree_obj,
&dsl_dataset_phys(ds)->ds_bp,
dsl_dataset_phys(ds)->ds_prev_snap_txg,
used, comp, uncomp, tx);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
-used, -comp, -uncomp, tx);
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
used, comp, uncomp, tx);
}
if (ds->ds_prev != NULL) {
if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
VERIFY0(zap_remove_int(mos,
dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
ds->ds_object, tx));
}
prevobj = ds->ds_prev->ds_object;
dsl_dataset_rele(ds->ds_prev, ds);
ds->ds_prev = NULL;
}
/*
* This must be done after the dsl_traverse(), because it will
* re-open the objset.
*/
if (ds->ds_objset) {
dmu_objset_evict(ds->ds_objset);
ds->ds_objset = NULL;
}
/* Erase the link in the dir */
dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
ddobj = ds->ds_dir->dd_object;
ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
VERIFY0(zap_destroy(mos,
dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
if (ds->ds_bookmarks != 0) {
VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
}
spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
dsl_dir_rele(ds->ds_dir, ds);
ds->ds_dir = NULL;
dmu_object_free_zapified(mos, obj, tx);
dsl_dir_destroy_sync(ddobj, tx);
if (rmorigin) {
dsl_dataset_t *prev;
VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
dsl_dataset_rele(prev, FTAG);
}
}
void
dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
{
dsl_destroy_head_arg_t *ddha = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
dsl_destroy_head_sync_impl(ds, tx);
dsl_dataset_rele(ds, FTAG);
}
static void
dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
{
dsl_destroy_head_arg_t *ddha = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
/* Mark it as inconsistent on-disk, in case we crash */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
spa_history_log_internal_ds(ds, "destroy begin", tx, "");
dsl_dataset_rele(ds, FTAG);
}
int
dsl_destroy_head(const char *name)
{
dsl_destroy_head_arg_t ddha;
int error;
spa_t *spa;
boolean_t isenabled;
#ifdef _KERNEL
zfs_destroy_unmount_origin(name);
#endif
error = spa_open(name, &spa, FTAG);
if (error != 0)
return (error);
isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
spa_close(spa, FTAG);
ddha.ddha_name = name;
if (!isenabled) {
objset_t *os;
error = dsl_sync_task(name, dsl_destroy_head_check,
dsl_destroy_head_begin_sync, &ddha,
0, ZFS_SPACE_CHECK_NONE);
if (error != 0)
return (error);
/*
* Head deletion is processed in one txg on old pools;
* remove the objects from open context so that the txg sync
* is not too long.
*/
error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
if (error == 0) {
uint64_t prev_snap_txg =
dsl_dataset_phys(dmu_objset_ds(os))->
ds_prev_snap_txg;
for (uint64_t obj = 0; error == 0;
error = dmu_object_next(os, &obj, FALSE,
prev_snap_txg))
(void) dmu_free_long_object(os, obj);
/* sync out all frees */
txg_wait_synced(dmu_objset_pool(os), 0);
dmu_objset_disown(os, FTAG);
}
}
return (dsl_sync_task(name, dsl_destroy_head_check,
dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
}
/*
* Note, this function is used as the callback for dmu_objset_find(). We
* always return 0 so that we will continue to find and process
* inconsistent datasets, even if we encounter an error trying to
* process one of them.
*/
/* ARGSUSED */
int
dsl_destroy_inconsistent(const char *dsname, void *arg)
{
objset_t *os;
if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
/*
* If the dataset is inconsistent because a resumable receive
* has failed, then do not destroy it.
*/
if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
need_destroy = B_FALSE;
dmu_objset_rele(os, FTAG);
if (need_destroy)
(void) dsl_destroy_head(dsname);
}
return (0);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c (revision 332525)
@@ -1,2076 +1,2126 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
* All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dmu_impl.h>
#include <sys/spa.h>
#include <sys/metaslab.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
#include <sys/zvol.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
#include <sys/zfeature.h>
#include <sys/policy.h>
#include <sys/zfs_znode.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
/*
* Filesystem and Snapshot Limits
* ------------------------------
*
* These limits are used to restrict the number of filesystems and/or snapshots
* that can be created at a given level in the tree or below. A typical
* use-case is with a delegated dataset where the administrator wants to ensure
* that a user within the zone is not creating too many additional filesystems
* or snapshots, even though they're not exceeding their space quota.
*
* The filesystem and snapshot counts are stored as extensible properties. This
* capability is controlled by a feature flag and must be enabled to be used.
* Once enabled, the feature is not active until the first limit is set. At
* that point, future operations to create/destroy filesystems or snapshots
* will validate and update the counts.
*
* Because the count properties will not exist before the feature is active,
* the counts are updated when a limit is first set on an uninitialized
* dsl_dir node in the tree (The filesystem/snapshot count on a node includes
* all of the nested filesystems/snapshots. Thus, a new leaf node has a
* filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
* snapshot count properties on a node indicate uninitialized counts on that
* node.) When first setting a limit on an uninitialized node, the code starts
* at the filesystem with the new limit and descends into all sub-filesystems
* to add the count properties.
*
* In practice this is lightweight since a limit is typically set when the
* filesystem is created and thus has no children. Once valid, changing the
* limit value won't require a re-traversal since the counts are already valid.
* When recursively fixing the counts, if a node with a limit is encountered
* during the descent, the counts are known to be valid and there is no need to
* descend into that filesystem's children. The counts on filesystems above the
* one with the new limit will still be uninitialized, unless a limit is
* eventually set on one of those filesystems. The counts are always recursively
* updated when a limit is set on a dataset, unless there is already a limit.
* When a new limit value is set on a filesystem with an existing limit, it is
* possible for the new limit to be less than the current count at that level
* since a user who can change the limit is also allowed to exceed the limit.
*
* Once the feature is active, then whenever a filesystem or snapshot is
* created, the code recurses up the tree, validating the new count against the
* limit at each initialized level. In practice, most levels will not have a
* limit set. If there is a limit at any initialized level up the tree, the
* check must pass or the creation will fail. Likewise, when a filesystem or
* snapshot is destroyed, the counts are recursively adjusted all the way up
* the initizized nodes in the tree. Renaming a filesystem into different point
* in the tree will first validate, then update the counts on each branch up to
* the common ancestor. A receive will also validate the counts and then update
* them.
*
* An exception to the above behavior is that the limit is not enforced if the
* user has permission to modify the limit. This is primarily so that
* recursive snapshots in the global zone always work. We want to prevent a
* denial-of-service in which a lower level delegated dataset could max out its
* limit and thus block recursive snapshots from being taken in the global zone.
* Because of this, it is possible for the snapshot count to be over the limit
* and snapshots taken in the global zone could cause a lower level dataset to
* hit or exceed its limit. The administrator taking the global zone recursive
* snapshot should be aware of this side-effect and behave accordingly.
* For consistency, the filesystem limit is also not enforced if the user can
* modify the limit.
*
* The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
* and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
* dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
* dsl_dir_init_fs_ss_count().
*
* There is a special case when we receive a filesystem that already exists. In
* this case a temporary clone name of %X is created (see dmu_recv_begin). We
* never update the filesystem counts for temporary clones.
*
* Likewise, we do not update the snapshot counts for temporary snapshots,
* such as those created by zfs diff.
*/
extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
+typedef struct ddulrt_arg {
+ dsl_dir_t *ddulrta_dd;
+ uint64_t ddlrta_txg;
+} ddulrt_arg_t;
+
static void
dsl_dir_evict_async(void *dbu)
{
dsl_dir_t *dd = dbu;
dsl_pool_t *dp = dd->dd_pool;
int t;
dd->dd_dbuf = NULL;
for (t = 0; t < TXG_SIZE; t++) {
ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
ASSERT(dd->dd_tempreserved[t] == 0);
ASSERT(dd->dd_space_towrite[t] == 0);
}
if (dd->dd_parent)
dsl_dir_async_rele(dd->dd_parent, dd);
spa_async_close(dd->dd_pool->dp_spa, dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
}
int
dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
const char *tail, void *tag, dsl_dir_t **ddp)
{
dmu_buf_t *dbuf;
dsl_dir_t *dd;
int err;
ASSERT(dsl_pool_config_held(dp));
err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
if (err != 0)
return (err);
dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
{
dmu_object_info_t doi;
dmu_object_info_from_db(dbuf, &doi);
ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
}
#endif
if (dd == NULL) {
dsl_dir_t *winner;
dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
dd->dd_object = ddobj;
dd->dd_dbuf = dbuf;
dd->dd_pool = dp;
mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
dsl_prop_init(dd);
dsl_dir_snap_cmtime_update(dd);
if (dsl_dir_phys(dd)->dd_parent_obj) {
err = dsl_dir_hold_obj(dp,
dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
&dd->dd_parent);
if (err != 0)
goto errout;
if (tail) {
#ifdef ZFS_DEBUG
uint64_t foundobj;
err = zap_lookup(dp->dp_meta_objset,
dsl_dir_phys(dd->dd_parent)->
dd_child_dir_zapobj, tail,
sizeof (foundobj), 1, &foundobj);
ASSERT(err || foundobj == ddobj);
#endif
(void) strcpy(dd->dd_myname, tail);
} else {
err = zap_value_search(dp->dp_meta_objset,
dsl_dir_phys(dd->dd_parent)->
dd_child_dir_zapobj,
ddobj, 0, dd->dd_myname);
}
if (err != 0)
goto errout;
} else {
(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
}
if (dsl_dir_is_clone(dd)) {
dmu_buf_t *origin_bonus;
dsl_dataset_phys_t *origin_phys;
/*
* We can't open the origin dataset, because
* that would require opening this dsl_dir.
* Just look at its phys directly instead.
*/
err = dmu_bonus_hold(dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_origin_obj, FTAG,
&origin_bonus);
if (err != 0)
goto errout;
origin_phys = origin_bonus->db_data;
dd->dd_origin_txg =
origin_phys->ds_creation_txg;
dmu_buf_rele(origin_bonus, FTAG);
}
dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
&dd->dd_dbuf);
winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
if (winner != NULL) {
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
dd = winner;
} else {
spa_open_ref(dp->dp_spa, dd);
}
}
/*
* The dsl_dir_t has both open-to-close and instantiate-to-evict
* holds on the spa. We need the open-to-close holds because
* otherwise the spa_refcnt wouldn't change when we open a
* dir which the spa also has open, so we could incorrectly
* think it was OK to unload/export/destroy the pool. We need
* the instantiate-to-evict hold because the dsl_dir_t has a
* pointer to the dd_pool, which has a pointer to the spa_t.
*/
spa_open_ref(dp->dp_spa, tag);
ASSERT3P(dd->dd_pool, ==, dp);
ASSERT3U(dd->dd_object, ==, ddobj);
ASSERT3P(dd->dd_dbuf, ==, dbuf);
*ddp = dd;
return (0);
errout:
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
dmu_buf_rele(dbuf, tag);
return (err);
}
void
dsl_dir_rele(dsl_dir_t *dd, void *tag)
{
dprintf_dd(dd, "%s\n", "");
spa_close(dd->dd_pool->dp_spa, tag);
dmu_buf_rele(dd->dd_dbuf, tag);
}
/*
* Remove a reference to the given dsl dir that is being asynchronously
* released. Async releases occur from a taskq performing eviction of
* dsl datasets and dirs. This process is identical to a normal release
* with the exception of using the async API for releasing the reference on
* the spa.
*/
void
dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
{
dprintf_dd(dd, "%s\n", "");
spa_async_close(dd->dd_pool->dp_spa, tag);
dmu_buf_rele(dd->dd_dbuf, tag);
}
/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
void
dsl_dir_name(dsl_dir_t *dd, char *buf)
{
if (dd->dd_parent) {
dsl_dir_name(dd->dd_parent, buf);
VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
ZFS_MAX_DATASET_NAME_LEN);
} else {
buf[0] = '\0';
}
if (!MUTEX_HELD(&dd->dd_lock)) {
/*
* recursive mutex so that we can use
* dprintf_dd() with dd_lock held
*/
mutex_enter(&dd->dd_lock);
VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
<, ZFS_MAX_DATASET_NAME_LEN);
mutex_exit(&dd->dd_lock);
} else {
VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
<, ZFS_MAX_DATASET_NAME_LEN);
}
}
/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
int
dsl_dir_namelen(dsl_dir_t *dd)
{
int result = 0;
if (dd->dd_parent) {
/* parent's name + 1 for the "/" */
result = dsl_dir_namelen(dd->dd_parent) + 1;
}
if (!MUTEX_HELD(&dd->dd_lock)) {
/* see dsl_dir_name */
mutex_enter(&dd->dd_lock);
result += strlen(dd->dd_myname);
mutex_exit(&dd->dd_lock);
} else {
result += strlen(dd->dd_myname);
}
return (result);
}
static int
getcomponent(const char *path, char *component, const char **nextp)
{
char *p;
if ((path == NULL) || (path[0] == '\0'))
return (SET_ERROR(ENOENT));
/* This would be a good place to reserve some namespace... */
p = strpbrk(path, "/@");
if (p && (p[1] == '/' || p[1] == '@')) {
/* two separators in a row */
return (SET_ERROR(EINVAL));
}
if (p == NULL || p == path) {
/*
* if the first thing is an @ or /, it had better be an
* @ and it had better not have any more ats or slashes,
* and it had better have something after the @.
*/
if (p != NULL &&
(p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
return (SET_ERROR(EINVAL));
if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
(void) strcpy(component, path);
p = NULL;
} else if (p[0] == '/') {
if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
(void) strncpy(component, path, p - path);
component[p - path] = '\0';
p++;
} else if (p[0] == '@') {
/*
* if the next separator is an @, there better not be
* any more slashes.
*/
if (strchr(path, '/'))
return (SET_ERROR(EINVAL));
if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
(void) strncpy(component, path, p - path);
component[p - path] = '\0';
} else {
panic("invalid p=%p", (void *)p);
}
*nextp = p;
return (0);
}
/*
* Return the dsl_dir_t, and possibly the last component which couldn't
* be found in *tail. The name must be in the specified dsl_pool_t. This
* thread must hold the dp_config_rwlock for the pool. Returns NULL if the
* path is bogus, or if tail==NULL and we couldn't parse the whole name.
* (*tail)[0] == '@' means that the last component is a snapshot.
*/
int
dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
dsl_dir_t **ddp, const char **tailp)
{
char buf[ZFS_MAX_DATASET_NAME_LEN];
const char *spaname, *next, *nextnext = NULL;
int err;
dsl_dir_t *dd;
uint64_t ddobj;
err = getcomponent(name, buf, &next);
if (err != 0)
return (err);
/* Make sure the name is in the specified pool. */
spaname = spa_name(dp->dp_spa);
if (strcmp(buf, spaname) != 0)
return (SET_ERROR(EXDEV));
ASSERT(dsl_pool_config_held(dp));
err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
if (err != 0) {
return (err);
}
while (next != NULL) {
dsl_dir_t *child_dd;
err = getcomponent(next, buf, &nextnext);
if (err != 0)
break;
ASSERT(next[0] != '\0');
if (next[0] == '@')
break;
dprintf("looking up %s in obj%lld\n",
buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
err = zap_lookup(dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_child_dir_zapobj,
buf, sizeof (ddobj), 1, &ddobj);
if (err != 0) {
if (err == ENOENT)
err = 0;
break;
}
err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
if (err != 0)
break;
dsl_dir_rele(dd, tag);
dd = child_dd;
next = nextnext;
}
if (err != 0) {
dsl_dir_rele(dd, tag);
return (err);
}
/*
* It's an error if there's more than one component left, or
* tailp==NULL and there's any component left.
*/
if (next != NULL &&
(tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
/* bad path name */
dsl_dir_rele(dd, tag);
dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
err = SET_ERROR(ENOENT);
}
if (tailp != NULL)
*tailp = next;
*ddp = dd;
return (err);
}
/*
* If the counts are already initialized for this filesystem and its
* descendants then do nothing, otherwise initialize the counts.
*
* The counts on this filesystem, and those below, may be uninitialized due to
* either the use of a pre-existing pool which did not support the
* filesystem/snapshot limit feature, or one in which the feature had not yet
* been enabled.
*
* Recursively descend the filesystem tree and update the filesystem/snapshot
* counts on each filesystem below, then update the cumulative count on the
* current filesystem. If the filesystem already has a count set on it,
* then we know that its counts, and the counts on the filesystems below it,
* are already correct, so we don't have to update this filesystem.
*/
static void
dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
{
uint64_t my_fs_cnt = 0;
uint64_t my_ss_cnt = 0;
dsl_pool_t *dp = dd->dd_pool;
objset_t *os = dp->dp_meta_objset;
zap_cursor_t *zc;
zap_attribute_t *za;
dsl_dataset_t *ds;
ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
ASSERT(dsl_pool_config_held(dp));
ASSERT(dmu_tx_is_syncing(tx));
dsl_dir_zapify(dd, tx);
/*
* If the filesystem count has already been initialized then we
* don't need to recurse down any further.
*/
if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
return;
zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
/* Iterate my child dirs */
for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
dsl_dir_t *chld_dd;
uint64_t count;
VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
&chld_dd));
/*
* Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
* temporary datasets.
*/
if (chld_dd->dd_myname[0] == '$' ||
chld_dd->dd_myname[0] == '%') {
dsl_dir_rele(chld_dd, FTAG);
continue;
}
my_fs_cnt++; /* count this child */
dsl_dir_init_fs_ss_count(chld_dd, tx);
VERIFY0(zap_lookup(os, chld_dd->dd_object,
DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
my_fs_cnt += count;
VERIFY0(zap_lookup(os, chld_dd->dd_object,
DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
my_ss_cnt += count;
dsl_dir_rele(chld_dd, FTAG);
}
zap_cursor_fini(zc);
/* Count my snapshots (we counted children's snapshots above) */
VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
zap_cursor_retrieve(zc, za) == 0;
zap_cursor_advance(zc)) {
/* Don't count temporary snapshots */
if (za->za_name[0] != '%')
my_ss_cnt++;
}
zap_cursor_fini(zc);
dsl_dataset_rele(ds, FTAG);
kmem_free(zc, sizeof (zap_cursor_t));
kmem_free(za, sizeof (zap_attribute_t));
/* we're in a sync task, update counts */
dmu_buf_will_dirty(dd->dd_dbuf, tx);
VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
}
static int
dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
{
char *ddname = (char *)arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
dsl_dir_t *dd;
int error;
error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
if (error != 0)
return (error);
if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENOTSUP));
}
dd = ds->ds_dir;
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
dsl_dir_is_zapified(dd) &&
zap_contains(dp->dp_meta_objset, dd->dd_object,
DD_FIELD_FILESYSTEM_COUNT) == 0) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EALREADY));
}
dsl_dataset_rele(ds, FTAG);
return (0);
}
static void
dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
{
char *ddname = (char *)arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
spa_t *spa;
VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
spa = dsl_dataset_get_spa(ds);
if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
/*
* Since the feature was not active and we're now setting a
* limit, increment the feature-active counter so that the
* feature becomes active for the first time.
*
* We are already in a sync task so we can update the MOS.
*/
spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
}
/*
* Since we are now setting a non-UINT64_MAX limit on the filesystem,
* we need to ensure the counts are correct. Descend down the tree from
* this point and update all of the counts to be accurate.
*/
dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
dsl_dataset_rele(ds, FTAG);
}
/*
* Make sure the feature is enabled and activate it if necessary.
* Since we're setting a limit, ensure the on-disk counts are valid.
* This is only called by the ioctl path when setting a limit value.
*
* We do not need to validate the new limit, since users who can change the
* limit are also allowed to exceed the limit.
*/
int
dsl_dir_activate_fs_ss_limit(const char *ddname)
{
int error;
error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
ZFS_SPACE_CHECK_RESERVED);
if (error == EALREADY)
error = 0;
return (error);
}
/*
* Used to determine if the filesystem_limit or snapshot_limit should be
* enforced. We allow the limit to be exceeded if the user has permission to
* write the property value. We pass in the creds that we got in the open
* context since we will always be the GZ root in syncing context. We also have
* to handle the case where we are allowed to change the limit on the current
* dataset, but there may be another limit in the tree above.
*
* We can never modify these two properties within a non-global zone. In
* addition, the other checks are modeled on zfs_secpolicy_write_perms. We
* can't use that function since we are already holding the dp_config_rwlock.
* In addition, we already have the dd and dealing with snapshots is simplified
* in this code.
*/
typedef enum {
ENFORCE_ALWAYS,
ENFORCE_NEVER,
ENFORCE_ABOVE
} enforce_res_t;
static enforce_res_t
dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
{
enforce_res_t enforce = ENFORCE_ALWAYS;
uint64_t obj;
dsl_dataset_t *ds;
uint64_t zoned;
ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
prop == ZFS_PROP_SNAPSHOT_LIMIT);
#ifdef _KERNEL
#ifdef __FreeBSD__
if (jailed(cr))
#else
if (crgetzoneid(cr) != GLOBAL_ZONEID)
#endif
return (ENFORCE_ALWAYS);
if (secpolicy_zfs(cr) == 0)
return (ENFORCE_NEVER);
#endif
if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
return (ENFORCE_ALWAYS);
ASSERT(dsl_pool_config_held(dd->dd_pool));
if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
return (ENFORCE_ALWAYS);
if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
/* Only root can access zoned fs's from the GZ */
enforce = ENFORCE_ALWAYS;
} else {
if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
enforce = ENFORCE_ABOVE;
}
dsl_dataset_rele(ds, FTAG);
return (enforce);
}
+static void
+dsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx)
+{
+ ddulrt_arg_t *arg = varg;
+ uint64_t last_remap_txg;
+ dsl_dir_t *dd = arg->ddulrta_dd;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+
+ dsl_dir_zapify(dd, tx);
+ if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+ sizeof (last_remap_txg), 1, &last_remap_txg) != 0 ||
+ last_remap_txg < arg->ddlrta_txg) {
+ VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+ sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx));
+ }
+}
+
+int
+dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg)
+{
+ ddulrt_arg_t arg;
+ arg.ddulrta_dd = dd;
+ arg.ddlrta_txg = txg;
+
+ return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa),
+ NULL, dsl_dir_update_last_remap_txg_sync, &arg,
+ 1, ZFS_SPACE_CHECK_RESERVED));
+}
+
/*
* Check if adding additional child filesystem(s) would exceed any filesystem
* limits or adding additional snapshot(s) would exceed any snapshot limits.
* The prop argument indicates which limit to check.
*
* Note that all filesystem limits up to the root (or the highest
* initialized) filesystem or the given ancestor must be satisfied.
*/
int
dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
dsl_dir_t *ancestor, cred_t *cr)
{
objset_t *os = dd->dd_pool->dp_meta_objset;
uint64_t limit, count;
char *count_prop;
enforce_res_t enforce;
int err = 0;
ASSERT(dsl_pool_config_held(dd->dd_pool));
ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
prop == ZFS_PROP_SNAPSHOT_LIMIT);
/*
* If we're allowed to change the limit, don't enforce the limit
* e.g. this can happen if a snapshot is taken by an administrative
* user in the global zone (i.e. a recursive snapshot by root).
* However, we must handle the case of delegated permissions where we
* are allowed to change the limit on the current dataset, but there
* is another limit in the tree above.
*/
enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
if (enforce == ENFORCE_NEVER)
return (0);
/*
* e.g. if renaming a dataset with no snapshots, count adjustment
* is 0.
*/
if (delta == 0)
return (0);
if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
/*
* We don't enforce the limit for temporary snapshots. This is
* indicated by a NULL cred_t argument.
*/
if (cr == NULL)
return (0);
count_prop = DD_FIELD_SNAPSHOT_COUNT;
} else {
count_prop = DD_FIELD_FILESYSTEM_COUNT;
}
/*
* If an ancestor has been provided, stop checking the limit once we
* hit that dir. We need this during rename so that we don't overcount
* the check once we recurse up to the common ancestor.
*/
if (ancestor == dd)
return (0);
/*
* If we hit an uninitialized node while recursing up the tree, we can
* stop since we know there is no limit here (or above). The counts are
* not valid on this node and we know we won't touch this node's counts.
*/
if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
count_prop, sizeof (count), 1, &count) == ENOENT)
return (0);
err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
B_FALSE);
if (err != 0)
return (err);
/* Is there a limit which we've hit? */
if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
return (SET_ERROR(EDQUOT));
if (dd->dd_parent != NULL)
err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
ancestor, cr);
return (err);
}
/*
* Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
* parents. When a new filesystem/snapshot is created, increment the count on
* all parents, and when a filesystem/snapshot is destroyed, decrement the
* count.
*/
void
dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
dmu_tx_t *tx)
{
int err;
objset_t *os = dd->dd_pool->dp_meta_objset;
uint64_t count;
ASSERT(dsl_pool_config_held(dd->dd_pool));
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
/*
* When we receive an incremental stream into a filesystem that already
* exists, a temporary clone is created. We don't count this temporary
* clone, whose name begins with a '%'. We also ignore hidden ($FREE,
* $MOS & $ORIGIN) objsets.
*/
if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
return;
/*
* e.g. if renaming a dataset with no snapshots, count adjustment is 0
*/
if (delta == 0)
return;
/*
* If we hit an uninitialized node while recursing up the tree, we can
* stop since we know the counts are not valid on this node and we
* know we shouldn't touch this node's counts. An uninitialized count
* on the node indicates that either the feature has not yet been
* activated or there are no limits on this part of the tree.
*/
if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
prop, sizeof (count), 1, &count)) == ENOENT)
return;
VERIFY0(err);
count += delta;
/* Use a signed verify to make sure we're not neg. */
VERIFY3S(count, >=, 0);
VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
tx));
/* Roll up this additional count into our ancestors */
if (dd->dd_parent != NULL)
dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
}
uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
dmu_tx_t *tx)
{
objset_t *mos = dp->dp_meta_objset;
uint64_t ddobj;
dsl_dir_phys_t *ddphys;
dmu_buf_t *dbuf;
ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
if (pds) {
VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
name, sizeof (uint64_t), 1, &ddobj, tx));
} else {
/* it's the root dir */
VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
}
VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
ddphys = dbuf->db_data;
ddphys->dd_creation_time = gethrestime_sec();
if (pds) {
ddphys->dd_parent_obj = pds->dd_object;
/* update the filesystem counts */
dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
}
ddphys->dd_props_zapobj = zap_create(mos,
DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
ddphys->dd_child_dir_zapobj = zap_create(mos,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
dmu_buf_rele(dbuf, FTAG);
return (ddobj);
}
boolean_t
dsl_dir_is_clone(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_origin_obj &&
(dd->dd_pool->dp_origin_snap == NULL ||
dsl_dir_phys(dd)->dd_origin_obj !=
dd->dd_pool->dp_origin_snap->ds_object));
}
uint64_t
dsl_dir_get_used(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_used_bytes);
}
uint64_t
dsl_dir_get_quota(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_quota);
}
uint64_t
dsl_dir_get_reservation(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_reserved);
}
uint64_t
dsl_dir_get_compressratio(dsl_dir_t *dd)
{
/* a fixed point number, 100x the ratio */
return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
(dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
dsl_dir_phys(dd)->dd_compressed_bytes));
}
uint64_t
dsl_dir_get_logicalused(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
}
uint64_t
dsl_dir_get_usedsnap(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
}
uint64_t
dsl_dir_get_usedds(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
}
uint64_t
dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
}
uint64_t
dsl_dir_get_usedchild(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
}
void
dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
{
dsl_dataset_t *ds;
VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
dsl_dataset_name(ds, buf);
dsl_dataset_rele(ds, FTAG);
}
int
dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
{
if (dsl_dir_is_zapified(dd)) {
objset_t *os = dd->dd_pool->dp_meta_objset;
return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
sizeof (*count), 1, count));
} else {
return (ENOENT);
}
}
int
dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
{
if (dsl_dir_is_zapified(dd)) {
objset_t *os = dd->dd_pool->dp_meta_objset;
return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
sizeof (*count), 1, count));
} else {
return (ENOENT);
}
}
+int
+dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count)
+{
+ if (dsl_dir_is_zapified(dd)) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+ sizeof (*count), 1, count));
+ } else {
+ return (ENOENT);
+ }
+}
+
void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
mutex_enter(&dd->dd_lock);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
dsl_dir_get_quota(dd));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
dsl_dir_get_reservation(dd));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
dsl_dir_get_logicalused(dd));
if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
dsl_dir_get_usedsnap(dd));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
dsl_dir_get_usedds(dd));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
dsl_dir_get_usedrefreserv(dd));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
dsl_dir_get_usedchild(dd));
}
mutex_exit(&dd->dd_lock);
uint64_t count;
if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
count);
}
if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
+ count);
+ }
+ if (dsl_dir_get_remaptxg(dd, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG,
count);
}
if (dsl_dir_is_clone(dd)) {
char buf[ZFS_MAX_DATASET_NAME_LEN];
dsl_dir_get_origin(dd, buf);
dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
}
}
void
dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
{
dsl_pool_t *dp = dd->dd_pool;
ASSERT(dsl_dir_phys(dd));
if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
/* up the hold count until we can be written out */
dmu_buf_add_ref(dd->dd_dbuf, dd);
}
}
static int64_t
parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
{
uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
uint64_t new_accounted =
MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
return (new_accounted - old_accounted);
}
void
dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
mutex_enter(&dd->dd_lock);
ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
mutex_exit(&dd->dd_lock);
/* release the hold from dsl_dir_dirty */
dmu_buf_rele(dd->dd_dbuf, dd);
}
static uint64_t
dsl_dir_space_towrite(dsl_dir_t *dd)
{
uint64_t space = 0;
ASSERT(MUTEX_HELD(&dd->dd_lock));
for (int i = 0; i < TXG_SIZE; i++) {
space += dd->dd_space_towrite[i & TXG_MASK];
ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
}
return (space);
}
/*
* How much space would dd have available if ancestor had delta applied
* to it? If ondiskonly is set, we're only interested in what's
* on-disk, not estimated pending changes.
*/
uint64_t
dsl_dir_space_available(dsl_dir_t *dd,
dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
{
uint64_t parentspace, myspace, quota, used;
/*
* If there are no restrictions otherwise, assume we have
* unlimited space available.
*/
quota = UINT64_MAX;
parentspace = UINT64_MAX;
if (dd->dd_parent != NULL) {
parentspace = dsl_dir_space_available(dd->dd_parent,
ancestor, delta, ondiskonly);
}
mutex_enter(&dd->dd_lock);
if (dsl_dir_phys(dd)->dd_quota != 0)
quota = dsl_dir_phys(dd)->dd_quota;
used = dsl_dir_phys(dd)->dd_used_bytes;
if (!ondiskonly)
used += dsl_dir_space_towrite(dd);
if (dd->dd_parent == NULL) {
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
quota = MIN(quota, poolsize);
}
if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
/*
* We have some space reserved, in addition to what our
* parent gave us.
*/
parentspace += dsl_dir_phys(dd)->dd_reserved - used;
}
if (dd == ancestor) {
ASSERT(delta <= 0);
ASSERT(used >= -delta);
used += delta;
if (parentspace != UINT64_MAX)
parentspace -= delta;
}
if (used > quota) {
/* over quota */
myspace = 0;
} else {
/*
* the lesser of the space provided by our parent and
* the space left in our quota
*/
myspace = MIN(parentspace, quota - used);
}
mutex_exit(&dd->dd_lock);
return (myspace);
}
struct tempreserve {
list_node_t tr_node;
dsl_dir_t *tr_ds;
uint64_t tr_size;
};
static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
boolean_t ignorequota, list_t *tr_list,
dmu_tx_t *tx, boolean_t first)
{
uint64_t txg = tx->tx_txg;
uint64_t quota;
struct tempreserve *tr;
int retval = EDQUOT;
uint64_t ref_rsrv = 0;
ASSERT3U(txg, !=, 0);
ASSERT3S(asize, >, 0);
mutex_enter(&dd->dd_lock);
/*
* Check against the dsl_dir's quota. We don't add in the delta
* when checking for over-quota because they get one free hit.
*/
uint64_t est_inflight = dsl_dir_space_towrite(dd);
for (int i = 0; i < TXG_SIZE; i++)
est_inflight += dd->dd_tempreserved[i];
uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
/*
* On the first iteration, fetch the dataset's used-on-disk and
* refreservation values. Also, if checkrefquota is set, test if
* allocating this space would exceed the dataset's refquota.
*/
if (first && tx->tx_objset) {
int error;
dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
error = dsl_dataset_check_quota(ds, !netfree,
asize, est_inflight, &used_on_disk, &ref_rsrv);
if (error != 0) {
mutex_exit(&dd->dd_lock);
return (error);
}
}
/*
* If this transaction will result in a net free of space,
* we want to let it through.
*/
if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
quota = UINT64_MAX;
else
quota = dsl_dir_phys(dd)->dd_quota;
/*
* Adjust the quota against the actual pool size at the root
* minus any outstanding deferred frees.
* To ensure that it's possible to remove files from a full
* pool without inducing transient overcommits, we throttle
* netfree transactions against a quota that is slightly larger,
* but still within the pool's allocation slop. In cases where
* we're very close to full, this will allow a steady trickle of
* removes to get through.
*/
uint64_t deferred = 0;
if (dd->dd_parent == NULL) {
spa_t *spa = dd->dd_pool->dp_spa;
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
deferred = metaslab_class_get_deferred(spa_normal_class(spa));
if (poolsize - deferred < quota) {
quota = poolsize - deferred;
retval = ENOSPC;
}
}
/*
* If they are requesting more space, and our current estimate
* is over quota, they get to try again unless the actual
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
if (used_on_disk + est_inflight >= quota) {
if (est_inflight > 0 || used_on_disk < quota ||
(retval == ENOSPC && used_on_disk < quota + deferred))
retval = ERESTART;
dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
"quota=%lluK tr=%lluK err=%d\n",
used_on_disk>>10, est_inflight>>10,
quota>>10, asize>>10, retval);
mutex_exit(&dd->dd_lock);
return (SET_ERROR(retval));
}
/* We need to up our estimated delta before dropping dd_lock */
dd->dd_tempreserved[txg & TXG_MASK] += asize;
uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
asize - ref_rsrv);
mutex_exit(&dd->dd_lock);
tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
tr->tr_ds = dd;
tr->tr_size = asize;
list_insert_tail(tr_list, tr);
/* see if it's OK with our parent */
if (dd->dd_parent != NULL && parent_rsrv != 0) {
boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
return (dsl_dir_tempreserve_impl(dd->dd_parent,
parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE));
} else {
return (0);
}
}
/*
* Reserve space in this dsl_dir, to be used in this tx's txg.
* After the space has been dirtied (and dsl_dir_willuse_space()
* has been called), the reservation should be canceled, using
* dsl_dir_tempreserve_clear().
*/
int
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
{
int err;
list_t *tr_list;
if (asize == 0) {
*tr_cookiep = NULL;
return (0);
}
tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
list_create(tr_list, sizeof (struct tempreserve),
offsetof(struct tempreserve, tr_node));
ASSERT3S(asize, >, 0);
err = arc_tempreserve_space(lsize, tx->tx_txg);
if (err == 0) {
struct tempreserve *tr;
tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
tr->tr_size = lsize;
list_insert_tail(tr_list, tr);
} else {
if (err == EAGAIN) {
/*
* If arc_memory_throttle() detected that pageout
* is running and we are low on memory, we delay new
* non-pageout transactions to give pageout an
* advantage.
*
* It is unfortunate to be delaying while the caller's
* locks are held.
*/
txg_delay(dd->dd_pool, tx->tx_txg,
MSEC2NSEC(10), MSEC2NSEC(10));
err = SET_ERROR(ERESTART);
}
}
if (err == 0) {
err = dsl_dir_tempreserve_impl(dd, asize, netfree,
B_FALSE, tr_list, tx, B_TRUE);
}
if (err != 0)
dsl_dir_tempreserve_clear(tr_list, tx);
else
*tr_cookiep = tr_list;
return (err);
}
/*
* Clear a temporary reservation that we previously made with
* dsl_dir_tempreserve_space().
*/
void
dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
{
int txgidx = tx->tx_txg & TXG_MASK;
list_t *tr_list = tr_cookie;
struct tempreserve *tr;
ASSERT3U(tx->tx_txg, !=, 0);
if (tr_cookie == NULL)
return;
while ((tr = list_head(tr_list)) != NULL) {
if (tr->tr_ds) {
mutex_enter(&tr->tr_ds->dd_lock);
ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
tr->tr_size);
tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
mutex_exit(&tr->tr_ds->dd_lock);
} else {
arc_tempreserve_clear(tr->tr_size);
}
list_remove(tr_list, tr);
kmem_free(tr, sizeof (struct tempreserve));
}
kmem_free(tr_list, sizeof (list_t));
}
/*
* This should be called from open context when we think we're going to write
* or free space, for example when dirtying data. Be conservative; it's okay
* to write less space or free more, but we don't want to write more or free
* less than the amount specified.
*/
void
dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
int64_t parent_space;
uint64_t est_used;
mutex_enter(&dd->dd_lock);
if (space > 0)
dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
parent_space = parent_delta(dd, est_used, space);
mutex_exit(&dd->dd_lock);
/* Make sure that we clean up dd_space_to* */
dsl_dir_dirty(dd, tx);
/* XXX this is potentially expensive and unnecessary... */
if (parent_space && dd->dd_parent)
dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
}
/* call from syncing context when we actually write/free space for this dd */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
int64_t accounted_delta;
/*
* dsl_dataset_set_refreservation_sync_impl() calls this with
* dd_lock held, so that it can atomically update
* ds->ds_reserved and the dsl_dir accounting, so that
* dsl_dataset_check_quota() can see dataset and dir accounting
* consistently.
*/
boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(type < DD_USED_NUM);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
if (needlock)
mutex_enter(&dd->dd_lock);
accounted_delta =
parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
ASSERT(compressed >= 0 ||
dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
ASSERT(uncompressed >= 0 ||
dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
dsl_dir_phys(dd)->dd_used_bytes += used;
dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
ASSERT(used > 0 ||
dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
#ifdef DEBUG
dd_used_t t;
uint64_t u = 0;
for (t = 0; t < DD_USED_NUM; t++)
u += dsl_dir_phys(dd)->dd_used_breakdown[t];
ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
#endif
}
if (needlock)
mutex_exit(&dd->dd_lock);
if (dd->dd_parent != NULL) {
dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
accounted_delta, compressed, uncompressed, tx);
dsl_dir_transfer_space(dd->dd_parent,
used - accounted_delta,
DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL);
}
}
void
dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
ASSERT(tx == NULL || dmu_tx_is_syncing(tx));
ASSERT(oldtype < DD_USED_NUM);
ASSERT(newtype < DD_USED_NUM);
if (delta == 0 ||
!(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
return;
if (tx != NULL)
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
ASSERT(delta > 0 ?
dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
mutex_exit(&dd->dd_lock);
}
typedef struct dsl_dir_set_qr_arg {
const char *ddsqra_name;
zprop_source_t ddsqra_source;
uint64_t ddsqra_value;
} dsl_dir_set_qr_arg_t;
static int
dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
{
dsl_dir_set_qr_arg_t *ddsqra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
int error;
uint64_t towrite, newval;
error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
if (error != 0)
return (error);
error = dsl_prop_predict(ds->ds_dir, "quota",
ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
}
if (newval == 0) {
dsl_dataset_rele(ds, FTAG);
return (0);
}
mutex_enter(&ds->ds_dir->dd_lock);
/*
* If we are doing the preliminary check in open context, and
* there are pending changes, then don't fail it, since the
* pending changes could under-estimate the amount of space to be
* freed up.
*/
towrite = dsl_dir_space_towrite(ds->ds_dir);
if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
(newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
error = SET_ERROR(ENOSPC);
}
mutex_exit(&ds->ds_dir->dd_lock);
dsl_dataset_rele(ds, FTAG);
return (error);
}
static void
dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
{
dsl_dir_set_qr_arg_t *ddsqra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
uint64_t newval;
VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
&ddsqra->ddsqra_value, tx);
VERIFY0(dsl_prop_get_int_ds(ds,
zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
} else {
newval = ddsqra->ddsqra_value;
spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
}
dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
mutex_enter(&ds->ds_dir->dd_lock);
dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
mutex_exit(&ds->ds_dir->dd_lock);
dsl_dataset_rele(ds, FTAG);
}
int
dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
{
dsl_dir_set_qr_arg_t ddsqra;
ddsqra.ddsqra_name = ddname;
ddsqra.ddsqra_source = source;
ddsqra.ddsqra_value = quota;
return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
}
int
dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
{
dsl_dir_set_qr_arg_t *ddsqra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
dsl_dir_t *dd;
uint64_t newval, used, avail;
int error;
error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
if (error != 0)
return (error);
dd = ds->ds_dir;
/*
* If we are doing the preliminary check in open context, the
* space estimates may be inaccurate.
*/
if (!dmu_tx_is_syncing(tx)) {
dsl_dataset_rele(ds, FTAG);
return (0);
}
error = dsl_prop_predict(ds->ds_dir,
zfs_prop_to_name(ZFS_PROP_RESERVATION),
ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
}
mutex_enter(&dd->dd_lock);
used = dsl_dir_phys(dd)->dd_used_bytes;
mutex_exit(&dd->dd_lock);
if (dd->dd_parent) {
avail = dsl_dir_space_available(dd->dd_parent,
NULL, 0, FALSE);
} else {
avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
}
if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
uint64_t delta = MAX(used, newval) -
MAX(used, dsl_dir_phys(dd)->dd_reserved);
if (delta > avail ||
(dsl_dir_phys(dd)->dd_quota > 0 &&
newval > dsl_dir_phys(dd)->dd_quota))
error = SET_ERROR(ENOSPC);
}
dsl_dataset_rele(ds, FTAG);
return (error);
}
void
dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
{
uint64_t used;
int64_t delta;
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
used = dsl_dir_phys(dd)->dd_used_bytes;
delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
dsl_dir_phys(dd)->dd_reserved = value;
if (dd->dd_parent != NULL) {
/* Roll up this additional usage into our ancestors */
dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
delta, 0, 0, tx);
}
mutex_exit(&dd->dd_lock);
}
static void
dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
{
dsl_dir_set_qr_arg_t *ddsqra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
uint64_t newval;
VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
dsl_prop_set_sync_impl(ds,
zfs_prop_to_name(ZFS_PROP_RESERVATION),
ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
&ddsqra->ddsqra_value, tx);
VERIFY0(dsl_prop_get_int_ds(ds,
zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
} else {
newval = ddsqra->ddsqra_value;
spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
zfs_prop_to_name(ZFS_PROP_RESERVATION),
(longlong_t)newval);
}
dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
dsl_dataset_rele(ds, FTAG);
}
int
dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
uint64_t reservation)
{
dsl_dir_set_qr_arg_t ddsqra;
ddsqra.ddsqra_name = ddname;
ddsqra.ddsqra_source = source;
ddsqra.ddsqra_value = reservation;
return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
}
static dsl_dir_t *
closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
{
for (; ds1; ds1 = ds1->dd_parent) {
dsl_dir_t *dd;
for (dd = ds2; dd; dd = dd->dd_parent) {
if (ds1 == dd)
return (dd);
}
}
return (NULL);
}
/*
* If delta is applied to dd, how much of that delta would be applied to
* ancestor? Syncing context only.
*/
static int64_t
would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
{
if (dd == ancestor)
return (delta);
mutex_enter(&dd->dd_lock);
delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
mutex_exit(&dd->dd_lock);
return (would_change(dd->dd_parent, delta, ancestor));
}
typedef struct dsl_dir_rename_arg {
const char *ddra_oldname;
const char *ddra_newname;
cred_t *ddra_cred;
} dsl_dir_rename_arg_t;
/* ARGSUSED */
static int
dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
int *deltap = arg;
char namebuf[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(ds, namebuf);
if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
return (0);
}
static int
dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
{
dsl_dir_rename_arg_t *ddra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dir_t *dd, *newparent;
const char *mynewname;
int error;
int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
/* target dir should exist */
error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
if (error != 0)
return (error);
/* new parent should exist */
error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
&newparent, &mynewname);
if (error != 0) {
dsl_dir_rele(dd, FTAG);
return (error);
}
/* can't rename to different pool */
if (dd->dd_pool != newparent->dd_pool) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(EXDEV));
}
/* new name should not already exist */
if (mynewname == NULL) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(EEXIST));
}
/* if the name length is growing, validate child name lengths */
if (delta > 0) {
error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
&delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
if (error != 0) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (error);
}
}
if (dmu_tx_is_syncing(tx)) {
if (spa_feature_is_active(dp->dp_spa,
SPA_FEATURE_FS_SS_LIMIT)) {
/*
* Although this is the check function and we don't
* normally make on-disk changes in check functions,
* we need to do that here.
*
* Ensure this portion of the tree's counts have been
* initialized in case the new parent has limits set.
*/
dsl_dir_init_fs_ss_count(dd, tx);
}
}
if (newparent != dd->dd_parent) {
/* is there enough space? */
uint64_t myspace =
MAX(dsl_dir_phys(dd)->dd_used_bytes,
dsl_dir_phys(dd)->dd_reserved);
objset_t *os = dd->dd_pool->dp_meta_objset;
uint64_t fs_cnt = 0;
uint64_t ss_cnt = 0;
if (dsl_dir_is_zapified(dd)) {
int err;
err = zap_lookup(os, dd->dd_object,
DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
&fs_cnt);
if (err != ENOENT && err != 0) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (err);
}
/*
* have to add 1 for the filesystem itself that we're
* moving
*/
fs_cnt++;
err = zap_lookup(os, dd->dd_object,
DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
&ss_cnt);
if (err != ENOENT && err != 0) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (err);
}
}
/* no rename into our descendant */
if (closest_common_ancestor(dd, newparent) == dd) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(EINVAL));
}
error = dsl_dir_transfer_possible(dd->dd_parent,
newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
if (error != 0) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (error);
}
}
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (0);
}
static void
dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
{
dsl_dir_rename_arg_t *ddra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dir_t *dd, *newparent;
const char *mynewname;
int error;
objset_t *mos = dp->dp_meta_objset;
VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
&mynewname));
/* Log this before we change the name. */
spa_history_log_internal_dd(dd, "rename", tx,
"-> %s", ddra->ddra_newname);
if (newparent != dd->dd_parent) {
objset_t *os = dd->dd_pool->dp_meta_objset;
uint64_t fs_cnt = 0;
uint64_t ss_cnt = 0;
/*
* We already made sure the dd counts were initialized in the
* check function.
*/
if (spa_feature_is_active(dp->dp_spa,
SPA_FEATURE_FS_SS_LIMIT)) {
VERIFY0(zap_lookup(os, dd->dd_object,
DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
&fs_cnt));
/* add 1 for the filesystem itself that we're moving */
fs_cnt++;
VERIFY0(zap_lookup(os, dd->dd_object,
DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
&ss_cnt));
}
dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
DD_FIELD_FILESYSTEM_COUNT, tx);
dsl_fs_ss_count_adjust(newparent, fs_cnt,
DD_FIELD_FILESYSTEM_COUNT, tx);
dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
DD_FIELD_SNAPSHOT_COUNT, tx);
dsl_fs_ss_count_adjust(newparent, ss_cnt,
DD_FIELD_SNAPSHOT_COUNT, tx);
dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
-dsl_dir_phys(dd)->dd_used_bytes,
-dsl_dir_phys(dd)->dd_compressed_bytes,
-dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
dsl_dir_diduse_space(newparent, DD_USED_CHILD,
dsl_dir_phys(dd)->dd_used_bytes,
dsl_dir_phys(dd)->dd_compressed_bytes,
dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
if (dsl_dir_phys(dd)->dd_reserved >
dsl_dir_phys(dd)->dd_used_bytes) {
uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
dsl_dir_phys(dd)->dd_used_bytes;
dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
-unused_rsrv, 0, 0, tx);
dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
unused_rsrv, 0, 0, tx);
}
}
dmu_buf_will_dirty(dd->dd_dbuf, tx);
/* remove from old parent zapobj */
error = zap_remove(mos,
dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
dd->dd_myname, tx);
ASSERT0(error);
(void) strcpy(dd->dd_myname, mynewname);
dsl_dir_rele(dd->dd_parent, dd);
dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
VERIFY0(dsl_dir_hold_obj(dp,
newparent->dd_object, NULL, dd, &dd->dd_parent));
/* add to new parent zapobj */
VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
dd->dd_myname, 8, 1, &dd->dd_object, tx));
#ifdef __FreeBSD__
#ifdef _KERNEL
zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname);
#endif
#endif
dsl_prop_notify_all(dd);
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
}
int
dsl_dir_rename(const char *oldname, const char *newname)
{
dsl_dir_rename_arg_t ddra;
ddra.ddra_oldname = oldname;
ddra.ddra_newname = newname;
ddra.ddra_cred = CRED();
return (dsl_sync_task(oldname,
dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
3, ZFS_SPACE_CHECK_RESERVED));
}
int
dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
{
dsl_dir_t *ancestor;
int64_t adelta;
uint64_t avail;
int err;
ancestor = closest_common_ancestor(sdd, tdd);
adelta = would_change(sdd, -space, ancestor);
avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
if (avail < space)
return (SET_ERROR(ENOSPC));
err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
ancestor, cr);
if (err != 0)
return (err);
err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
ancestor, cr);
if (err != 0)
return (err);
return (0);
}
timestruc_t
dsl_dir_snap_cmtime(dsl_dir_t *dd)
{
timestruc_t t;
mutex_enter(&dd->dd_lock);
t = dd->dd_snap_cmtime;
mutex_exit(&dd->dd_lock);
return (t);
}
void
dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
{
timestruc_t t;
gethrestime(&t);
mutex_enter(&dd->dd_lock);
dd->dd_snap_cmtime = t;
mutex_exit(&dd->dd_lock);
}
void
dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
}
boolean_t
dsl_dir_is_zapified(dsl_dir_t *dd)
{
dmu_object_info_t doi;
dmu_object_info_from_db(dd->dd_dbuf, &doi);
return (doi.doi_type == DMU_OTN_ZAP_METADATA);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (revision 332525)
@@ -1,1231 +1,1277 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dsl_userhold.h>
#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
/*
* ZFS Write Throttle
* ------------------
*
* ZFS must limit the rate of incoming writes to the rate at which it is able
* to sync data modifications to the backend storage. Throttling by too much
* creates an artificial limit; throttling by too little can only be sustained
* for short periods and would lead to highly lumpy performance. On a per-pool
* basis, ZFS tracks the amount of modified (dirty) data. As operations change
* data, the amount of dirty data increases; as ZFS syncs out data, the amount
* of dirty data decreases. When the amount of dirty data exceeds a
* predetermined threshold further modifications are blocked until the amount
* of dirty data decreases (as data is synced out).
*
* The limit on dirty data is tunable, and should be adjusted according to
* both the IO capacity and available memory of the system. The larger the
* window, the more ZFS is able to aggregate and amortize metadata (and data)
* changes. However, memory is a limited resource, and allowing for more dirty
* data comes at the cost of keeping other useful data in memory (for example
* ZFS data cached by the ARC).
*
* Implementation
*
* As buffers are modified dsl_pool_willuse_space() increments both the per-
* txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
* dirty space used; dsl_pool_dirty_space() decrements those values as data
* is synced out from dsl_pool_sync(). While only the poolwide value is
* relevant, the per-txg value is useful for debugging. The tunable
* zfs_dirty_data_max determines the dirty space limit. Once that value is
* exceeded, new writes are halted until space frees up.
*
* The zfs_dirty_data_sync tunable dictates the threshold at which we
* ensure that there is a txg syncing (see the comment in txg.c for a full
* description of transaction group stages).
*
* The IO scheduler uses both the dirty space limit and current amount of
* dirty data as inputs. Those values affect the number of concurrent IOs ZFS
* issues. See the comment in vdev_queue.c for details of the IO scheduler.
*
* The delay is also calculated based on the amount of dirty data. See the
* comment above dmu_tx_delay() for details.
*/
/*
* zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
* capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system.
*/
uint64_t zfs_dirty_data_max;
uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
int zfs_dirty_data_max_percent = 10;
/*
* If there is at least this much dirty data, push out a txg.
*/
uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;
/*
* Once there is this amount of dirty data, the dmu_tx_delay() will kick in
* and delay each transaction.
* This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
*/
int zfs_delay_min_dirty_percent = 60;
/*
* This controls how quickly the delay approaches infinity.
* Larger values cause it to delay more for a given amount of dirty data.
* Therefore larger values will cause there to be less dirty data for a
* given throughput.
*
* For the smoothest delay, this value should be about 1 billion divided
* by the maximum number of operations per second. This will smoothly
* handle between 10x and 1/10th this number.
*
* Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
* multiply in dmu_tx_delay().
*/
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
/*
* This determines the number of threads used by the dp_sync_taskq.
*/
int zfs_sync_taskq_batch_pct = 75;
/*
* These tunables determine the behavior of how zil_itxg_clean() is
* called via zil_clean() in the context of spa_sync(). When an itxg
* list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
* If the dispatch fails, the call to zil_itxg_clean() will occur
* synchronously in the context of spa_sync(), which can negatively
* impact the performance of spa_sync() (e.g. in the case of the itxg
* list having a large number of itxs that needs to be cleaned).
*
* Thus, these tunables can be used to manipulate the behavior of the
* taskq used by zil_clean(); they determine the number of taskq entries
* that are pre-populated when the taskq is first created (via the
* "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
* taskq entries that are cached after an on-demand allocation (via the
* "zfs_zil_clean_taskq_maxalloc").
*
* The idea being, we want to try reasonably hard to ensure there will
* already be a taskq entry pre-allocated by the time that it is needed
* by zil_clean(). This way, we can avoid the possibility of an
* on-demand allocation of a new taskq entry from failing, which would
* result in zil_itxg_clean() being called synchronously from zil_clean()
* (which can adversely affect performance of spa_sync()).
*
* Additionally, the number of threads used by the taskq can be
* configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
*/
int zfs_zil_clean_taskq_nthr_pct = 100;
int zfs_zil_clean_taskq_minalloc = 1024;
int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
#if defined(__FreeBSD__) && defined(_KERNEL)
extern int zfs_vdev_async_write_active_max_dirty_percent;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
&zfs_dirty_data_max, 0,
"The maximum amount of dirty data in bytes after which new writes are "
"halted until space becomes available");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
&zfs_dirty_data_max_max, 0,
"The absolute cap on dirty_data_max when auto calculating");
static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
sysctl_zfs_dirty_data_max_percent, "I",
"The percent of physical memory used to auto calculate dirty_data_max");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
&zfs_dirty_data_sync, 0,
"Force a txg if the number of dirty buffer bytes exceed this value");
static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
sysctl_zfs_delay_min_dirty_percent, "I",
"The limit of outstanding dirty data before transactions are delayed");
static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_scale tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
sysctl_zfs_delay_scale, "QU",
"Controls how quickly the delay approaches infinity");
static int
sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
{
int val, err;
val = zfs_dirty_data_max_percent;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val < 0 || val > 100)
return (EINVAL);
zfs_dirty_data_max_percent = val;
return (0);
}
static int
sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
{
int val, err;
val = zfs_delay_min_dirty_percent;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val < zfs_vdev_async_write_active_max_dirty_percent)
return (EINVAL);
zfs_delay_min_dirty_percent = val;
return (0);
}
static int
sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
{
uint64_t val;
int err;
val = zfs_delay_scale;
err = sysctl_handle_64(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val > UINT64_MAX / zfs_dirty_data_max)
return (EINVAL);
zfs_delay_scale = val;
return (0);
}
#endif
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
uint64_t obj;
int err;
err = zap_lookup(dp->dp_meta_objset,
dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
name, sizeof (obj), 1, &obj);
if (err)
return (err);
return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
}
static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
dsl_pool_t *dp;
blkptr_t *bp = spa_get_rootblkptr(spa);
dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
dp->dp_spa = spa;
dp->dp_meta_rootbp = *bp;
rrw_init(&dp->dp_config_rwlock, B_TRUE);
txg_init(dp, txg);
txg_list_create(&dp->dp_dirty_datasets, spa,
offsetof(dsl_dataset_t, ds_dirty_link));
txg_list_create(&dp->dp_dirty_zilogs, spa,
offsetof(zilog_t, zl_dirty_link));
txg_list_create(&dp->dp_dirty_dirs, spa,
offsetof(dsl_dir_t, dd_dirty_link));
txg_list_create(&dp->dp_sync_tasks, spa,
offsetof(dsl_sync_task_t, dst_node));
dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
TASKQ_THREADS_CPU_PCT);
dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
zfs_zil_clean_taskq_nthr_pct, minclsyspri,
zfs_zil_clean_taskq_minalloc,
zfs_zil_clean_taskq_maxalloc,
TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
1, 4, 0);
return (dp);
}
int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
&dp->dp_meta_objset);
if (err != 0)
dsl_pool_close(dp);
else
*dpp = dp;
return (err);
}
int
dsl_pool_open(dsl_pool_t *dp)
{
int err;
dsl_dir_t *dd;
dsl_dataset_t *ds;
uint64_t obj;
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
&dp->dp_root_dir_obj);
if (err)
goto out;
err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
NULL, dp, &dp->dp_root_dir);
if (err)
goto out;
err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
if (err)
goto out;
if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
if (err)
goto out;
err = dsl_dataset_hold_obj(dp,
dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
if (err == 0) {
err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
&dp->dp_origin_snap);
dsl_dataset_rele(ds, FTAG);
}
dsl_dir_rele(dd, dp);
if (err)
goto out;
}
if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
&dp->dp_free_dir);
if (err)
goto out;
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
if (err)
goto out;
VERIFY0(bpobj_open(&dp->dp_free_bpobj,
dp->dp_meta_objset, obj));
}
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
+ if (err == 0) {
+ VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
+ dp->dp_meta_objset, obj));
+ } else if (err == ENOENT) {
+ /*
+ * We might not have created the remap bpobj yet.
+ */
+ err = 0;
+ } else {
+ goto out;
+ }
+ }
+
/*
- * Note: errors ignored, because the leak dir will not exist if we
- * have not encountered a leak yet.
+ * Note: errors ignored, because the these special dirs, used for
+ * space accounting, are only created on demand.
*/
(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
&dp->dp_leak_dir);
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
&dp->dp_bptree_obj);
if (err != 0)
goto out;
}
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
&dp->dp_empty_bpobj);
if (err != 0)
goto out;
}
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
&dp->dp_tmp_userrefs_obj);
if (err == ENOENT)
err = 0;
if (err)
goto out;
err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
out:
rrw_exit(&dp->dp_config_rwlock, FTAG);
return (err);
}
void
dsl_pool_close(dsl_pool_t *dp)
{
/*
* Drop our references from dsl_pool_open().
*
* Since we held the origin_snap from "syncing" context (which
* includes pool-opening context), it actually only got a "ref"
* and not a hold, so just drop that here.
*/
- if (dp->dp_origin_snap)
+ if (dp->dp_origin_snap != NULL)
dsl_dataset_rele(dp->dp_origin_snap, dp);
- if (dp->dp_mos_dir)
+ if (dp->dp_mos_dir != NULL)
dsl_dir_rele(dp->dp_mos_dir, dp);
- if (dp->dp_free_dir)
+ if (dp->dp_free_dir != NULL)
dsl_dir_rele(dp->dp_free_dir, dp);
- if (dp->dp_leak_dir)
+ if (dp->dp_leak_dir != NULL)
dsl_dir_rele(dp->dp_leak_dir, dp);
- if (dp->dp_root_dir)
+ if (dp->dp_root_dir != NULL)
dsl_dir_rele(dp->dp_root_dir, dp);
bpobj_close(&dp->dp_free_bpobj);
+ bpobj_close(&dp->dp_obsolete_bpobj);
/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
- if (dp->dp_meta_objset)
+ if (dp->dp_meta_objset != NULL)
dmu_objset_evict(dp->dp_meta_objset);
txg_list_destroy(&dp->dp_dirty_datasets);
txg_list_destroy(&dp->dp_dirty_zilogs);
txg_list_destroy(&dp->dp_sync_tasks);
txg_list_destroy(&dp->dp_dirty_dirs);
taskq_destroy(dp->dp_zil_clean_taskq);
taskq_destroy(dp->dp_sync_taskq);
/*
* We can't set retry to TRUE since we're explicitly specifying
* a spa to flush. This is good enough; any missed buffers for
* this spa won't cause trouble, and they'll eventually fall
* out of the ARC just like any other unused buffer.
*/
arc_flush(dp->dp_spa, FALSE);
txg_fini(dp);
dsl_scan_fini(dp);
dmu_buf_user_evict_wait();
rrw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
taskq_destroy(dp->dp_vnrele_taskq);
- if (dp->dp_blkstats)
+ if (dp->dp_blkstats != NULL)
kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+void
+dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ /*
+ * Currently, we only create the obsolete_bpobj where there are
+ * indirect vdevs with referenced mappings.
+ */
+ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
+ /* create and open the obsolete_bpobj */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, tx));
+ bpobj_free(dp->dp_meta_objset,
+ dp->dp_obsolete_bpobj.bpo_object, tx);
+ bpobj_close(&dp->dp_obsolete_bpobj);
}
dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
dsl_dataset_t *ds;
uint64_t obj;
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
/* create and open the MOS (meta-objset) */
dp->dp_meta_objset = dmu_objset_create_impl(spa,
NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
/* create the pool directory */
err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
ASSERT0(err);
/* Initialize scan structures */
VERIFY0(dsl_scan_init(dp, txg));
/* create and open the root dir */
dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
NULL, dp, &dp->dp_root_dir));
/* create and open the meta-objset dir */
(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
VERIFY0(dsl_pool_open_special_dir(dp,
MOS_DIR_NAME, &dp->dp_mos_dir));
if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
/* create and open the free dir */
(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
FREE_DIR_NAME, tx);
VERIFY0(dsl_pool_open_special_dir(dp,
FREE_DIR_NAME, &dp->dp_free_dir));
/* create and open the free_bplist */
obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
VERIFY0(bpobj_open(&dp->dp_free_bpobj,
dp->dp_meta_objset, obj));
}
if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
dsl_pool_create_origin(dp, tx);
/* create the root dataset */
obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
/* create the root objset */
VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
#ifdef _KERNEL
{
objset_t *os;
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
os = dmu_objset_create_impl(dp->dp_spa, ds,
dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
zfs_create_fs(os, kcred, zplprops, tx);
}
#endif
dsl_dataset_rele(ds, FTAG);
dmu_tx_commit(tx);
rrw_exit(&dp->dp_config_rwlock, FTAG);
return (dp);
}
/*
* Account for the meta-objset space in its placeholder dsl_dir.
*/
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
int64_t used, int64_t comp, int64_t uncomp)
{
ASSERT3U(comp, ==, uncomp); /* it's all metadata */
mutex_enter(&dp->dp_lock);
dp->dp_mos_used_delta += used;
dp->dp_mos_compressed_delta += comp;
dp->dp_mos_uncompressed_delta += uncomp;
mutex_exit(&dp->dp_lock);
}
static void
dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
dmu_objset_sync(dp->dp_meta_objset, zio, tx);
VERIFY0(zio_wait(zio));
dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}
static void
dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
{
ASSERT(MUTEX_HELD(&dp->dp_lock));
if (delta < 0)
ASSERT3U(-delta, <=, dp->dp_dirty_total);
dp->dp_dirty_total += delta;
/*
* Note: we signal even when increasing dp_dirty_total.
* This ensures forward progress -- each thread wakes the next waiter.
*/
if (dp->dp_dirty_total < zfs_dirty_data_max)
cv_signal(&dp->dp_spaceavail_cv);
}
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
zio_t *zio;
dmu_tx_t *tx;
dsl_dir_t *dd;
dsl_dataset_t *ds;
objset_t *mos = dp->dp_meta_objset;
list_t synced_datasets;
list_create(&synced_datasets, sizeof (dsl_dataset_t),
offsetof(dsl_dataset_t, ds_synced_link));
tx = dmu_tx_create_assigned(dp, txg);
/*
* Write out all dirty blocks of dirty datasets.
*/
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
/*
* We must not sync any non-MOS datasets twice, because
* we may have taken a snapshot of them. However, we
* may sync newly-created datasets on pass 2.
*/
ASSERT(!list_link_active(&ds->ds_synced_link));
list_insert_tail(&synced_datasets, ds);
dsl_dataset_sync(ds, zio, tx);
}
VERIFY0(zio_wait(zio));
/*
* We have written all of the accounted dirty data, so our
* dp_space_towrite should now be zero. However, some seldom-used
* code paths do not adhere to this (e.g. dbuf_undirty(), also
* rounding error in dbuf_write_physdone).
* Shore up the accounting of any dirtied space now.
*/
dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
/*
* Update the long range free counter after
* we're done syncing user data
*/
mutex_enter(&dp->dp_lock);
ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
mutex_exit(&dp->dp_lock);
/*
* After the data blocks have been written (ensured by the zio_wait()
* above), update the user/group space accounting. This happens
* in tasks dispatched to dp_sync_taskq, so wait for them before
* continuing.
*/
for (ds = list_head(&synced_datasets); ds != NULL;
ds = list_next(&synced_datasets, ds)) {
dmu_objset_do_userquota_updates(ds->ds_objset, tx);
}
taskq_wait(dp->dp_sync_taskq);
/*
* Sync the datasets again to push out the changes due to
* userspace updates. This must be done before we process the
* sync tasks, so that any snapshots will have the correct
* user accounting information (and we won't get confused
* about which blocks are part of the snapshot).
*/
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
ASSERT(list_link_active(&ds->ds_synced_link));
dmu_buf_rele(ds->ds_dbuf, ds);
dsl_dataset_sync(ds, zio, tx);
}
VERIFY0(zio_wait(zio));
/*
* Now that the datasets have been completely synced, we can
* clean up our in-memory structures accumulated while syncing:
*
* - move dead blocks from the pending deadlist to the on-disk deadlist
* - release hold from dsl_dataset_dirty()
*/
while ((ds = list_remove_head(&synced_datasets)) != NULL) {
dsl_dataset_sync_done(ds, tx);
}
while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
dsl_dir_sync(dd, tx);
}
/*
* The MOS's space is accounted for in the pool/$MOS
* (dp_mos_dir). We can't modify the mos while we're syncing
* it, so we remember the deltas and apply them here.
*/
if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
dp->dp_mos_uncompressed_delta != 0) {
dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
dp->dp_mos_used_delta,
dp->dp_mos_compressed_delta,
dp->dp_mos_uncompressed_delta, tx);
dp->dp_mos_used_delta = 0;
dp->dp_mos_compressed_delta = 0;
dp->dp_mos_uncompressed_delta = 0;
}
if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) {
dsl_pool_sync_mos(dp, tx);
}
/*
* If we modify a dataset in the same txg that we want to destroy it,
* its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
* dsl_dir_destroy_check() will fail if there are unexpected holds.
* Therefore, we want to sync the MOS (thus syncing the dd_dbuf
* and clearing the hold on it) before we process the sync_tasks.
* The MOS data dirtied by the sync_tasks will be synced on the next
* pass.
*/
if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
dsl_sync_task_t *dst;
/*
* No more sync tasks should have been added while we
* were syncing.
*/
ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
dsl_sync_task_sync(dst, tx);
}
dmu_tx_commit(tx);
DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}
void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
zilog_t *zilog;
while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) {
dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
/*
* We don't remove the zilog from the dp_dirty_zilogs
* list until after we've cleaned it. This ensures that
* callers of zilog_is_dirty() receive an accurate
* answer when they are racing with the spa sync thread.
*/
zil_clean(zilog, txg);
(void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
dmu_buf_rele(ds->ds_dbuf, zilog);
}
ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}
/*
* TRUE if the current thread is the tx_sync_thread or if we
* are being called from SPA context during pool initialization.
*/
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
return (curthread == dp->dp_tx.tx_sync_thread ||
spa_is_initializing(dp->dp_spa) ||
taskq_member(dp->dp_sync_taskq, curthread));
}
uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
uint64_t space, resv;
/*
* If we're trying to assess whether it's OK to do a free,
* cut the reservation in half to allow forward progress
* (e.g. make it possible to rm(1) files from a full pool).
*/
space = spa_get_dspace(dp->dp_spa);
resv = spa_get_slop_space(dp->dp_spa);
if (netfree)
resv >>= 1;
return (space - resv);
}
boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
uint64_t delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
boolean_t rv;
mutex_enter(&dp->dp_lock);
if (dp->dp_dirty_total > zfs_dirty_data_sync)
txg_kick(dp);
rv = (dp->dp_dirty_total > delay_min_bytes);
mutex_exit(&dp->dp_lock);
return (rv);
}
void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
if (space > 0) {
mutex_enter(&dp->dp_lock);
dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
dsl_pool_dirty_delta(dp, space);
mutex_exit(&dp->dp_lock);
}
}
void
dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
ASSERT3S(space, >=, 0);
if (space == 0)
return;
mutex_enter(&dp->dp_lock);
if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
/* XXX writing something we didn't dirty? */
space = dp->dp_dirty_pertxg[txg & TXG_MASK];
}
ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
ASSERT3U(dp->dp_dirty_total, >=, space);
dsl_pool_dirty_delta(dp, -space);
mutex_exit(&dp->dp_lock);
}
/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
dmu_tx_t *tx = arg;
dsl_dataset_t *ds, *prev = NULL;
int err;
err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
if (err)
return (err);
while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
if (err) {
dsl_dataset_rele(ds, FTAG);
return (err);
}
if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
break;
dsl_dataset_rele(ds, FTAG);
ds = prev;
prev = NULL;
}
if (prev == NULL) {
prev = dp->dp_origin_snap;
/*
* The $ORIGIN can't have any data, or the accounting
* will be wrong.
*/
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
/* The origin doesn't get attached to itself */
if (ds->ds_object == prev->ds_object) {
dsl_dataset_rele(ds, FTAG);
return (0);
}
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
dsl_dataset_phys(ds)->ds_prev_snap_txg =
dsl_dataset_phys(prev)->ds_creation_txg;
dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
dmu_buf_will_dirty(prev->ds_dbuf, tx);
dsl_dataset_phys(prev)->ds_num_children++;
if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
ASSERT(ds->ds_prev == NULL);
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj,
ds, &ds->ds_prev));
}
}
ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
dmu_buf_will_dirty(prev->ds_dbuf, tx);
dsl_dataset_phys(prev)->ds_next_clones_obj =
zap_create(dp->dp_meta_objset,
DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
}
VERIFY0(zap_add_int(dp->dp_meta_objset,
dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
dsl_dataset_rele(ds, FTAG);
if (prev != dp->dp_origin_snap)
dsl_dataset_rele(prev, FTAG);
return (0);
}
void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dp->dp_origin_snap != NULL);
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
/* ARGSUSED */
static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
dmu_tx_t *tx = arg;
objset_t *mos = dp->dp_meta_objset;
if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
dsl_dataset_t *origin;
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
dsl_dir_phys(origin->ds_dir)->dd_clones =
zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
0, tx);
}
VERIFY0(zap_add_int(dp->dp_meta_objset,
dsl_dir_phys(origin->ds_dir)->dd_clones,
ds->ds_object, tx));
dsl_dataset_rele(origin, FTAG);
}
return (0);
}
void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
uint64_t obj;
(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
VERIFY0(dsl_pool_open_special_dir(dp,
FREE_DIR_NAME, &dp->dp_free_dir));
/*
* We can't use bpobj_alloc(), because spa_version() still
* returns the old version, and we need a new-version bpobj with
* subobj support. So call dmu_object_alloc() directly.
*/
obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
uint64_t dsobj;
dsl_dataset_t *ds;
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dp->dp_origin_snap == NULL);
ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
/* create the origin dir, ds, & snap-ds */
dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
NULL, 0, kcred, tx);
VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
dp, &dp->dp_origin_snap));
dsl_dataset_rele(ds, FTAG);
}
taskq_t *
dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
return (dp->dp_vnrele_taskq);
}
/*
* Walk through the pool-wide zap object of temporary snapshot user holds
* and release them.
*/
void
dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
{
zap_attribute_t za;
zap_cursor_t zc;
objset_t *mos = dp->dp_meta_objset;
uint64_t zapobj = dp->dp_tmp_userrefs_obj;
nvlist_t *holds;
if (zapobj == 0)
return;
ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
holds = fnvlist_alloc();
for (zap_cursor_init(&zc, mos, zapobj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
char *htag;
nvlist_t *tags;
htag = strchr(za.za_name, '-');
*htag = '\0';
++htag;
if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
tags = fnvlist_alloc();
fnvlist_add_boolean(tags, htag);
fnvlist_add_nvlist(holds, za.za_name, tags);
fnvlist_free(tags);
} else {
fnvlist_add_boolean(tags, htag);
}
}
dsl_dataset_user_release_tmp(dp, holds);
fnvlist_free(holds);
zap_cursor_fini(&zc);
}
/*
* Create the pool-wide zap object for storing temporary snapshot holds.
*/
void
dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
{
objset_t *mos = dp->dp_meta_objset;
ASSERT(dp->dp_tmp_userrefs_obj == 0);
ASSERT(dmu_tx_is_syncing(tx));
dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
}
static int
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
{
objset_t *mos = dp->dp_meta_objset;
uint64_t zapobj = dp->dp_tmp_userrefs_obj;
char *name;
int error;
ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
ASSERT(dmu_tx_is_syncing(tx));
/*
* If the pool was created prior to SPA_VERSION_USERREFS, the
* zap object for temporary holds might not exist yet.
*/
if (zapobj == 0) {
if (holding) {
dsl_pool_user_hold_create_obj(dp, tx);
zapobj = dp->dp_tmp_userrefs_obj;
} else {
return (SET_ERROR(ENOENT));
}
}
name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
if (holding)
error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
else
error = zap_remove(mos, zapobj, name, tx);
strfree(name);
return (error);
}
/*
* Add a temporary hold for the given dataset object and tag.
*/
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
uint64_t now, dmu_tx_t *tx)
{
return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}
/*
* Release a temporary hold for the given dataset object and tag.
*/
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
dmu_tx_t *tx)
{
return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
tx, B_FALSE));
}
/*
* DSL Pool Configuration Lock
*
* The dp_config_rwlock protects against changes to DSL state (e.g. dataset
* creation / destruction / rename / property setting). It must be held for
* read to hold a dataset or dsl_dir. I.e. you must call
* dsl_pool_config_enter() or dsl_pool_hold() before calling
* dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
* must be held continuously until all datasets and dsl_dirs are released.
*
* The only exception to this rule is that if a "long hold" is placed on
* a dataset, then the dp_config_rwlock may be dropped while the dataset
* is still held. The long hold will prevent the dataset from being
* destroyed -- the destroy will fail with EBUSY. A long hold can be
* obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
* (by calling dsl_{dataset,objset}_{try}own{_obj}).
*
* Legitimate long-holders (including owners) should be long-running, cancelable
* tasks that should cause "zfs destroy" to fail. This includes DMU
* consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
* "zfs send", and "zfs diff". There are several other long-holders whose
* uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
*
* The usual formula for long-holding would be:
* dsl_pool_hold()
* dsl_dataset_hold()
* ... perform checks ...
* dsl_dataset_long_hold()
* dsl_pool_rele()
* ... perform long-running task ...
* dsl_dataset_long_rele()
* dsl_dataset_rele()
*
* Note that when the long hold is released, the dataset is still held but
* the pool is not held. The dataset may change arbitrarily during this time
* (e.g. it could be destroyed). Therefore you shouldn't do anything to the
* dataset except release it.
*
* User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
* or modifying operations.
*
* Modifying operations should generally use dsl_sync_task(). The synctask
* infrastructure enforces proper locking strategy with respect to the
* dp_config_rwlock. See the comment above dsl_sync_task() for details.
*
* Read-only operations will manually hold the pool, then the dataset, obtain
* information from the dataset, then release the pool and dataset.
* dmu_objset_{hold,rele}() are convenience routines that also do the pool
* hold/rele.
*/
int
dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
{
spa_t *spa;
int error;
error = spa_open(name, &spa, tag);
if (error == 0) {
*dp = spa_get_dsl(spa);
dsl_pool_config_enter(*dp, tag);
}
return (error);
}
void
dsl_pool_rele(dsl_pool_t *dp, void *tag)
{
dsl_pool_config_exit(dp, tag);
spa_close(dp->dp_spa, tag);
}
void
dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
{
/*
* We use a "reentrant" reader-writer lock, but not reentrantly.
*
* The rrwlock can (with the track_all flag) track all reading threads,
* which is very useful for debugging which code path failed to release
* the lock, and for verifying that the *current* thread does hold
* the lock.
*
* (Unlike a rwlock, which knows that N threads hold it for
* read, but not *which* threads, so rw_held(RW_READER) returns TRUE
* if any thread holds it for read, even if this thread doesn't).
*/
ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
}
void
dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
{
ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
}
void
dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
{
rrw_exit(&dp->dp_config_rwlock, tag);
}
boolean_t
dsl_pool_config_held(dsl_pool_t *dp)
{
return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}
boolean_t
dsl_pool_config_held_writer(dsl_pool_t *dp)
{
return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 332525)
@@ -1,2037 +1,2078 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Gary Mills
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017 Datto Inc.
*/
#include <sys/dsl_scan.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
const zbookmark_phys_t *);
static scan_cb_t dsl_scan_scrub_cb;
static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *);
static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */
unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
per txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
&zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN,
&zfs_resilver_delay, 0, "Number of ticks to delay resilver");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN,
&zfs_scrub_delay, 0, "Number of ticks to delay scrub");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN,
&zfs_scan_idle, 0, "Idle scan window in clock ticks");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
&zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN,
&zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN,
&zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN,
&zfs_no_scrub_io, 0, "Disable scrub I/O");
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN,
&zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
-uint64_t zfs_free_max_blocks = UINT64_MAX;
+uint64_t zfs_async_block_max_blocks = UINT64_MAX;
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
- &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG");
+ &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG");
#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
extern int zfs_txg_timeout;
/*
* Enable/disable the processing of the free_bpobj object.
*/
boolean_t zfs_free_bpobj_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
&zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");
/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
NULL,
dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
};
int
dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
{
int err;
dsl_scan_t *scn;
spa_t *spa = dp->dp_spa;
uint64_t f;
scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
scn->scn_dp = dp;
/*
* It's possible that we're resuming a scan after a reboot so
* make sure that the scan_async_destroying flag is initialized
* appropriately.
*/
ASSERT(!scn->scn_async_destroying);
scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
SPA_FEATURE_ASYNC_DESTROY);
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
"scrub_func", sizeof (uint64_t), 1, &f);
if (err == 0) {
/*
* There was an old-style scrub in progress. Restart a
* new-style scrub from the beginning.
*/
scn->scn_restart_txg = txg;
zfs_dbgmsg("old-style scrub was in progress; "
"restarting new-style scrub in txg %llu",
scn->scn_restart_txg);
/*
* Load the queue obj from the old location so that it
* can be freed by dsl_scan_done().
*/
(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
"scrub_queue", sizeof (uint64_t), 1,
&scn->scn_phys.scn_queue_obj);
} else {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
&scn->scn_phys);
if (err == ENOENT)
return (0);
else if (err)
return (err);
if (scn->scn_phys.scn_state == DSS_SCANNING &&
spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
/*
* A new-type scrub was in progress on an old
* pool, and the pool was accessed by old
* software. Restart from the beginning, since
* the old software may have changed the pool in
* the meantime.
*/
scn->scn_restart_txg = txg;
zfs_dbgmsg("new-style scrub was modified "
"by old software; restarting in txg %llu",
scn->scn_restart_txg);
}
}
spa_scan_stat_init(spa);
return (0);
}
void
dsl_scan_fini(dsl_pool_t *dp)
{
if (dp->dp_scan) {
kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
dp->dp_scan = NULL;
}
}
/* ARGSUSED */
static int
dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
if (scn->scn_phys.scn_state == DSS_SCANNING)
return (SET_ERROR(EBUSY));
return (0);
}
static void
dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
pool_scan_func_t *funcp = arg;
dmu_object_type_t ot = 0;
dsl_pool_t *dp = scn->scn_dp;
spa_t *spa = dp->dp_spa;
ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
bzero(&scn->scn_phys, sizeof (scn->scn_phys));
scn->scn_phys.scn_func = *funcp;
scn->scn_phys.scn_state = DSS_SCANNING;
scn->scn_phys.scn_min_txg = 0;
scn->scn_phys.scn_max_txg = tx->tx_txg;
scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
scn->scn_phys.scn_start_time = gethrestime_sec();
scn->scn_phys.scn_errors = 0;
scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
scn->scn_restart_txg = 0;
scn->scn_done_txg = 0;
spa_scan_stat_init(spa);
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
/* rewrite all disk labels */
vdev_config_dirty(spa->spa_root_vdev);
if (vdev_resilver_needed(spa->spa_root_vdev,
&scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
spa_event_notify(spa, NULL, NULL,
ESC_ZFS_RESILVER_START);
} else {
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
}
spa->spa_scrub_started = B_TRUE;
/*
* If this is an incremental scrub, limit the DDT scrub phase
* to just the auto-ditto class (for correctness); the rest
* of the scrub should go faster using top-down pruning.
*/
if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
}
/* back to the generic stuff */
if (dp->dp_blkstats == NULL) {
dp->dp_blkstats =
kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
}
bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
ot = DMU_OT_ZAP_OTHER;
scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
dsl_scan_sync_state(scn, tx);
spa_history_log_internal(spa, "scan setup", tx,
"func=%u mintxg=%llu maxtxg=%llu",
*funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
}
/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
{
static const char *old_names[] = {
"scrub_bookmark",
"scrub_ddt_bookmark",
"scrub_ddt_class_max",
"scrub_queue",
"scrub_min_txg",
"scrub_max_txg",
"scrub_func",
"scrub_errors",
NULL
};
dsl_pool_t *dp = scn->scn_dp;
spa_t *spa = dp->dp_spa;
int i;
/* Remove any remnants of an old-style scrub. */
for (i = 0; old_names[i]; i++) {
(void) zap_remove(dp->dp_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
}
if (scn->scn_phys.scn_queue_obj != 0) {
VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, tx));
scn->scn_phys.scn_queue_obj = 0;
}
scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
/*
* If we were "restarted" from a stopped state, don't bother
* with anything else.
*/
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
if (complete)
scn->scn_phys.scn_state = DSS_FINISHED;
else
scn->scn_phys.scn_state = DSS_CANCELED;
if (dsl_scan_restarting(scn, tx))
spa_history_log_internal(spa, "scan aborted, restarting", tx,
"errors=%llu", spa_get_errlog_size(spa));
else if (!complete)
spa_history_log_internal(spa, "scan cancelled", tx,
"errors=%llu", spa_get_errlog_size(spa));
else
spa_history_log_internal(spa, "scan done", tx,
"errors=%llu", spa_get_errlog_size(spa));
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight > 0) {
cv_wait(&spa->spa_scrub_io_cv,
&spa->spa_scrub_lock);
}
mutex_exit(&spa->spa_scrub_lock);
spa->spa_scrub_started = B_FALSE;
spa->spa_scrub_active = B_FALSE;
/*
* If the scrub/resilver completed, update all DTLs to
* reflect this. Whether it succeeded or not, vacate
* all temporary scrub DTLs.
*/
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
if (complete) {
spa_event_notify(spa, NULL, NULL,
scn->scn_phys.scn_min_txg ?
ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
}
spa_errlog_rotate(spa);
/*
* We may have finished replacing a device.
* Let the async thread assess this and handle the detach.
*/
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
}
scn->scn_phys.scn_end_time = gethrestime_sec();
}
/* ARGSUSED */
static int
dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
if (scn->scn_phys.scn_state != DSS_SCANNING)
return (SET_ERROR(ENOENT));
return (0);
}
/* ARGSUSED */
static void
dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
dsl_scan_done(scn, B_FALSE, tx);
dsl_scan_sync_state(scn, tx);
spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
}
int
dsl_scan_cancel(dsl_pool_t *dp)
{
return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
}
boolean_t
dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
{
if (dsl_scan_scrubbing(scn->scn_dp) &&
scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED)
return (B_TRUE);
return (B_FALSE);
}
static int
dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
{
pool_scrub_cmd_t *cmd = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_scan_t *scn = dp->dp_scan;
if (*cmd == POOL_SCRUB_PAUSE) {
/* can't pause a scrub when there is no in-progress scrub */
if (!dsl_scan_scrubbing(dp))
return (SET_ERROR(ENOENT));
/* can't pause a paused scrub */
if (dsl_scan_is_paused_scrub(scn))
return (SET_ERROR(EBUSY));
} else if (*cmd != POOL_SCRUB_NORMAL) {
return (SET_ERROR(ENOTSUP));
}
return (0);
}
static void
dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
{
pool_scrub_cmd_t *cmd = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
spa_t *spa = dp->dp_spa;
dsl_scan_t *scn = dp->dp_scan;
if (*cmd == POOL_SCRUB_PAUSE) {
/* can't pause a scrub when there is no in-progress scrub */
spa->spa_scan_pass_scrub_pause = gethrestime_sec();
scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
dsl_scan_sync_state(scn, tx);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
} else {
ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
if (dsl_scan_is_paused_scrub(scn)) {
/*
* We need to keep track of how much time we spend
* paused per pass so that we can adjust the scrub rate
* shown in the output of 'zpool status'
*/
spa->spa_scan_pass_scrub_spent_paused +=
gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
spa->spa_scan_pass_scrub_pause = 0;
scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
dsl_scan_sync_state(scn, tx);
}
}
}
/*
* Set scrub pause/resume state if it makes sense to do so
*/
int
dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
{
return (dsl_sync_task(spa_name(dp->dp_spa),
dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
ZFS_SPACE_CHECK_RESERVED));
}
boolean_t
dsl_scan_scrubbing(const dsl_pool_t *dp)
{
dsl_scan_t *scn = dp->dp_scan;
if (scn->scn_phys.scn_state == DSS_SCANNING &&
scn->scn_phys.scn_func == POOL_SCAN_SCRUB)
return (B_TRUE);
return (B_FALSE);
}
static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
dmu_objset_type_t ostype, dmu_tx_t *tx);
static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
dmu_objset_type_t ostype,
dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
void
dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
{
zio_free(dp->dp_spa, txg, bp);
}
void
dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
{
ASSERT(dsl_pool_sync_context(dp));
zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
pio->io_flags));
}
static uint64_t
dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
if (ds->ds_is_snapshot)
return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
return (smt);
}
static void
dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
{
VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
&scn->scn_phys, tx));
}
extern int zfs_vdev_async_write_active_min_dirty_percent;
static boolean_t
dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
{
/* we never skip user/group accounting objects */
if (zb && (int64_t)zb->zb_object < 0)
return (B_FALSE);
if (scn->scn_suspending)
return (B_TRUE); /* we're already suspending */
if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
return (B_FALSE); /* we're resuming */
/* We only know how to resume from level-0 blocks. */
if (zb && zb->zb_level != 0)
return (B_FALSE);
/*
* We suspend if:
* - we have scanned for the maximum time: an entire txg
* timeout (default 5 sec)
* or
* - we have scanned for at least the minimum time (default 1 sec
* for scrub, 3 sec for resilver), and either we have sufficient
* dirty data that we are starting to write more quickly
* (default 30%), or someone is explicitly waiting for this txg
* to complete.
* or
* - the spa is shutting down because this pool is being exported
* or the machine is rebooting.
*/
int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
(NSEC2MSEC(elapsed_nanosecs) > mintime &&
(txg_sync_waiting(scn->scn_dp) ||
dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
spa_shutting_down(scn->scn_dp->dp_spa)) {
if (zb) {
dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
(longlong_t)zb->zb_objset,
(longlong_t)zb->zb_object,
(longlong_t)zb->zb_level,
(longlong_t)zb->zb_blkid);
scn->scn_phys.scn_bookmark = *zb;
}
dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n",
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
scn->scn_suspending = B_TRUE;
return (B_TRUE);
}
return (B_FALSE);
}
typedef struct zil_scan_arg {
dsl_pool_t *zsa_dp;
zil_header_t *zsa_zh;
} zil_scan_arg_t;
/* ARGSUSED */
static int
dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
zil_scan_arg_t *zsa = arg;
dsl_pool_t *dp = zsa->zsa_dp;
dsl_scan_t *scn = dp->dp_scan;
zil_header_t *zh = zsa->zsa_zh;
zbookmark_phys_t zb;
if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
* One block ("stubby") can be allocated a long time ago; we
* want to visit that one because it has been allocated
* (on-disk) even if it hasn't been claimed (even though for
* scrub there's nothing to do to it).
*/
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
return (0);
SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
return (0);
}
/* ARGSUSED */
static int
dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
if (lrc->lrc_txtype == TX_WRITE) {
zil_scan_arg_t *zsa = arg;
dsl_pool_t *dp = zsa->zsa_dp;
dsl_scan_t *scn = dp->dp_scan;
zil_header_t *zh = zsa->zsa_zh;
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
zbookmark_phys_t zb;
if (BP_IS_HOLE(bp) ||
bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
* birth can be < claim_txg if this record's txg is
* already txg sync'ed (but this log block contains
* other records that are not synced)
*/
if (claim_txg == 0 || bp->blk_birth < claim_txg)
return (0);
SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
lr->lr_foid, ZB_ZIL_LEVEL,
lr->lr_offset / BP_GET_LSIZE(bp));
VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
}
return (0);
}
static void
dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
{
uint64_t claim_txg = zh->zh_claim_txg;
zil_scan_arg_t zsa = { dp, zh };
zilog_t *zilog;
/*
* We only want to visit blocks that have been claimed but not yet
* replayed (or, in read-only mode, blocks that *would* be claimed).
*/
if (claim_txg == 0 && spa_writeable(dp->dp_spa))
return;
zilog = zil_alloc(dp->dp_meta_objset, zh);
(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
claim_txg);
zil_free(zilog);
}
/* ARGSUSED */
static void
dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
uint64_t objset, uint64_t object, uint64_t blkid)
{
zbookmark_phys_t czb;
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
if (zfs_no_scrub_prefetch)
return;
if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
return;
SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
}
static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
const zbookmark_phys_t *zb)
{
/*
* We never skip over user/group accounting objects (obj<0)
*/
if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
(int64_t)zb->zb_object >= 0) {
/*
* If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again.
*/
if (zbookmark_subtree_completed(dnp, zb,
&scn->scn_phys.scn_bookmark))
return (B_TRUE);
/*
* If we found the block we're trying to resume from, or
* we went past it to a different object, zero it out to
* indicate that it's OK to start checking for suspending
* again.
*/
if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
dprintf("resuming at %llx/%llx/%llx/%llx\n",
(longlong_t)zb->zb_objset,
(longlong_t)zb->zb_object,
(longlong_t)zb->zb_level,
(longlong_t)zb->zb_blkid);
bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
}
}
return (B_FALSE);
}
/*
* Return nonzero on i/o error.
* Return new buf to write out in *bufp.
*/
static int
dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
dnode_phys_t *dnp, const blkptr_t *bp,
const zbookmark_phys_t *zb, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
int err;
if (BP_GET_LEVEL(bp) > 0) {
arc_flags_t flags = ARC_FLAG_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
}
for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
zb->zb_object, zb->zb_blkid * epb + i);
}
for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
zbookmark_phys_t czb;
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
dsl_scan_visitbp(cbp, &czb, dnp,
ds, scn, ostype, tx);
}
arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
arc_flags_t flags = ARC_FLAG_WAIT;
dnode_phys_t *cdnp;
int i, j;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
}
for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
for (j = 0; j < cdnp->dn_nblkptr; j++) {
blkptr_t *cbp = &cdnp->dn_blkptr[j];
dsl_scan_prefetch(scn, buf, cbp,
zb->zb_objset, zb->zb_blkid * epb + i, j);
}
}
for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
dsl_scan_visitdnode(scn, ds, ostype,
cdnp, zb->zb_blkid * epb + i, tx);
}
arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
}
osp = buf->b_data;
dsl_scan_visitdnode(scn, ds, osp->os_type,
&osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
if (OBJSET_BUF_HAS_USERUSED(buf)) {
/*
* We also always visit user/group accounting
* objects, and never skip them, even if we are
* suspending. This is necessary so that the space
* deltas from this txg get integrated.
*/
dsl_scan_visitdnode(scn, ds, osp->os_type,
&osp->os_groupused_dnode,
DMU_GROUPUSED_OBJECT, tx);
dsl_scan_visitdnode(scn, ds, osp->os_type,
&osp->os_userused_dnode,
DMU_USERUSED_OBJECT, tx);
}
arc_buf_destroy(buf, &buf);
}
return (0);
}
static void
dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
dmu_objset_type_t ostype, dnode_phys_t *dnp,
uint64_t object, dmu_tx_t *tx)
{
int j;
for (j = 0; j < dnp->dn_nblkptr; j++) {
zbookmark_phys_t czb;
SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
dnp->dn_nlevels - 1, j);
dsl_scan_visitbp(&dnp->dn_blkptr[j],
&czb, dnp, ds, scn, ostype, tx);
}
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
zbookmark_phys_t czb;
SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
0, DMU_SPILL_BLKID);
dsl_scan_visitbp(&dnp->dn_spill,
&czb, dnp, ds, scn, ostype, tx);
}
}
/*
* The arguments are in this order because mdb can only print the
* first 5; we want them to be useful.
*/
static void
dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
dmu_objset_type_t ostype, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
arc_buf_t *buf = NULL;
blkptr_t bp_toread = *bp;
/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
if (dsl_scan_check_suspend(scn, zb))
return;
if (dsl_scan_check_resume(scn, dnp, zb))
return;
if (BP_IS_HOLE(bp))
return;
scn->scn_visited_this_txg++;
dprintf_bp(bp,
"visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
ds, ds ? ds->ds_object : 0,
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
bp);
if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
return;
if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
return;
/*
* If dsl_scan_ddt() has already visited this block, it will have
* already done any translations or scrubbing, so don't call the
* callback again.
*/
if (ddt_class_contains(dp->dp_spa,
scn->scn_phys.scn_ddt_class_max, bp)) {
ASSERT(buf == NULL);
return;
}
/*
* If this block is from the future (after cur_max_txg), then we
* are doing this on behalf of a deleted snapshot, and we will
* revisit the future block on the next pass of this dataset.
* Don't scan it now unless we need to because something
* under it was modified.
*/
if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
}
}
static void
dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
dmu_tx_t *tx)
{
zbookmark_phys_t zb;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
dsl_scan_visitbp(bp, &zb, NULL,
ds, scn, DMU_OST_NONE, tx);
dprintf_ds(ds, "finished scan%s", "");
}
void
dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dsl_scan_t *scn = dp->dp_scan;
uint64_t mintxg;
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
if (ds->ds_is_snapshot) {
/*
* Note:
* - scn_cur_{min,max}_txg stays the same.
* - Setting the flag is not really necessary if
* scn_cur_max_txg == scn_max_txg, because there
* is nothing after this snapshot that we care
* about. However, we set it anyway and then
* ignore it when we retraverse it in
* dsl_scan_visitds().
*/
scn->scn_phys.scn_bookmark.zb_objset =
dsl_dataset_phys(ds)->ds_next_snap_obj;
zfs_dbgmsg("destroying ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds->ds_object,
(u_longlong_t)dsl_dataset_phys(ds)->
ds_next_snap_obj);
scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
} else {
SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
ZB_DESTROYED_OBJSET, 0, 0, 0);
zfs_dbgmsg("destroying ds %llu; currently traversing; "
"reset bookmark to -1,0,0,0",
(u_longlong_t)ds->ds_object);
}
} else if (zap_lookup_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
if (ds->ds_is_snapshot) {
/*
* We keep the same mintxg; it could be >
* ds_creation_txg if the previous snapshot was
* deleted too.
*/
VERIFY(zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj,
dsl_dataset_phys(ds)->ds_next_snap_obj,
mintxg, tx) == 0);
zfs_dbgmsg("destroying ds %llu; in queue; "
"replacing with %llu",
(u_longlong_t)ds->ds_object,
(u_longlong_t)dsl_dataset_phys(ds)->
ds_next_snap_obj);
} else {
zfs_dbgmsg("destroying ds %llu; in queue; removing",
(u_longlong_t)ds->ds_object);
}
}
/*
* dsl_scan_sync() should be called after this, and should sync
* out our changed state, but just to be safe, do it here.
*/
dsl_scan_sync_state(scn, tx);
}
void
dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dsl_scan_t *scn = dp->dp_scan;
uint64_t mintxg;
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
scn->scn_phys.scn_bookmark.zb_objset =
dsl_dataset_phys(ds)->ds_prev_snap_obj;
zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds->ds_object,
(u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
} else if (zap_lookup_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
VERIFY(zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj,
dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
zfs_dbgmsg("snapshotting ds %llu; in queue; "
"replacing with %llu",
(u_longlong_t)ds->ds_object,
(u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
}
dsl_scan_sync_state(scn, tx);
}
void
dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
{
dsl_pool_t *dp = ds1->ds_dir->dd_pool;
dsl_scan_t *scn = dp->dp_scan;
uint64_t mintxg;
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds1->ds_object,
(u_longlong_t)ds2->ds_object);
} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds2->ds_object,
(u_longlong_t)ds1->ds_object);
}
if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
ds1->ds_object, &mintxg) == 0) {
int err;
ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
err = zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
VERIFY(err == 0 || err == EEXIST);
if (err == EEXIST) {
/* Both were there to begin with */
VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj,
ds1->ds_object, mintxg, tx));
}
zfs_dbgmsg("clone_swap ds %llu; in queue; "
"replacing with %llu",
(u_longlong_t)ds1->ds_object,
(u_longlong_t)ds2->ds_object);
} else if (zap_lookup_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
zfs_dbgmsg("clone_swap ds %llu; in queue; "
"replacing with %llu",
(u_longlong_t)ds2->ds_object,
(u_longlong_t)ds1->ds_object);
}
dsl_scan_sync_state(scn, tx);
}
struct enqueue_clones_arg {
dmu_tx_t *tx;
uint64_t originobj;
};
/* ARGSUSED */
static int
enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
struct enqueue_clones_arg *eca = arg;
dsl_dataset_t *ds;
int err;
dsl_scan_t *scn = dp->dp_scan;
if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
return (0);
err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
if (err)
return (err);
while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
dsl_dataset_t *prev;
err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
dsl_dataset_rele(ds, FTAG);
if (err)
return (err);
ds = prev;
}
VERIFY(zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object,
dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
dsl_dataset_rele(ds, FTAG);
return (0);
}
static void
dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
dsl_dataset_t *ds;
- objset_t *os;
VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
if (scn->scn_phys.scn_cur_min_txg >=
scn->scn_phys.scn_max_txg) {
/*
* This can happen if this snapshot was created after the
* scan started, and we already completed a previous snapshot
* that was created after the scan started. This snapshot
* only references blocks with:
*
* birth < our ds_creation_txg
* cur_min_txg is no less than ds_creation_txg.
* We have already visited these blocks.
* or
* birth > scn_max_txg
* The scan requested not to visit these blocks.
*
* Subsequent snapshots (and clones) can reference our
* blocks, or blocks with even higher birth times.
* Therefore we do not need to visit them either,
* so we do not add them to the work queue.
*
* Note that checking for cur_min_txg >= cur_max_txg
* is not sufficient, because in that case we may need to
* visit subsequent snapshots. This happens when min_txg > 0,
* which raises cur_min_txg. In this case we will visit
* this dataset but skip all of its blocks, because the
* rootbp's birth time is < cur_min_txg. Then we will
* add the next snapshots/clones to the work queue.
*/
char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
dsl_dataset_name(ds, dsname);
zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
"cur_min_txg (%llu) >= max_txg (%llu)",
dsobj, dsname,
scn->scn_phys.scn_cur_min_txg,
scn->scn_phys.scn_max_txg);
kmem_free(dsname, MAXNAMELEN);
goto out;
}
- if (dmu_objset_from_ds(ds, &os))
- goto out;
-
/*
- * Only the ZIL in the head (non-snapshot) is valid. Even though
+ * Only the ZIL in the head (non-snapshot) is valid. Even though
* snapshots can have ZIL block pointers (which may be the same
- * BP as in the head), they must be ignored. So we traverse the
- * ZIL here, rather than in scan_recurse(), because the regular
- * snapshot block-sharing rules don't apply to it.
+ * BP as in the head), they must be ignored. In addition, $ORIGIN
+ * doesn't have a objset (i.e. its ds_bp is a hole) so we don't
+ * need to look for a ZIL in it either. So we traverse the ZIL here,
+ * rather than in scan_recurse(), because the regular snapshot
+ * block-sharing rules don't apply to it.
*/
- if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) &&
+ ds->ds_dir != dp->dp_origin_snap->ds_dir) {
+ objset_t *os;
+ if (dmu_objset_from_ds(ds, &os) != 0) {
+ goto out;
+ }
dsl_scan_zil(dp, &os->os_zil_header);
+ }
/*
* Iterate over the bps in this ds.
*/
dmu_buf_will_dirty(ds->ds_dbuf, tx);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
dsl_dataset_name(ds, dsname);
zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
"suspending=%u",
(longlong_t)dsobj, dsname,
(longlong_t)scn->scn_phys.scn_cur_min_txg,
(longlong_t)scn->scn_phys.scn_cur_max_txg,
(int)scn->scn_suspending);
kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
if (scn->scn_suspending)
goto out;
/*
* We've finished this pass over this dataset.
*/
/*
* If we did not completely visit this dataset, do another pass.
*/
if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
zfs_dbgmsg("incomplete pass; visiting again");
scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
VERIFY(zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object,
scn->scn_phys.scn_cur_max_txg, tx) == 0);
goto out;
}
/*
* Add descendent datasets to work queue.
*/
if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
VERIFY(zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj,
dsl_dataset_phys(ds)->ds_next_snap_obj,
dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
}
if (dsl_dataset_phys(ds)->ds_num_children > 1) {
boolean_t usenext = B_FALSE;
if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
uint64_t count;
/*
* A bug in a previous version of the code could
* cause upgrade_clones_cb() to not set
* ds_next_snap_obj when it should, leading to a
* missing entry. Therefore we can only use the
* next_clones_obj when its count is correct.
*/
int err = zap_count(dp->dp_meta_objset,
dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
if (err == 0 &&
count == dsl_dataset_phys(ds)->ds_num_children - 1)
usenext = B_TRUE;
}
if (usenext) {
VERIFY0(zap_join_key(dp->dp_meta_objset,
dsl_dataset_phys(ds)->ds_next_clones_obj,
scn->scn_phys.scn_queue_obj,
dsl_dataset_phys(ds)->ds_creation_txg, tx));
} else {
struct enqueue_clones_arg eca;
eca.tx = tx;
eca.originobj = ds->ds_object;
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
}
}
out:
dsl_dataset_rele(ds, FTAG);
}
/* ARGSUSED */
static int
enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
dmu_tx_t *tx = arg;
dsl_dataset_t *ds;
int err;
dsl_scan_t *scn = dp->dp_scan;
err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
if (err)
return (err);
while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
dsl_dataset_t *prev;
err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
if (err) {
dsl_dataset_rele(ds, FTAG);
return (err);
}
/*
* If this is a clone, we don't need to worry about it for now.
*/
if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
dsl_dataset_rele(ds, FTAG);
dsl_dataset_rele(prev, FTAG);
return (0);
}
dsl_dataset_rele(ds, FTAG);
ds = prev;
}
VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
dsl_dataset_rele(ds, FTAG);
return (0);
}
/*
* Scrub/dedup interaction.
*
* If there are N references to a deduped block, we don't want to scrub it
* N times -- ideally, we should scrub it exactly once.
*
* We leverage the fact that the dde's replication class (enum ddt_class)
* is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
* (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
*
* To prevent excess scrubbing, the scrub begins by walking the DDT
* to find all blocks with refcnt > 1, and scrubs each of these once.
* Since there are two replication classes which contain blocks with
* refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
* Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
*
* There would be nothing more to say if a block's refcnt couldn't change
* during a scrub, but of course it can so we must account for changes
* in a block's replication class.
*
* Here's an example of what can occur:
*
* If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
* when visited during the top-down scrub phase, it will be scrubbed twice.
* This negates our scrub optimization, but is otherwise harmless.
*
* If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
* on each visit during the top-down scrub phase, it will never be scrubbed.
* To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
* reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
* DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
* while a scrub is in progress, it scrubs the block right then.
*/
static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
ddt_entry_t dde = { 0 };
int error;
uint64_t n = 0;
while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
ddt_t *ddt;
if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
break;
dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
(longlong_t)ddb->ddb_class,
(longlong_t)ddb->ddb_type,
(longlong_t)ddb->ddb_checksum,
(longlong_t)ddb->ddb_cursor);
/* There should be no pending changes to the dedup table */
ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
ASSERT(avl_first(&ddt->ddt_tree) == NULL);
dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
n++;
if (dsl_scan_check_suspend(scn, NULL))
break;
}
zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
"suspending=%u", (longlong_t)n,
(int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
ASSERT(error == 0 || error == ENOENT);
ASSERT(error != ENOENT ||
ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
}
/* ARGSUSED */
void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx)
{
const ddt_key_t *ddk = &dde->dde_key;
ddt_phys_t *ddp = dde->dde_phys;
blkptr_t bp;
zbookmark_phys_t zb = { 0 };
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 ||
ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
continue;
ddt_bp_create(checksum, ddk, ddp, &bp);
scn->scn_visited_this_txg++;
scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
}
}
static void
dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
zap_cursor_t zc;
zap_attribute_t za;
if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
scn->scn_phys.scn_ddt_class_max) {
scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
dsl_scan_ddt(scn, tx);
if (scn->scn_suspending)
return;
}
if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
/* First do the MOS & ORIGIN */
scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
dsl_scan_visit_rootbp(scn, NULL,
&dp->dp_meta_rootbp, tx);
spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
if (scn->scn_suspending)
return;
if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
enqueue_cb, tx, DS_FIND_CHILDREN));
} else {
dsl_scan_visitds(scn,
dp->dp_origin_snap->ds_object, tx);
}
ASSERT(!scn->scn_suspending);
} else if (scn->scn_phys.scn_bookmark.zb_objset !=
ZB_DESTROYED_OBJSET) {
/*
* If we were suspended, continue from here. Note if the
* ds we were suspended on was deleted, the zb_objset may
* be -1, so we will skip this and find a new objset
* below.
*/
dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
if (scn->scn_suspending)
return;
}
/*
* In case we were suspended right at the end of the ds, zero the
* bookmark so we don't think that we're still trying to resume.
*/
bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
/* keep pulling things out of the zap-object-as-queue */
while (zap_cursor_init(&zc, dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj),
zap_cursor_retrieve(&zc, &za) == 0) {
dsl_dataset_t *ds;
uint64_t dsobj;
dsobj = zfs_strtonum(za.za_name, NULL);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, dsobj, tx));
/* Set up min/max txg */
VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
if (za.za_first_integer != 0) {
scn->scn_phys.scn_cur_min_txg =
MAX(scn->scn_phys.scn_min_txg,
za.za_first_integer);
} else {
scn->scn_phys.scn_cur_min_txg =
MAX(scn->scn_phys.scn_min_txg,
dsl_dataset_phys(ds)->ds_prev_snap_txg);
}
scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
dsl_dataset_rele(ds, FTAG);
dsl_scan_visitds(scn, dsobj, tx);
zap_cursor_fini(&zc);
if (scn->scn_suspending)
return;
}
zap_cursor_fini(&zc);
}
static boolean_t
-dsl_scan_free_should_suspend(dsl_scan_t *scn)
+dsl_scan_async_block_should_pause(dsl_scan_t *scn)
{
uint64_t elapsed_nanosecs;
if (zfs_recover)
return (B_FALSE);
- if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
+ if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks)
return (B_TRUE);
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
- (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
+ (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
txg_sync_waiting(scn->scn_dp)) ||
spa_shutting_down(scn->scn_dp->dp_spa));
}
static int
dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
dsl_scan_t *scn = arg;
if (!scn->scn_is_bptree ||
(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
- if (dsl_scan_free_should_suspend(scn))
+ if (dsl_scan_async_block_should_pause(scn))
return (SET_ERROR(ERESTART));
}
zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
-bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
-BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
scn->scn_visited_this_txg++;
return (0);
}
+static int
+dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg;
+ const dva_t *dva = &bp->blk_dva[0];
+
+ if (dsl_scan_async_block_should_pause(scn))
+ return (SET_ERROR(ERESTART));
+
+ spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
+ DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
+ DVA_GET_ASIZE(dva), tx);
+ scn->scn_visited_this_txg++;
+ return (0);
+}
+
boolean_t
dsl_scan_active(dsl_scan_t *scn)
{
spa_t *spa = scn->scn_dp->dp_spa;
uint64_t used = 0, comp, uncomp;
if (spa->spa_load_state != SPA_LOAD_NONE)
return (B_FALSE);
if (spa_shutting_down(spa))
return (B_FALSE);
if ((scn->scn_phys.scn_state == DSS_SCANNING &&
!dsl_scan_is_paused_scrub(scn)) ||
(scn->scn_async_destroying && !scn->scn_async_stalled))
return (B_TRUE);
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
&used, &comp, &uncomp);
}
return (used != 0);
}
/* Called whenever a txg syncs. */
void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
int err = 0;
/*
* Check for scn_restart_txg before checking spa_load_state, so
* that we can restart an old-style scan while the pool is being
* imported (see dsl_scan_init).
*/
if (dsl_scan_restarting(scn, tx)) {
pool_scan_func_t func = POOL_SCAN_SCRUB;
dsl_scan_done(scn, B_FALSE, tx);
if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
func = POOL_SCAN_RESILVER;
zfs_dbgmsg("restarting scan func=%u txg=%llu",
func, tx->tx_txg);
dsl_scan_setup_sync(&func, tx);
}
/*
* Only process scans in sync pass 1.
*/
if (spa_sync_pass(dp->dp_spa) > 1)
return;
/*
* If the spa is shutting down, then stop scanning. This will
* ensure that the scan does not dirty any new data during the
* shutdown phase.
*/
if (spa_shutting_down(spa))
return;
/*
* If the scan is inactive due to a stalled async destroy, try again.
*/
if (!scn->scn_async_stalled && !dsl_scan_active(scn))
return;
scn->scn_visited_this_txg = 0;
scn->scn_suspending = B_FALSE;
scn->scn_sync_start_time = gethrtime();
spa->spa_scrub_active = B_TRUE;
/*
* First process the async destroys. If we suspend, don't do
* any scrubbing or resilvering. This ensures that there are no
* async destroys while we are scanning, so the scan code doesn't
* have to worry about traversing it. It is also faster to free the
* blocks than to scrub them.
*/
if (zfs_free_bpobj_enabled &&
spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE;
+ scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bpobj_iterate(&dp->dp_free_bpobj,
dsl_scan_free_block_cb, scn, tx);
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
if (err != 0 && err != ERESTART)
zfs_panic_recover("error %u from bpobj_iterate()", err);
}
if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
ASSERT(scn->scn_async_destroying);
scn->scn_is_bptree = B_TRUE;
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bptree_iterate(dp->dp_meta_objset,
dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
VERIFY0(zio_wait(scn->scn_zio_root));
if (err == EIO || err == ECKSUM) {
err = 0;
} else if (err != 0 && err != ERESTART) {
zfs_panic_recover("error %u from "
"traverse_dataset_destroyed()", err);
}
if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
/* finished; deactivate async destroy feature */
spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
ASSERT(!spa_feature_is_active(spa,
SPA_FEATURE_ASYNC_DESTROY));
VERIFY0(zap_remove(dp->dp_meta_objset,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, tx));
VERIFY0(bptree_free(dp->dp_meta_objset,
dp->dp_bptree_obj, tx));
dp->dp_bptree_obj = 0;
scn->scn_async_destroying = B_FALSE;
scn->scn_async_stalled = B_FALSE;
} else {
/*
* If we didn't make progress, mark the async
* destroy as stalled, so that we will not initiate
* a spa_sync() on its behalf. Note that we only
* check this if we are not finished, because if the
* bptree had no blocks for us to visit, we can
* finish without "making progress".
*/
scn->scn_async_stalled =
(scn->scn_visited_this_txg == 0);
}
}
if (scn->scn_visited_this_txg) {
zfs_dbgmsg("freed %llu blocks in %llums from "
"free_bpobj/bptree txg %llu; err=%d",
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
(longlong_t)tx->tx_txg, err);
scn->scn_visited_this_txg = 0;
/*
* Write out changes to the DDT that may be required as a
* result of the blocks freed. This ensures that the DDT
* is clean when a scrub/resilver runs.
*/
ddt_sync(spa, tx->tx_txg);
}
if (err != 0)
return;
if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
zfs_free_leak_on_eio &&
(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
/*
* We have finished background destroying, but there is still
* some space left in the dp_free_dir. Transfer this leaked
* space to the dp_leak_dir.
*/
if (dp->dp_leak_dir == NULL) {
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
LEAK_DIR_NAME, tx);
VERIFY0(dsl_pool_open_special_dir(dp,
LEAK_DIR_NAME, &dp->dp_leak_dir));
rrw_exit(&dp->dp_config_rwlock, FTAG);
}
dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
-dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
-dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
-dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
}
+
if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
/* finished; verify that space accounting went to zero */
ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
+ }
+
+ EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
+ 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ));
+ if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_OBSOLETE_COUNTS));
+
+ scn->scn_is_bptree = B_FALSE;
+ scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
+ err = bpobj_iterate(&dp->dp_obsolete_bpobj,
+ dsl_scan_obsolete_block_cb, scn, tx);
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+
+ if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
+ dsl_pool_destroy_obsolete_bpobj(dp, tx);
}
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
if (scn->scn_done_txg == tx->tx_txg) {
ASSERT(!scn->scn_suspending);
/* finished with scan. */
zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
dsl_scan_done(scn, B_TRUE, tx);
ASSERT3U(spa->spa_scrub_inflight, ==, 0);
dsl_scan_sync_state(scn, tx);
return;
}
if (dsl_scan_is_paused_scrub(scn))
return;
if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
scn->scn_phys.scn_ddt_class_max) {
zfs_dbgmsg("doing scan sync txg %llu; "
"ddt bm=%llu/%llu/%llu/%llx",
(longlong_t)tx->tx_txg,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
} else {
zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
(longlong_t)tx->tx_txg,
(longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
(longlong_t)scn->scn_phys.scn_bookmark.zb_object,
(longlong_t)scn->scn_phys.scn_bookmark.zb_level,
(longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
}
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_CANFAIL);
dsl_pool_config_enter(dp, FTAG);
dsl_scan_visit(scn, tx);
dsl_pool_config_exit(dp, FTAG);
(void) zio_wait(scn->scn_zio_root);
scn->scn_zio_root = NULL;
zfs_dbgmsg("visited %llu blocks in %llums",
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
if (!scn->scn_suspending) {
scn->scn_done_txg = tx->tx_txg + 1;
zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
tx->tx_txg, scn->scn_done_txg);
}
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight > 0) {
cv_wait(&spa->spa_scrub_io_cv,
&spa->spa_scrub_lock);
}
mutex_exit(&spa->spa_scrub_lock);
}
dsl_scan_sync_state(scn, tx);
}
/*
* This will start a new scan, or restart an existing one.
*/
void
dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
{
if (txg == 0) {
dmu_tx_t *tx;
tx = dmu_tx_create_dd(dp->dp_mos_dir);
VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
txg = dmu_tx_get_txg(tx);
dp->dp_scan->scn_restart_txg = txg;
dmu_tx_commit(tx);
} else {
dp->dp_scan->scn_restart_txg = txg;
}
zfs_dbgmsg("restarting resilver txg=%llu", txg);
}
boolean_t
dsl_scan_resilvering(dsl_pool_t *dp)
{
return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
}
/*
* scrub consumers
*/
static void
count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
int i;
/*
* If we resume after a reboot, zab will be NULL; don't record
* incomplete stats in that case.
*/
if (zab == NULL)
return;
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
if (t & DMU_OT_NEWTYPE)
t = DMU_OT_OTHER;
zfs_blkstat_t *zb = &zab->zab_type[l][t];
int equal;
zb->zb_count++;
zb->zb_asize += BP_GET_ASIZE(bp);
zb->zb_lsize += BP_GET_LSIZE(bp);
zb->zb_psize += BP_GET_PSIZE(bp);
zb->zb_gangs += BP_COUNT_GANG(bp);
switch (BP_GET_NDVAS(bp)) {
case 2:
if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
DVA_GET_VDEV(&bp->blk_dva[1]))
zb->zb_ditto_2_of_2_samevdev++;
break;
case 3:
equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
DVA_GET_VDEV(&bp->blk_dva[1])) +
(DVA_GET_VDEV(&bp->blk_dva[0]) ==
DVA_GET_VDEV(&bp->blk_dva[2])) +
(DVA_GET_VDEV(&bp->blk_dva[1]) ==
DVA_GET_VDEV(&bp->blk_dva[2]));
if (equal == 1)
zb->zb_ditto_2_of_3_samevdev++;
else if (equal == 3)
zb->zb_ditto_3_of_3_samevdev++;
break;
}
}
}
static void
dsl_scan_scrub_done(zio_t *zio)
{
spa_t *spa = zio->io_spa;
abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
cv_broadcast(&spa->spa_scrub_io_cv);
if (zio->io_error && (zio->io_error != ECKSUM ||
!(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
}
mutex_exit(&spa->spa_scrub_lock);
}
static int
dsl_scan_scrub_cb(dsl_pool_t *dp,
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = dp->dp_scan;
size_t size = BP_GET_PSIZE(bp);
spa_t *spa = dp->dp_spa;
uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
boolean_t needs_io;
int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
unsigned int scan_delay = 0;
if (phys_birth <= scn->scn_phys.scn_min_txg ||
phys_birth >= scn->scn_phys.scn_max_txg)
return (0);
count_block(dp->dp_blkstats, bp);
if (BP_IS_EMBEDDED(bp))
return (0);
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
zio_flags |= ZIO_FLAG_SCRUB;
needs_io = B_TRUE;
scan_delay = zfs_scrub_delay;
} else {
ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
zio_flags |= ZIO_FLAG_RESILVER;
needs_io = B_FALSE;
scan_delay = zfs_resilver_delay;
}
/* If it's an intent log block, failure is expected. */
if (zb->zb_level == ZB_ZIL_LEVEL)
zio_flags |= ZIO_FLAG_SPECULATIVE;
for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
vdev_t *vd = vdev_lookup_top(spa,
DVA_GET_VDEV(&bp->blk_dva[d]));
/*
* Keep track of how much data we've examined so that
* zpool(1M) status can make useful progress reports.
*/
scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
/* if it's a resilver, this may not be in the target range */
if (!needs_io) {
if (DVA_GET_GANG(&bp->blk_dva[d])) {
/*
* Gang members may be spread across multiple
* vdevs, so the best estimate we have is the
* scrub range, which has already been checked.
* XXX -- it would be better to change our
* allocation policy to ensure that all
* gang members reside on the same vdev.
*/
needs_io = B_TRUE;
} else {
needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
phys_birth, 1);
}
}
}
if (needs_io && !zfs_no_scrub_io) {
vdev_t *rvd = spa->spa_root_vdev;
uint64_t maxinflight = rvd->vdev_children *
MAX(zfs_top_maxinflight, 1);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= maxinflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
/*
* If we're seeing recent (zfs_scan_idle) "important" I/Os
* then throttle our workload to limit the impact of a scan.
*/
if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
delay(MAX((int)scan_delay, 0));
zio_nowait(zio_read(NULL, spa, bp,
abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
}
/* do not relocate this block */
return (0);
}
/*
* Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
* Can also be called to resume a paused scrub.
*/
int
dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
{
spa_t *spa = dp->dp_spa;
dsl_scan_t *scn = dp->dp_scan;
/*
* Purge all vdev caches and probe all devices. We do this here
* rather than in sync context because this requires a writer lock
* on the spa_config lock, which we can't do from sync context. The
* spa_scrub_reopen flag indicates that vdev_open() should not
* attempt to start another scrub.
*/
spa_vdev_state_enter(spa, SCL_NONE);
spa->spa_scrub_reopen = B_TRUE;
vdev_reopen(spa->spa_root_vdev);
spa->spa_scrub_reopen = B_FALSE;
(void) spa_vdev_state_exit(spa, NULL, 0);
if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
/* got scrub start cmd, resume paused scrub */
int err = dsl_scrub_set_pause_resume(scn->scn_dp,
POOL_SCRUB_NORMAL);
if (err == 0) {
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
return (ECANCELED);
}
return (SET_ERROR(err));
}
return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
}
static boolean_t
dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
{
return (scn->scn_restart_txg != 0 &&
scn->scn_restart_txg <= tx->tx_txg);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c (revision 332525)
@@ -1,3506 +1,3899 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
+#include <sys/vdev_indirect_mapping.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
#define GANG_ALLOCATION(flags) \
((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
&metaslab_gang_bang, 0,
"Force gang block allocation for blocks larger than or equal to this value");
/*
* The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core
* space map representation must be before we compact it on-disk.
* Values should be greater than or equal to 100.
*/
int zfs_condense_pct = 200;
SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
&zfs_condense_pct, 0,
"Condense on-disk spacemap when it is more than this many percents"
" of in-memory counterpart");
/*
* Condensing a metaslab is not guaranteed to actually reduce the amount of
* space used on disk. In particular, a space map uses data in increments of
* MAX(1 << ashift, space_map_blksize), so a metaslab might use the
* same number of blocks after condensing. Since the goal of condensing is to
* reduce the number of IOPs required to read the space map, we only want to
* condense when we can be sure we will reduce the number of blocks used by the
* space map. Unfortunately, we cannot precisely compute whether or not this is
* the case in metaslab_should_condense since we are holding ms_lock. Instead,
* we apply the following heuristic: do not condense a spacemap unless the
* uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
* blocks.
*/
int zfs_metaslab_condense_block_threshold = 4;
/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of
* free space. Metaslab groups that have more free space than
* zfs_mg_noalloc_threshold are always eligible for allocations. Once
* a metaslab group's free space is less than or equal to the
* zfs_mg_noalloc_threshold the allocator will avoid allocating to that
* group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
* Once all groups in the pool reach zfs_mg_noalloc_threshold then all
* groups are allowed to accept allocations. Gang blocks are always
* eligible to allocate on any metaslab group. The default value of 0 means
* no metaslab group will be excluded based on this criterion.
*/
int zfs_mg_noalloc_threshold = 0;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
&zfs_mg_noalloc_threshold, 0,
"Percentage of metaslab group size that should be free"
" to make it eligible for allocation");
/*
* Metaslab groups are considered eligible for allocations if their
* fragmenation metric (measured as a percentage) is less than or equal to
* zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
* then it will be skipped unless all metaslab groups within the metaslab
* class have also crossed this threshold.
*/
int zfs_mg_fragmentation_threshold = 85;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
&zfs_mg_fragmentation_threshold, 0,
"Percentage of metaslab group size that should be considered "
"eligible for allocations unless all metaslab groups within the metaslab class "
"have also crossed this threshold");
/*
* Allow metaslabs to keep their active state as long as their fragmentation
* percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
* active metaslab that exceeds this threshold will no longer keep its active
* status allowing better metaslabs to be selected.
*/
int zfs_metaslab_fragmentation_threshold = 70;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
&zfs_metaslab_fragmentation_threshold, 0,
"Maximum percentage of metaslab fragmentation level to keep their active state");
/*
* When set will load all metaslabs when pool is first opened.
*/
int metaslab_debug_load = 0;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
&metaslab_debug_load, 0,
"Load all metaslabs when pool is first opened");
/*
* When set will prevent metaslabs from being unloaded.
*/
int metaslab_debug_unload = 0;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
&metaslab_debug_unload, 0,
"Prevent metaslabs from being unloaded");
/*
* Minimum size which forces the dynamic allocator to change
* it's allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using more
* aggressive strategy (i.e search by size rather than offset).
*/
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
&metaslab_df_alloc_threshold, 0,
"Minimum size which forces the dynamic allocator to change it's allocation strategy");
/*
* The minimum free space, in percent, which must be available
* in a space map to continue allocations in a first-fit fashion.
* Once the space map's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
int metaslab_df_free_pct = 4;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
&metaslab_df_free_pct, 0,
"The minimum free space, in percent, which must be available in a "
"space map to continue allocations in a first-fit fashion");
/*
* A metaslab is considered "free" if it contains a contiguous
* segment which is greater than metaslab_min_alloc_size.
*/
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
&metaslab_min_alloc_size, 0,
"A metaslab is considered \"free\" if it contains a contiguous "
"segment which is greater than vfs.zfs.metaslab.min_alloc_size");
/*
* Percentage of all cpus that can be used by the metaslab taskq.
*/
int metaslab_load_pct = 50;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
&metaslab_load_pct, 0,
"Percentage of cpus that can be used by the metaslab taskq");
/*
* Determines how many txgs a metaslab may remain loaded without having any
* allocations from it. As long as a metaslab continues to be used we will
* keep it loaded.
*/
int metaslab_unload_delay = TXG_SIZE * 2;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
&metaslab_unload_delay, 0,
"Number of TXGs that an unused metaslab can be kept in memory");
/*
* Max number of metaslabs per group to preload.
*/
int metaslab_preload_limit = SPA_DVAS_PER_BP;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
&metaslab_preload_limit, 0,
"Max number of metaslabs per group to preload");
/*
* Enable/disable preloading of metaslab.
*/
boolean_t metaslab_preload_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
&metaslab_preload_enabled, 0,
"Max number of metaslabs per group to preload");
/*
* Enable/disable fragmentation weighting on metaslabs.
*/
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
&metaslab_fragmentation_factor_enabled, 0,
"Enable fragmentation weighting on metaslabs");
/*
* Enable/disable lba weighting (i.e. outer tracks are given preference).
*/
boolean_t metaslab_lba_weighting_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
&metaslab_lba_weighting_enabled, 0,
"Enable LBA weighting (i.e. outer tracks are given preference)");
/*
* Enable/disable metaslab group biasing.
*/
boolean_t metaslab_bias_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
&metaslab_bias_enabled, 0,
"Enable metaslab group biasing");
/*
+ * Enable/disable remapping of indirect DVAs to their concrete vdevs.
+ */
+boolean_t zfs_remap_blkptr_enable = B_TRUE;
+
+/*
* Enable/disable segment-based metaslab selection.
*/
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
/*
* When using segment-based metaslab selection, we will continue
* allocating from the active metaslab until we have exhausted
* zfs_metaslab_switch_threshold of its buckets.
*/
int zfs_metaslab_switch_threshold = 2;
/*
* Internal switch to enable/disable the metaslab allocation tracing
* facility.
*/
boolean_t metaslab_trace_enabled = B_TRUE;
/*
* Maximum entries that the metaslab allocation tracing facility will keep
* in a given list when running in non-debug mode. We limit the number
* of entries in non-debug mode to prevent us from using up too much memory.
* The limit should be sufficiently large that we don't expect any allocation
* to every exceed this value. In debug mode, the system will panic if this
* limit is ever reached allowing for further investigation.
*/
uint64_t metaslab_trace_max_entries = 5000;
static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
+static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
kmem_cache_t *metaslab_alloc_trace_cache;
/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
refcount_create_tracked(&mc->mc_alloc_slots);
return (mc);
}
void
metaslab_class_destroy(metaslab_class_t *mc)
{
ASSERT(mc->mc_rotor == NULL);
ASSERT(mc->mc_alloc == 0);
ASSERT(mc->mc_deferred == 0);
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
refcount_destroy(&mc->mc_alloc_slots);
mutex_destroy(&mc->mc_lock);
kmem_free(mc, sizeof (metaslab_class_t));
}
int
metaslab_class_validate(metaslab_class_t *mc)
{
metaslab_group_t *mg;
vdev_t *vd;
/*
* Must hold one of the spa_config locks.
*/
ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
if ((mg = mc->mc_rotor) == NULL)
return (0);
do {
vd = mg->mg_vd;
ASSERT(vd->vdev_mg != NULL);
ASSERT3P(vd->vdev_top, ==, vd);
ASSERT3P(mg->mg_class, ==, mc);
ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
} while ((mg = mg->mg_next) != mc->mc_rotor);
return (0);
}
void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
atomic_add_64(&mc->mc_alloc, alloc_delta);
atomic_add_64(&mc->mc_deferred, defer_delta);
atomic_add_64(&mc->mc_space, space_delta);
atomic_add_64(&mc->mc_dspace, dspace_delta);
}
void
metaslab_class_minblocksize_update(metaslab_class_t *mc)
{
metaslab_group_t *mg;
vdev_t *vd;
uint64_t minashift = UINT64_MAX;
if ((mg = mc->mc_rotor) == NULL) {
mc->mc_minblocksize = SPA_MINBLOCKSIZE;
return;
}
do {
vd = mg->mg_vd;
if (vd->vdev_ashift < minashift)
minashift = vd->vdev_ashift;
} while ((mg = mg->mg_next) != mc->mc_rotor);
mc->mc_minblocksize = 1ULL << minashift;
}
uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
return (mc->mc_alloc);
}
uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
return (mc->mc_deferred);
}
uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
return (mc->mc_space);
}
uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}
uint64_t
metaslab_class_get_minblocksize(metaslab_class_t *mc)
{
return (mc->mc_minblocksize);
}
void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
vdev_t *rvd = mc->mc_spa->spa_root_vdev;
uint64_t *mc_hist;
int i;
if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;
mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
KM_SLEEP);
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
/*
* Skip any holes, uninitialized top-levels, or
* vdevs that are not in this metalab class.
*/
- if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
mg->mg_class != mc) {
continue;
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
mc_hist[i] += mg->mg_histogram[i];
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}
/*
* Calculate the metaslab class's fragmentation metric. The metric
* is weighted based on the space contribution of each metaslab group.
* The return value will be a number between 0 and 100 (inclusive), or
* ZFS_FRAG_INVALID if the metric has not been set. See comment above the
* zfs_frag_table for more information about the metric.
*/
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
vdev_t *rvd = mc->mc_spa->spa_root_vdev;
uint64_t fragmentation = 0;
spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
/*
- * Skip any holes, uninitialized top-levels, or
- * vdevs that are not in this metalab class.
+ * Skip any holes, uninitialized top-levels,
+ * or vdevs that are not in this metalab class.
*/
- if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
mg->mg_class != mc) {
continue;
}
/*
* If a metaslab group does not contain a fragmentation
* metric then just bail out.
*/
if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
return (ZFS_FRAG_INVALID);
}
/*
* Determine how much this metaslab_group is contributing
* to the overall pool fragmentation metric.
*/
fragmentation += mg->mg_fragmentation *
metaslab_group_get_space(mg);
}
fragmentation /= metaslab_class_get_space(mc);
ASSERT3U(fragmentation, <=, 100);
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
return (fragmentation);
}
/*
* Calculate the amount of expandable space that is available in
* this metaslab class. If a device is expanded then its expandable
* space will be the amount of allocatable space that is currently not
* part of this metaslab class.
*/
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
vdev_t *rvd = mc->mc_spa->spa_root_vdev;
uint64_t space = 0;
spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
for (int c = 0; c < rvd->vdev_children; c++) {
uint64_t tspace;
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
- if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
mg->mg_class != mc) {
continue;
}
/*
* Calculate if we have enough space to add additional
* metaslabs. We report the expandable space in terms
* of the metaslab size since that's the unit of expansion.
* Adjust by efi system partition size.
*/
tspace = tvd->vdev_max_asize - tvd->vdev_asize;
if (tspace > mc->mc_spa->spa_bootsize) {
tspace -= mc->mc_spa->spa_bootsize;
}
space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
}
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
return (space);
}
static int
metaslab_compare(const void *x1, const void *x2)
{
const metaslab_t *m1 = x1;
const metaslab_t *m2 = x2;
if (m1->ms_weight < m2->ms_weight)
return (1);
if (m1->ms_weight > m2->ms_weight)
return (-1);
/*
* If the weights are identical, use the offset to force uniqueness.
*/
if (m1->ms_start < m2->ms_start)
return (-1);
if (m1->ms_start > m2->ms_start)
return (1);
ASSERT3P(m1, ==, m2);
return (0);
}
/*
* Verify that the space accounting on disk matches the in-core range_trees.
*/
void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
uint64_t allocated = 0;
uint64_t sm_free_space, msp_free_space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/*
* We can only verify the metaslab space when we're called
* from syncing context with a loaded metaslab that has an allocated
* space map. Calling this in non-syncing context does not
* provide a consistent view of the metaslab since we're performing
* allocations in the future.
*/
if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
!msp->ms_loaded)
return;
sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
space_map_alloc_delta(msp->ms_sm);
/*
* Account for future allocations since we would have already
* deducted that space from the ms_freetree.
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
allocated +=
range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
}
msp_free_space = range_tree_space(msp->ms_tree) + allocated +
msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
VERIFY3U(sm_free_space, ==, msp_free_space);
}
/*
* ==========================================================================
* Metaslab groups
* ==========================================================================
*/
/*
* Update the allocatable flag and the metaslab group's capacity.
* The allocatable flag is set to true if the capacity is below
* the zfs_mg_noalloc_threshold or has a fragmentation value that is
* greater than zfs_mg_fragmentation_threshold. If a metaslab group
* transitions from allocatable to non-allocatable or vice versa then the
* metaslab group's class is updated to reflect the transition.
*/
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
metaslab_class_t *mc = mg->mg_class;
vdev_stat_t *vs = &vd->vdev_stat;
boolean_t was_allocatable;
boolean_t was_initialized;
ASSERT(vd == vd->vdev_top);
+ ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
+ SCL_ALLOC);
mutex_enter(&mg->mg_lock);
was_allocatable = mg->mg_allocatable;
was_initialized = mg->mg_initialized;
mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
(vs->vs_space + 1);
mutex_enter(&mc->mc_lock);
/*
* If the metaslab group was just added then it won't
* have any space until we finish syncing out this txg.
* At that point we will consider it initialized and available
* for allocations. We also don't consider non-activated
* metaslab groups (e.g. vdevs that are in the middle of being removed)
* to be initialized, because they can't be used for allocation.
*/
mg->mg_initialized = metaslab_group_initialized(mg);
if (!was_initialized && mg->mg_initialized) {
mc->mc_groups++;
} else if (was_initialized && !mg->mg_initialized) {
ASSERT3U(mc->mc_groups, >, 0);
mc->mc_groups--;
}
if (mg->mg_initialized)
mg->mg_no_free_space = B_FALSE;
/*
* A metaslab group is considered allocatable if it has plenty
* of free space or is not heavily fragmented. We only take
* fragmentation into account if the metaslab group has a valid
* fragmentation metric (i.e. a value between 0 and 100).
*/
mg->mg_allocatable = (mg->mg_activation_count > 0 &&
mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
/*
* The mc_alloc_groups maintains a count of the number of
* groups in this metaslab class that are still above the
* zfs_mg_noalloc_threshold. This is used by the allocating
* threads to determine if they should avoid allocations to
* a given group. The allocator will avoid allocations to a group
* if that group has reached or is below the zfs_mg_noalloc_threshold
* and there are still other groups that are above the threshold.
* When a group transitions from allocatable to non-allocatable or
* vice versa we update the metaslab class to reflect that change.
* When the mc_alloc_groups value drops to 0 that means that all
* groups have reached the zfs_mg_noalloc_threshold making all groups
* eligible for allocations. This effectively means that all devices
* are balanced again.
*/
if (was_allocatable && !mg->mg_allocatable)
mc->mc_alloc_groups--;
else if (!was_allocatable && mg->mg_allocatable)
mc->mc_alloc_groups++;
mutex_exit(&mc->mc_lock);
mutex_exit(&mg->mg_lock);
}
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
metaslab_group_t *mg;
mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
mg->mg_vd = vd;
mg->mg_class = mc;
mg->mg_activation_count = 0;
mg->mg_initialized = B_FALSE;
mg->mg_no_free_space = B_TRUE;
refcount_create_tracked(&mg->mg_alloc_queue_depth);
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
return (mg);
}
void
metaslab_group_destroy(metaslab_group_t *mg)
{
ASSERT(mg->mg_prev == NULL);
ASSERT(mg->mg_next == NULL);
/*
* We may have gone below zero with the activation count
* either because we never activated in the first place or
* because we're done, and possibly removing the vdev.
*/
ASSERT(mg->mg_activation_count <= 0);
taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
refcount_destroy(&mg->mg_alloc_queue_depth);
kmem_free(mg, sizeof (metaslab_group_t));
}
void
metaslab_group_activate(metaslab_group_t *mg)
{
metaslab_class_t *mc = mg->mg_class;
metaslab_group_t *mgprev, *mgnext;
- ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+ ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
ASSERT(mc->mc_rotor != mg);
ASSERT(mg->mg_prev == NULL);
ASSERT(mg->mg_next == NULL);
ASSERT(mg->mg_activation_count <= 0);
if (++mg->mg_activation_count <= 0)
return;
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
metaslab_group_alloc_update(mg);
if ((mgprev = mc->mc_rotor) == NULL) {
mg->mg_prev = mg;
mg->mg_next = mg;
} else {
mgnext = mgprev->mg_next;
mg->mg_prev = mgprev;
mg->mg_next = mgnext;
mgprev->mg_next = mg;
mgnext->mg_prev = mg;
}
mc->mc_rotor = mg;
metaslab_class_minblocksize_update(mc);
}
+/*
+ * Passivate a metaslab group and remove it from the allocation rotor.
+ * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
+ * a metaslab group. This function will momentarily drop spa_config_locks
+ * that are lower than the SCL_ALLOC lock (see comment below).
+ */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
metaslab_class_t *mc = mg->mg_class;
+ spa_t *spa = mc->mc_spa;
metaslab_group_t *mgprev, *mgnext;
+ int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
- ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+ ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
+ (SCL_ALLOC | SCL_ZIO));
if (--mg->mg_activation_count != 0) {
ASSERT(mc->mc_rotor != mg);
ASSERT(mg->mg_prev == NULL);
ASSERT(mg->mg_next == NULL);
ASSERT(mg->mg_activation_count < 0);
return;
}
+ /*
+ * The spa_config_lock is an array of rwlocks, ordered as
+ * follows (from highest to lowest):
+ * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
+ * SCL_ZIO > SCL_FREE > SCL_VDEV
+ * (For more information about the spa_config_lock see spa_misc.c)
+ * The higher the lock, the broader its coverage. When we passivate
+ * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
+ * config locks. However, the metaslab group's taskq might be trying
+ * to preload metaslabs so we must drop the SCL_ZIO lock and any
+ * lower locks to allow the I/O to complete. At a minimum,
+ * we continue to hold the SCL_ALLOC lock, which prevents any future
+ * allocations from taking place and any changes to the vdev tree.
+ */
+ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
taskq_wait(mg->mg_taskq);
+ spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
metaslab_group_alloc_update(mg);
mgprev = mg->mg_prev;
mgnext = mg->mg_next;
if (mg == mgnext) {
mc->mc_rotor = NULL;
} else {
mc->mc_rotor = mgnext;
mgprev->mg_next = mgnext;
mgnext->mg_prev = mgprev;
}
mg->mg_prev = NULL;
mg->mg_next = NULL;
metaslab_class_minblocksize_update(mc);
}
boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
vdev_stat_t *vs = &vd->vdev_stat;
return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}
uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}
void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
uint64_t *mg_hist;
vdev_t *vd = mg->mg_vd;
uint64_t ashift = vd->vdev_ashift;
int i;
if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;
mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
KM_SLEEP);
ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
SPACE_MAP_HISTOGRAM_SIZE + ashift);
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp->ms_sm == NULL)
continue;
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
mg_hist[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}
static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
metaslab_class_t *mc = mg->mg_class;
uint64_t ashift = mg->mg_vd->vdev_ashift;
ASSERT(MUTEX_HELD(&msp->ms_lock));
if (msp->ms_sm == NULL)
return;
mutex_enter(&mg->mg_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
mg->mg_histogram[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
mc->mc_histogram[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
}
mutex_exit(&mg->mg_lock);
}
void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
metaslab_class_t *mc = mg->mg_class;
uint64_t ashift = mg->mg_vd->vdev_ashift;
ASSERT(MUTEX_HELD(&msp->ms_lock));
if (msp->ms_sm == NULL)
return;
mutex_enter(&mg->mg_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
ASSERT3U(mg->mg_histogram[i + ashift], >=,
msp->ms_sm->sm_phys->smp_histogram[i]);
ASSERT3U(mc->mc_histogram[i + ashift], >=,
msp->ms_sm->sm_phys->smp_histogram[i]);
mg->mg_histogram[i + ashift] -=
msp->ms_sm->sm_phys->smp_histogram[i];
mc->mc_histogram[i + ashift] -=
msp->ms_sm->sm_phys->smp_histogram[i];
}
mutex_exit(&mg->mg_lock);
}
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
ASSERT(msp->ms_group == NULL);
mutex_enter(&mg->mg_lock);
msp->ms_group = mg;
msp->ms_weight = 0;
avl_add(&mg->mg_metaslab_tree, msp);
mutex_exit(&mg->mg_lock);
mutex_enter(&msp->ms_lock);
metaslab_group_histogram_add(mg, msp);
mutex_exit(&msp->ms_lock);
}
static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
mutex_enter(&msp->ms_lock);
metaslab_group_histogram_remove(mg, msp);
mutex_exit(&msp->ms_lock);
mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == mg);
avl_remove(&mg->mg_metaslab_tree, msp);
msp->ms_group = NULL;
mutex_exit(&mg->mg_lock);
}
static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
/*
* Although in principle the weight can be any value, in
* practice we do not use values in the range [1, 511].
*/
ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
ASSERT(MUTEX_HELD(&msp->ms_lock));
mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == mg);
avl_remove(&mg->mg_metaslab_tree, msp);
msp->ms_weight = weight;
avl_add(&mg->mg_metaslab_tree, msp);
mutex_exit(&mg->mg_lock);
}
/*
* Calculate the fragmentation for a given metaslab group. We can use
* a simple average here since all metaslabs within the group must have
* the same size. The return value will be a value between 0 and 100
* (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
* group have a fragmentation metric.
*/
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
uint64_t fragmentation = 0;
uint64_t valid_ms = 0;
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
continue;
valid_ms++;
fragmentation += msp->ms_fragmentation;
}
if (valid_ms <= vd->vdev_ms_count / 2)
return (ZFS_FRAG_INVALID);
fragmentation /= valid_ms;
ASSERT3U(fragmentation, <=, 100);
return (fragmentation);
}
/*
* Determine if a given metaslab group should skip allocations. A metaslab
* group should avoid allocations if its free capacity is less than the
* zfs_mg_noalloc_threshold or its fragmentation metric is greater than
* zfs_mg_fragmentation_threshold and there is at least one metaslab group
* that can still handle allocations. If the allocation throttle is enabled
* then we skip allocations to devices that have reached their maximum
* allocation queue depth unless the selected metaslab group is the only
* eligible group remaining.
*/
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
uint64_t psize)
{
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_class_t *mc = mg->mg_class;
/*
* We can only consider skipping this metaslab group if it's
* in the normal metaslab class and there are other metaslab
* groups to select from. Otherwise, we always consider it eligible
* for allocations.
*/
if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
return (B_TRUE);
/*
* If the metaslab group's mg_allocatable flag is set (see comments
* in metaslab_group_alloc_update() for more information) and
* the allocation throttle is disabled then allow allocations to this
* device. However, if the allocation throttle is enabled then
* check if we have reached our allocation limit (mg_alloc_queue_depth)
* to determine if we should allow allocations to this metaslab group.
* If all metaslab groups are no longer considered allocatable
* (mc_alloc_groups == 0) or we're trying to allocate the smallest
* gang block size then we allow allocations on this metaslab group
* regardless of the mg_allocatable or throttle settings.
*/
if (mg->mg_allocatable) {
metaslab_group_t *mgp;
int64_t qdepth;
uint64_t qmax = mg->mg_max_alloc_queue_depth;
if (!mc->mc_alloc_throttle_enabled)
return (B_TRUE);
/*
* If this metaslab group does not have any free space, then
* there is no point in looking further.
*/
if (mg->mg_no_free_space)
return (B_FALSE);
qdepth = refcount_count(&mg->mg_alloc_queue_depth);
/*
* If this metaslab group is below its qmax or it's
* the only allocatable metasable group, then attempt
* to allocate from it.
*/
if (qdepth < qmax || mc->mc_alloc_groups == 1)
return (B_TRUE);
ASSERT3U(mc->mc_alloc_groups, >, 1);
/*
* Since this metaslab group is at or over its qmax, we
* need to determine if there are metaslab groups after this
* one that might be able to handle this allocation. This is
* racy since we can't hold the locks for all metaslab
* groups at the same time when we make this check.
*/
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
qmax = mgp->mg_max_alloc_queue_depth;
qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
/*
* If there is another metaslab group that
* might be able to handle the allocation, then
* we return false so that we skip this group.
*/
if (qdepth < qmax && !mgp->mg_no_free_space)
return (B_FALSE);
}
/*
* We didn't find another group to handle the allocation
* so we can't skip this metaslab group even though
* we are at or over our qmax.
*/
return (B_TRUE);
} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
return (B_TRUE);
}
return (B_FALSE);
}
/*
* ==========================================================================
* Range tree callbacks
* ==========================================================================
*/
/*
* Comparison function for the private size-ordered tree. Tree is sorted
* by size, larger sizes at the end of the tree.
*/
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
const range_seg_t *r1 = x1;
const range_seg_t *r2 = x2;
uint64_t rs_size1 = r1->rs_end - r1->rs_start;
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
if (rs_size1 < rs_size2)
return (-1);
if (rs_size1 > rs_size2)
return (1);
if (r1->rs_start < r2->rs_start)
return (-1);
if (r1->rs_start > r2->rs_start)
return (1);
return (0);
}
/*
* Create any block allocator specific components. The current allocators
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
*/
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT(msp->ms_tree == NULL);
avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
/*
* Destroy the block allocator specific components.
*/
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_tree, ==, rt);
ASSERT0(avl_numnodes(&msp->ms_size_tree));
avl_destroy(&msp->ms_size_tree);
}
static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_tree, ==, rt);
VERIFY(!msp->ms_condensing);
avl_add(&msp->ms_size_tree, rs);
}
static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_tree, ==, rt);
VERIFY(!msp->ms_condensing);
avl_remove(&msp->ms_size_tree, rs);
}
static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_tree, ==, rt);
/*
* Normally one would walk the tree freeing nodes along the way.
* Since the nodes are shared with the range trees we can avoid
* walking all nodes and just reinitialize the avl tree. The nodes
* will be freed by the range tree, so we don't want to free them here.
*/
avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
static range_tree_ops_t metaslab_rt_ops = {
metaslab_rt_create,
metaslab_rt_destroy,
metaslab_rt_add,
metaslab_rt_remove,
metaslab_rt_vacate
};
/*
* ==========================================================================
* Common allocator routines
* ==========================================================================
*/
/*
* Return the maximum contiguous segment within the metaslab.
*/
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
avl_tree_t *t = &msp->ms_size_tree;
range_seg_t *rs;
if (t == NULL || (rs = avl_last(t)) == NULL)
return (0ULL);
return (rs->rs_end - rs->rs_start);
}
static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
range_seg_t *rs, rsearch;
avl_index_t where;
rsearch.rs_start = start;
rsearch.rs_end = start + size;
rs = avl_find(t, &rsearch, &where);
if (rs == NULL) {
rs = avl_nearest(t, where, AVL_AFTER);
}
return (rs);
}
/*
* This is a helper function that can be used by the allocator to find
* a suitable block to allocate. This will search the specified AVL
* tree looking for a block that matches the specified criteria.
*/
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
uint64_t align)
{
range_seg_t *rs = metaslab_block_find(t, *cursor, size);
while (rs != NULL) {
uint64_t offset = P2ROUNDUP(rs->rs_start, align);
if (offset + size <= rs->rs_end) {
*cursor = offset + size;
return (offset);
}
rs = AVL_NEXT(t, rs);
}
/*
* If we know we've searched the whole map (*cursor == 0), give up.
* Otherwise, reset the cursor to the beginning and try again.
*/
if (*cursor == 0)
return (-1ULL);
*cursor = 0;
return (metaslab_block_picker(t, cursor, size, align));
}
/*
* ==========================================================================
* The first-fit block allocator
* ==========================================================================
*/
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
/*
* Find the largest power of 2 block size that evenly divides the
* requested size. This is used to try to allocate blocks with similar
* alignment from the same area of the metaslab (i.e. same cursor
* bucket) but it does not guarantee that other allocations sizes
* may exist in the same region.
*/
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
avl_tree_t *t = &msp->ms_tree->rt_root;
return (metaslab_block_picker(t, cursor, size, align));
}
static metaslab_ops_t metaslab_ff_ops = {
metaslab_ff_alloc
};
/*
* ==========================================================================
* Dynamic block allocator -
* Uses the first fit allocation scheme until space get low and then
* adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
* and metaslab_df_free_pct to determine when to switch the allocation scheme.
* ==========================================================================
*/
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
/*
* Find the largest power of 2 block size that evenly divides the
* requested size. This is used to try to allocate blocks with similar
* alignment from the same area of the metaslab (i.e. same cursor
* bucket) but it does not guarantee that other allocations sizes
* may exist in the same region.
*/
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
range_tree_t *rt = msp->ms_tree;
avl_tree_t *t = &rt->rt_root;
uint64_t max_size = metaslab_block_maxsize(msp);
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
if (max_size < size)
return (-1ULL);
/*
* If we're running low on space switch to using the size
* sorted AVL tree (best-fit).
*/
if (max_size < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
t = &msp->ms_size_tree;
*cursor = 0;
}
return (metaslab_block_picker(t, cursor, size, 1ULL));
}
static metaslab_ops_t metaslab_df_ops = {
metaslab_df_alloc
};
/*
* ==========================================================================
* Cursor fit block allocator -
* Select the largest region in the metaslab, set the cursor to the beginning
* of the range and the cursor_end to the end of the range. As allocations
* are made advance the cursor. Continue allocating from the cursor until
* the range is exhausted and then find a new range.
* ==========================================================================
*/
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
range_tree_t *rt = msp->ms_tree;
avl_tree_t *t = &msp->ms_size_tree;
uint64_t *cursor = &msp->ms_lbas[0];
uint64_t *cursor_end = &msp->ms_lbas[1];
uint64_t offset = 0;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
ASSERT3U(*cursor_end, >=, *cursor);
if ((*cursor + size) > *cursor_end) {
range_seg_t *rs;
rs = avl_last(&msp->ms_size_tree);
if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
return (-1ULL);
*cursor = rs->rs_start;
*cursor_end = rs->rs_end;
}
offset = *cursor;
*cursor += size;
return (offset);
}
static metaslab_ops_t metaslab_cf_ops = {
metaslab_cf_alloc
};
/*
* ==========================================================================
* New dynamic fit allocator -
* Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
* contiguous blocks. If no region is found then just use the largest segment
* that remains.
* ==========================================================================
*/
/*
* Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
* to request from the allocator.
*/
uint64_t metaslab_ndf_clump_shift = 4;
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
avl_tree_t *t = &msp->ms_tree->rt_root;
avl_index_t where;
range_seg_t *rs, rsearch;
uint64_t hbit = highbit64(size);
uint64_t *cursor = &msp->ms_lbas[hbit - 1];
uint64_t max_size = metaslab_block_maxsize(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
if (max_size < size)
return (-1ULL);
rsearch.rs_start = *cursor;
rsearch.rs_end = *cursor + size;
rs = avl_find(t, &rsearch, &where);
if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
t = &msp->ms_size_tree;
rsearch.rs_start = 0;
rsearch.rs_end = MIN(max_size,
1ULL << (hbit + metaslab_ndf_clump_shift));
rs = avl_find(t, &rsearch, &where);
if (rs == NULL)
rs = avl_nearest(t, where, AVL_AFTER);
ASSERT(rs != NULL);
}
if ((rs->rs_end - rs->rs_start) >= size) {
*cursor = rs->rs_start + size;
return (rs->rs_start);
}
return (-1ULL);
}
static metaslab_ops_t metaslab_ndf_ops = {
metaslab_ndf_alloc
};
metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
/*
* ==========================================================================
* Metaslabs
* ==========================================================================
*/
/*
* Wait for any in-progress metaslab loads to complete.
*/
void
metaslab_load_wait(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
while (msp->ms_loading) {
ASSERT(!msp->ms_loaded);
cv_wait(&msp->ms_load_cv, &msp->ms_lock);
}
}
int
metaslab_load(metaslab_t *msp)
{
int error = 0;
boolean_t success = B_FALSE;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(!msp->ms_loaded);
ASSERT(!msp->ms_loading);
msp->ms_loading = B_TRUE;
+ /*
+ * Nobody else can manipulate a loading metaslab, so it's now safe
+ * to drop the lock. This way we don't have to hold the lock while
+ * reading the spacemap from disk.
+ */
+ mutex_exit(&msp->ms_lock);
/*
* If the space map has not been allocated yet, then treat
* all the space in the metaslab as free and add it to the
* ms_tree.
*/
if (msp->ms_sm != NULL)
error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
else
range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
success = (error == 0);
+
+ mutex_enter(&msp->ms_lock);
msp->ms_loading = B_FALSE;
if (success) {
ASSERT3P(msp->ms_group, !=, NULL);
msp->ms_loaded = B_TRUE;
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defertree[t],
range_tree_remove, msp->ms_tree);
}
msp->ms_max_size = metaslab_block_maxsize(msp);
}
cv_broadcast(&msp->ms_load_cv);
return (error);
}
void
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
range_tree_vacate(msp->ms_tree, NULL, NULL);
msp->ms_loaded = B_FALSE;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
}
int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
metaslab_t **msp)
{
vdev_t *vd = mg->mg_vd;
objset_t *mos = vd->vdev_spa->spa_meta_objset;
metaslab_t *ms;
int error;
ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
ms->ms_id = id;
ms->ms_start = id << vd->vdev_ms_shift;
ms->ms_size = 1ULL << vd->vdev_ms_shift;
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
*/
if (object != 0) {
error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
- ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
+ ms->ms_size, vd->vdev_ashift);
if (error != 0) {
kmem_free(ms, sizeof (metaslab_t));
return (error);
}
ASSERT(ms->ms_sm != NULL);
}
/*
* We create the main range tree here, but we don't create the
* other range trees until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
- ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
+ ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms);
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms);
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
* If we're adding space to an existing pool, the new space
* does not become available until after this txg has synced.
* The metaslab's weight will also be initialized when we sync
* out this txg. This ensures that we don't attempt to allocate
* from it before we have initialized it completely.
*/
if (txg <= TXG_INITIAL)
metaslab_sync_done(ms, 0);
/*
* If metaslab_debug_load is set and we're initializing a metaslab
* that has an allocated space map object then load the its space
* map so that can verify frees.
*/
if (metaslab_debug_load && ms->ms_sm != NULL) {
mutex_enter(&ms->ms_lock);
VERIFY0(metaslab_load(ms));
mutex_exit(&ms->ms_lock);
}
if (txg != 0) {
vdev_dirty(vd, 0, NULL, txg);
vdev_dirty(vd, VDD_METASLAB, ms, txg);
}
*msp = ms;
return (0);
}
void
metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
metaslab_group_remove(mg, msp);
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
0, -msp->ms_size);
space_map_close(msp->ms_sm);
metaslab_unload(msp);
range_tree_destroy(msp->ms_tree);
range_tree_destroy(msp->ms_freeingtree);
range_tree_destroy(msp->ms_freedtree);
for (int t = 0; t < TXG_SIZE; t++) {
range_tree_destroy(msp->ms_alloctree[t]);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_destroy(msp->ms_defertree[t]);
}
ASSERT0(msp->ms_deferspace);
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
+ mutex_destroy(&msp->ms_sync_lock);
kmem_free(msp, sizeof (metaslab_t));
}
#define FRAGMENTATION_TABLE_SIZE 17
/*
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
* by calculating the space in each bucket of the spacemap histogram and
* multiplying that by the fragmetation metric in this table. Doing
* this for all buckets and dividing it by the total amount of free
* space in this metaslab (i.e. the total free space in all buckets) gives
* us the fragmentation metric. This means that a high fragmentation metric
* equates to most of the free space being comprised of small segments.
* Conversely, if the metric is low, then most of the free space is in
* large segments. A 10% change in fragmentation equates to approximately
* double the number of segments.
*
* This table defines 0% fragmented space using 16MB segments. Testing has
* shown that segments that are greater than or equal to 16MB do not suffer
* from drastic performance problems. Using this value, we derive the rest
* of the table. Since the fragmentation value is never stored on disk, it
* is possible to change these calculations in the future.
*/
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
100, /* 512B */
100, /* 1K */
98, /* 2K */
95, /* 4K */
90, /* 8K */
80, /* 16K */
70, /* 32K */
60, /* 64K */
50, /* 128K */
40, /* 256K */
30, /* 512K */
20, /* 1M */
15, /* 2M */
10, /* 4M */
5, /* 8M */
0 /* 16M */
};
/*
* Calclate the metaslab's fragmentation metric. A return value
* of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
* not support this metric. Otherwise, the return value should be in the
* range [0, 100].
*/
static void
metaslab_set_fragmentation(metaslab_t *msp)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
uint64_t fragmentation = 0;
uint64_t total = 0;
boolean_t feature_enabled = spa_feature_is_enabled(spa,
SPA_FEATURE_SPACEMAP_HISTOGRAM);
if (!feature_enabled) {
msp->ms_fragmentation = ZFS_FRAG_INVALID;
return;
}
/*
* A null space map means that the entire metaslab is free
* and thus is not fragmented.
*/
if (msp->ms_sm == NULL) {
msp->ms_fragmentation = 0;
return;
}
/*
* If this metaslab's space map has not been upgraded, flag it
* so that we upgrade next time we encounter it.
*/
if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
uint64_t txg = spa_syncing_txg(spa);
vdev_t *vd = msp->ms_group->mg_vd;
/*
* If we've reached the final dirty txg, then we must
* be shutting down the pool. We don't want to dirty
* any data past this point so skip setting the condense
* flag. We can retry this action the next time the pool
* is imported.
*/
if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
msp->ms_condense_wanted = B_TRUE;
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
spa_dbgmsg(spa, "txg %llu, requesting force condense: "
"ms_id %llu, vdev_id %llu", txg, msp->ms_id,
vd->vdev_id);
}
msp->ms_fragmentation = ZFS_FRAG_INVALID;
return;
}
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
uint64_t space = 0;
uint8_t shift = msp->ms_sm->sm_shift;
int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
FRAGMENTATION_TABLE_SIZE - 1);
if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
continue;
space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
total += space;
ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
fragmentation += space * zfs_frag_table[idx];
}
if (total > 0)
fragmentation /= total;
ASSERT3U(fragmentation, <=, 100);
msp->ms_fragmentation = fragmentation;
}
/*
* Compute a weight -- a selection preference value -- for the given metaslab.
* This is based on the amount of free space, the level of fragmentation,
* the LBA range, and whether the metaslab is loaded.
*/
static uint64_t
metaslab_space_weight(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
uint64_t weight, space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(!vd->vdev_removing);
/*
* The baseline weight is the metaslab's free space.
*/
space = msp->ms_size - space_map_allocated(msp->ms_sm);
if (metaslab_fragmentation_factor_enabled &&
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
/*
* Use the fragmentation information to inversely scale
* down the baseline weight. We need to ensure that we
* don't exclude this metaslab completely when it's 100%
* fragmented. To avoid this we reduce the fragmented value
* by 1.
*/
space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
/*
* If space < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. The fragmentation metric may have
* decreased the space to something smaller than
* SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
* so that we can consume any remaining space.
*/
if (space > 0 && space < SPA_MINBLOCKSIZE)
space = SPA_MINBLOCKSIZE;
}
weight = space;
/*
* Modern disks have uniform bit density and constant angular velocity.
* Therefore, the outer recording zones are faster (higher bandwidth)
* than the inner zones by the ratio of outer to inner track diameter,
* which is typically around 2:1. We account for this by assigning
* higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
if (metaslab_lba_weighting_enabled) {
weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
ASSERT(weight >= space && weight <= 2 * space);
}
/*
* If this metaslab is one we're actively using, adjust its
* weight to make it preferable to any inactive metaslab so
* we'll polish it off. If the fragmentation on this metaslab
* has exceed our threshold, then don't mark it active.
*/
if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
}
WEIGHT_SET_SPACEBASED(weight);
return (weight);
}
/*
* Return the weight of the specified metaslab, according to the segment-based
* weighting algorithm. The metaslab must be loaded. This function can
* be called within a sync pass since it relies only on the metaslab's
* range tree which is always accurate when the metaslab is loaded.
*/
static uint64_t
metaslab_weight_from_range_tree(metaslab_t *msp)
{
uint64_t weight = 0;
uint32_t segments = 0;
ASSERT(msp->ms_loaded);
for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
i--) {
uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
segments <<= 1;
segments += msp->ms_tree->rt_histogram[i];
/*
* The range tree provides more precision than the space map
* and must be downgraded so that all values fit within the
* space map's histogram. This allows us to compare loaded
* vs. unloaded metaslabs to determine which metaslab is
* considered "best".
*/
if (i > max_idx)
continue;
if (segments != 0) {
WEIGHT_SET_COUNT(weight, segments);
WEIGHT_SET_INDEX(weight, i);
WEIGHT_SET_ACTIVE(weight, 0);
break;
}
}
return (weight);
}
/*
* Calculate the weight based on the on-disk histogram. This should only
* be called after a sync pass has completely finished since the on-disk
* information is updated in metaslab_sync().
*/
static uint64_t
metaslab_weight_from_spacemap(metaslab_t *msp)
{
uint64_t weight = 0;
for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
WEIGHT_SET_COUNT(weight,
msp->ms_sm->sm_phys->smp_histogram[i]);
WEIGHT_SET_INDEX(weight, i +
msp->ms_sm->sm_shift);
WEIGHT_SET_ACTIVE(weight, 0);
break;
}
}
return (weight);
}
/*
* Compute a segment-based weight for the specified metaslab. The weight
* is determined by highest bucket in the histogram. The information
* for the highest bucket is encoded into the weight value.
*/
static uint64_t
metaslab_segment_weight(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
uint64_t weight = 0;
uint8_t shift = mg->mg_vd->vdev_ashift;
ASSERT(MUTEX_HELD(&msp->ms_lock));
/*
* The metaslab is completely free.
*/
if (space_map_allocated(msp->ms_sm) == 0) {
int idx = highbit64(msp->ms_size) - 1;
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
if (idx < max_idx) {
WEIGHT_SET_COUNT(weight, 1ULL);
WEIGHT_SET_INDEX(weight, idx);
} else {
WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
WEIGHT_SET_INDEX(weight, max_idx);
}
WEIGHT_SET_ACTIVE(weight, 0);
ASSERT(!WEIGHT_IS_SPACEBASED(weight));
return (weight);
}
ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
/*
* If the metaslab is fully allocated then just make the weight 0.
*/
if (space_map_allocated(msp->ms_sm) == msp->ms_size)
return (0);
/*
* If the metaslab is already loaded, then use the range tree to
* determine the weight. Otherwise, we rely on the space map information
* to generate the weight.
*/
if (msp->ms_loaded) {
weight = metaslab_weight_from_range_tree(msp);
} else {
weight = metaslab_weight_from_spacemap(msp);
}
/*
* If the metaslab was active the last time we calculated its weight
* then keep it active. We want to consume the entire region that
* is associated with this weight.
*/
if (msp->ms_activation_weight != 0 && weight != 0)
WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
return (weight);
}
/*
* Determine if we should attempt to allocate from this metaslab. If the
* metaslab has a maximum size then we can quickly determine if the desired
* allocation size can be satisfied. Otherwise, if we're using segment-based
* weighting then we can determine the maximum allocation that this metaslab
* can accommodate based on the index encoded in the weight. If we're using
* space-based weights then rely on the entire weight (excluding the weight
* type bit).
*/
boolean_t
metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
{
boolean_t should_allocate;
if (msp->ms_max_size != 0)
return (msp->ms_max_size >= asize);
if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
/*
* The metaslab segment weight indicates segments in the
* range [2^i, 2^(i+1)), where i is the index in the weight.
* Since the asize might be in the middle of the range, we
* should attempt the allocation if asize < 2^(i+1).
*/
should_allocate = (asize <
1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
} else {
should_allocate = (asize <=
(msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
}
return (should_allocate);
}
static uint64_t
metaslab_weight(metaslab_t *msp)
{
vdev_t *vd = msp->ms_group->mg_vd;
spa_t *spa = vd->vdev_spa;
uint64_t weight;
ASSERT(MUTEX_HELD(&msp->ms_lock));
/*
- * This vdev is in the process of being removed so there is nothing
+ * If this vdev is in the process of being removed, there is nothing
* for us to do here.
*/
- if (vd->vdev_removing) {
- ASSERT0(space_map_allocated(msp->ms_sm));
- ASSERT0(vd->vdev_ms_shift);
+ if (vd->vdev_removing)
return (0);
- }
metaslab_set_fragmentation(msp);
/*
* Update the maximum size if the metaslab is loaded. This will
* ensure that we get an accurate maximum size if newly freed space
* has been added back into the free tree.
*/
if (msp->ms_loaded)
msp->ms_max_size = metaslab_block_maxsize(msp);
/*
* Segment-based weighting requires space map histogram support.
*/
if (zfs_metaslab_segment_weight_enabled &&
spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
(msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
sizeof (space_map_phys_t))) {
weight = metaslab_segment_weight(msp);
} else {
weight = metaslab_space_weight(msp);
}
return (weight);
}
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
metaslab_load_wait(msp);
if (!msp->ms_loaded) {
int error = metaslab_load(msp);
if (error) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
}
msp->ms_activation_weight = msp->ms_weight;
metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
ASSERT(msp->ms_loaded);
ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
return (0);
}
static void
metaslab_passivate(metaslab_t *msp, uint64_t weight)
{
uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
/*
* If size < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
ASSERT(size >= SPA_MINBLOCKSIZE ||
range_tree_space(msp->ms_tree) == 0);
ASSERT0(weight & METASLAB_ACTIVE_MASK);
msp->ms_activation_weight = 0;
metaslab_group_sort(msp->ms_group, msp, weight);
ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
/*
* Segment-based metaslabs are activated once and remain active until
* we either fail an allocation attempt (similar to space-based metaslabs)
* or have exhausted the free space in zfs_metaslab_switch_threshold
* buckets since the metaslab was activated. This function checks to see
* if we've exhaused the zfs_metaslab_switch_threshold buckets in the
* metaslab and passivates it proactively. This will allow us to select a
* metaslabs with larger contiguous region if any remaining within this
* metaslab group. If we're in sync pass > 1, then we continue using this
* metaslab so that we don't dirty more block and cause more sync passes.
*/
void
metaslab_segment_may_passivate(metaslab_t *msp)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
return;
/*
* Since we are in the middle of a sync pass, the most accurate
* information that is accessible to us is the in-core range tree
* histogram; calculate the new weight based on that information.
*/
uint64_t weight = metaslab_weight_from_range_tree(msp);
int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
int current_idx = WEIGHT_GET_INDEX(weight);
if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
metaslab_passivate(msp, weight);
}
static void
metaslab_preload(void *arg)
{
metaslab_t *msp = arg;
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
mutex_enter(&msp->ms_lock);
metaslab_load_wait(msp);
if (!msp->ms_loaded)
(void) metaslab_load(msp);
msp->ms_selected_txg = spa_syncing_txg(spa);
mutex_exit(&msp->ms_lock);
}
static void
metaslab_group_preload(metaslab_group_t *mg)
{
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_t *msp;
avl_tree_t *t = &mg->mg_metaslab_tree;
int m = 0;
if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
taskq_wait(mg->mg_taskq);
return;
}
mutex_enter(&mg->mg_lock);
+
/*
* Load the next potential metaslabs
*/
for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
+ ASSERT3P(msp->ms_group, ==, mg);
+
/*
* We preload only the maximum number of metaslabs specified
* by metaslab_preload_limit. If a metaslab is being forced
* to condense then we preload it too. This will ensure
* that force condensing happens in the next txg.
*/
if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
continue;
}
VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
msp, TQ_SLEEP) != 0);
}
mutex_exit(&mg->mg_lock);
}
/*
* Determine if the space map's on-disk footprint is past our tolerance
* for inefficiency. We would like to use the following criteria to make
* our decision:
*
* 1. The size of the space map object should not dramatically increase as a
* result of writing out the free space range tree.
*
* 2. The minimal on-disk space map representation is zfs_condense_pct/100
* times the size than the free space range tree representation
- * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB).
+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
*
* 3. The on-disk size of the space map should actually decrease.
*
* Checking the first condition is tricky since we don't want to walk
* the entire AVL tree calculating the estimated on-disk size. Instead we
* use the size-ordered range tree in the metaslab and calculate the
* size required to write out the largest segment in our free tree. If the
* size required to represent that segment on disk is larger than the space
* map object then we avoid condensing this map.
*
* To determine the second criterion we use a best-case estimate and assume
* each segment can be represented on-disk as a single 64-bit entry. We refer
* to this best-case estimate as the space map's minimal form.
*
* Unfortunately, we cannot compute the on-disk size of the space map in this
* context because we cannot accurately compute the effects of compression, etc.
* Instead, we apply the heuristic described in the block comment for
* zfs_metaslab_condense_block_threshold - we only condense if the space used
* is greater than a threshold number of blocks.
*/
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
space_map_t *sm = msp->ms_sm;
range_seg_t *rs;
uint64_t size, entries, segsz, object_size, optimal_size, record_size;
dmu_object_info_t doi;
uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
/*
* Use the ms_size_tree range tree, which is ordered by size, to
* obtain the largest segment in the free tree. We always condense
* metaslabs that are empty and metaslabs for which a condense
* request has been made.
*/
rs = avl_last(&msp->ms_size_tree);
if (rs == NULL || msp->ms_condense_wanted)
return (B_TRUE);
/*
* Calculate the number of 64-bit entries this segment would
* require when written to disk. If this single segment would be
* larger on-disk than the entire current on-disk structure, then
* clearly condensing will increase the on-disk structure size.
*/
size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
entries = size / (MIN(size, SM_RUN_MAX));
segsz = entries * sizeof (uint64_t);
optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
object_size = space_map_length(msp->ms_sm);
dmu_object_info_from_db(sm->sm_dbuf, &doi);
record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
return (segsz <= object_size &&
object_size >= (optimal_size * zfs_condense_pct / 100) &&
object_size > zfs_metaslab_condense_block_threshold * record_size);
}
/*
* Condense the on-disk space map representation to its minimized form.
* The minimized form consists of a small number of allocations followed by
* the entries of the free range tree.
*/
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
range_tree_t *condense_tree;
space_map_t *sm = msp->ms_sm;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(spa_sync_pass(spa), ==, 1);
ASSERT(msp->ms_loaded);
spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
"spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
msp->ms_group->mg_vd->vdev_spa->spa_name,
space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
msp->ms_condense_wanted ? "TRUE" : "FALSE");
msp->ms_condense_wanted = B_FALSE;
/*
* Create an range tree that is 100% allocated. We remove segments
* that have been freed in this txg, any deferred frees that exist,
* and any allocation in the future. Removing segments should be
* a relatively inexpensive operation since we expect these trees to
* have a small number of nodes.
*/
- condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
+ condense_tree = range_tree_create(NULL, NULL);
range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
/*
* Remove what's been freed in this txg from the condense_tree.
* Since we're in sync_pass 1, we know that all the frees from
* this txg are in the freeingtree.
*/
range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defertree[t],
range_tree_remove, condense_tree);
}
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
range_tree_remove, condense_tree);
}
/*
* We're about to drop the metaslab's lock thus allowing
* other consumers to change it's content. Set the
* metaslab's ms_condensing flag to ensure that
* allocations on this metaslab do not occur while we're
* in the middle of committing it to disk. This is only critical
* for the ms_tree as all other range trees use per txg
* views of their content.
*/
msp->ms_condensing = B_TRUE;
mutex_exit(&msp->ms_lock);
space_map_truncate(sm, tx);
- mutex_enter(&msp->ms_lock);
/*
* While we would ideally like to create a space map representation
* that consists only of allocation records, doing so can be
* prohibitively expensive because the in-core free tree can be
* large, and therefore computationally expensive to subtract
* from the condense_tree. Instead we sync out two trees, a cheap
* allocation only tree followed by the in-core free tree. While not
* optimal, this is typically close to optimal, and much cheaper to
* compute.
*/
space_map_write(sm, condense_tree, SM_ALLOC, tx);
range_tree_vacate(condense_tree, NULL, NULL);
range_tree_destroy(condense_tree);
space_map_write(sm, msp->ms_tree, SM_FREE, tx);
+ mutex_enter(&msp->ms_lock);
msp->ms_condensing = B_FALSE;
}
/*
* Write a metaslab to disk in the context of the specified transaction group.
*/
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa_meta_objset(spa);
range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
dmu_tx_t *tx;
uint64_t object = space_map_object(msp->ms_sm);
ASSERT(!vd->vdev_ishole);
/*
* This metaslab has just been added so there's no work to do now.
*/
if (msp->ms_freeingtree == NULL) {
ASSERT3P(alloctree, ==, NULL);
return;
}
ASSERT3P(alloctree, !=, NULL);
ASSERT3P(msp->ms_freeingtree, !=, NULL);
ASSERT3P(msp->ms_freedtree, !=, NULL);
/*
* Normally, we don't want to process a metaslab if there
* are no allocations or frees to perform. However, if the metaslab
* is being forced to condense and it's loaded, we need to let it
* through.
*/
if (range_tree_space(alloctree) == 0 &&
range_tree_space(msp->ms_freeingtree) == 0 &&
!(msp->ms_loaded && msp->ms_condense_wanted))
return;
VERIFY(txg <= spa_final_dirty_txg(spa));
/*
* The only state that can actually be changing concurrently with
* metaslab_sync() is the metaslab's ms_tree. No other thread can
* be modifying this txg's alloctree, freeingtree, freedtree, or
- * space_map_phys_t. Therefore, we only hold ms_lock to satify
- * space map ASSERTs. We drop it whenever we call into the DMU,
- * because the DMU can call down to us (e.g. via zio_free()) at
- * any time.
+ * space_map_phys_t. We drop ms_lock whenever we could call
+ * into the DMU, because the DMU can call down to us
+ * (e.g. via zio_free()) at any time.
+ *
+ * The spa_vdev_remove_thread() can be reading metaslab state
+ * concurrently, and it is locked out by the ms_sync_lock. Note
+ * that the ms_lock is insufficient for this, because it is dropped
+ * by space_map_write().
*/
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
if (msp->ms_sm == NULL) {
uint64_t new_object;
new_object = space_map_alloc(mos, tx);
VERIFY3U(new_object, !=, 0);
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
- msp->ms_start, msp->ms_size, vd->vdev_ashift,
- &msp->ms_lock));
+ msp->ms_start, msp->ms_size, vd->vdev_ashift));
ASSERT(msp->ms_sm != NULL);
}
+ mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
/*
* Note: metaslab_condense() clears the space map's histogram.
* Therefore we must verify and remove this histogram before
* condensing.
*/
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
metaslab_group_histogram_remove(mg, msp);
if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
metaslab_should_condense(msp)) {
metaslab_condense(msp, txg, tx);
} else {
+ mutex_exit(&msp->ms_lock);
space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
+ mutex_enter(&msp->ms_lock);
}
if (msp->ms_loaded) {
/*
- * When the space map is loaded, we have an accruate
+ * When the space map is loaded, we have an accurate
* histogram in the range tree. This gives us an opportunity
* to bring the space map's histogram up-to-date so we clear
* it first before updating it.
*/
space_map_histogram_clear(msp->ms_sm);
space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
/*
* Since we've cleared the histogram we need to add back
* any free space that has already been processed, plus
* any deferred space. This allows the on-disk histogram
* to accurately reflect all free space even if some space
* is not yet available for allocation (i.e. deferred).
*/
space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
/*
* Add back any deferred free space that has not been
* added back into the in-core free tree yet. This will
* ensure that we don't end up with a space map histogram
* that is completely empty unless the metaslab is fully
* allocated.
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
space_map_histogram_add(msp->ms_sm,
msp->ms_defertree[t], tx);
}
}
/*
* Always add the free space from this sync pass to the space
* map histogram. We want to make sure that the on-disk histogram
* accounts for all free space. If the space map is not loaded,
* then we will lose some accuracy but will correct it the next
* time we load the space map.
*/
space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
/*
* For sync pass 1, we avoid traversing this txg's free range tree
* and instead will just swap the pointers for freeingtree and
* freedtree. We can safely do this since the freed_tree is
* guaranteed to be empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
} else {
range_tree_vacate(msp->ms_freeingtree,
range_tree_add, msp->ms_freedtree);
}
range_tree_vacate(alloctree, NULL, NULL);
ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_freeingtree));
mutex_exit(&msp->ms_lock);
if (object != space_map_object(msp->ms_sm)) {
object = space_map_object(msp->ms_sm);
dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
msp->ms_id, sizeof (uint64_t), &object, tx);
}
+ mutex_exit(&msp->ms_sync_lock);
dmu_tx_commit(tx);
}
/*
* Called after a transaction group has completely synced to mark
* all of the metaslab's free space as usable.
*/
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
range_tree_t **defer_tree;
int64_t alloc_delta, defer_delta;
boolean_t defer_allowed = B_TRUE;
ASSERT(!vd->vdev_ishole);
mutex_enter(&msp->ms_lock);
/*
* If this metaslab is just becoming available, initialize its
* range trees and add its capacity to the vdev.
*/
if (msp->ms_freedtree == NULL) {
for (int t = 0; t < TXG_SIZE; t++) {
ASSERT(msp->ms_alloctree[t] == NULL);
- msp->ms_alloctree[t] = range_tree_create(NULL, msp,
- &msp->ms_lock);
+ msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
}
ASSERT3P(msp->ms_freeingtree, ==, NULL);
- msp->ms_freeingtree = range_tree_create(NULL, msp,
- &msp->ms_lock);
+ msp->ms_freeingtree = range_tree_create(NULL, NULL);
ASSERT3P(msp->ms_freedtree, ==, NULL);
- msp->ms_freedtree = range_tree_create(NULL, msp,
- &msp->ms_lock);
+ msp->ms_freedtree = range_tree_create(NULL, NULL);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
ASSERT(msp->ms_defertree[t] == NULL);
- msp->ms_defertree[t] = range_tree_create(NULL, msp,
- &msp->ms_lock);
+ msp->ms_defertree[t] = range_tree_create(NULL, NULL);
}
vdev_space_update(vd, 0, 0, msp->ms_size);
}
defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
metaslab_class_get_alloc(spa_normal_class(spa));
- if (free_space <= spa_get_slop_space(spa)) {
+ if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
defer_allowed = B_FALSE;
}
defer_delta = 0;
alloc_delta = space_map_alloc_delta(msp->ms_sm);
if (defer_allowed) {
defer_delta = range_tree_space(msp->ms_freedtree) -
range_tree_space(*defer_tree);
} else {
defer_delta -= range_tree_space(*defer_tree);
}
vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
/*
* If there's a metaslab_load() in progress, wait for it to complete
* so that we have a consistent view of the in-core space map.
*/
metaslab_load_wait(msp);
/*
* Move the frees from the defer_tree back to the free
* range tree (if it's loaded). Swap the freed_tree and the
* defer_tree -- this is safe to do because we've just emptied out
* the defer_tree.
*/
range_tree_vacate(*defer_tree,
msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
if (defer_allowed) {
range_tree_swap(&msp->ms_freedtree, defer_tree);
} else {
range_tree_vacate(msp->ms_freedtree,
msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
}
space_map_update(msp->ms_sm);
msp->ms_deferspace += defer_delta;
ASSERT3S(msp->ms_deferspace, >=, 0);
ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
if (msp->ms_deferspace != 0) {
/*
* Keep syncing this metaslab until all deferred frees
* are back in circulation.
*/
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
/*
* Calculate the new weights before unloading any metaslabs.
* This will give us the most accurate weighting.
*/
metaslab_group_sort(mg, msp, metaslab_weight(msp));
/*
* If the metaslab is loaded and we've not tried to load or allocate
* from it in 'metaslab_unload_delay' txgs, then unload it.
*/
if (msp->ms_loaded &&
msp->ms_selected_txg + metaslab_unload_delay < txg) {
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
VERIFY0(range_tree_space(
msp->ms_alloctree[(txg + t) & TXG_MASK]));
}
if (!metaslab_debug_unload)
metaslab_unload(msp);
}
+ ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeingtree));
+ ASSERT0(range_tree_space(msp->ms_freedtree));
+
mutex_exit(&msp->ms_lock);
}
void
metaslab_sync_reassess(metaslab_group_t *mg)
{
+ spa_t *spa = mg->mg_class->mc_spa;
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
metaslab_group_alloc_update(mg);
mg->mg_fragmentation = metaslab_group_fragmentation(mg);
/*
- * Preload the next potential metaslabs
+ * Preload the next potential metaslabs but only on active
+ * metaslab groups. We can get into a state where the metaslab
+ * is no longer active since we dirty metaslabs as we remove a
+ * a device, thus potentially making the metaslab group eligible
+ * for preloading.
*/
- metaslab_group_preload(mg);
+ if (mg->mg_activation_count > 0) {
+ metaslab_group_preload(mg);
+ }
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
}
static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
uint64_t start = msp->ms_id;
if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
return (1ULL << 63);
if (offset < start)
return ((start - offset) << ms_shift);
if (offset > start)
return ((offset - start) << ms_shift);
return (0);
}
/*
* ==========================================================================
* Metaslab allocation tracing facility
* ==========================================================================
*/
kstat_t *metaslab_trace_ksp;
kstat_named_t metaslab_trace_over_limit;
void
metaslab_alloc_trace_init(void)
{
ASSERT(metaslab_alloc_trace_cache == NULL);
metaslab_alloc_trace_cache = kmem_cache_create(
"metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
0, NULL, NULL, NULL, NULL, NULL, 0);
metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
"misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
if (metaslab_trace_ksp != NULL) {
metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
kstat_named_init(&metaslab_trace_over_limit,
"metaslab_trace_over_limit", KSTAT_DATA_UINT64);
kstat_install(metaslab_trace_ksp);
}
}
void
metaslab_alloc_trace_fini(void)
{
if (metaslab_trace_ksp != NULL) {
kstat_delete(metaslab_trace_ksp);
metaslab_trace_ksp = NULL;
}
kmem_cache_destroy(metaslab_alloc_trace_cache);
metaslab_alloc_trace_cache = NULL;
}
/*
* Add an allocation trace element to the allocation tracing list.
*/
static void
metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
{
if (!metaslab_trace_enabled)
return;
/*
* When the tracing list reaches its maximum we remove
* the second element in the list before adding a new one.
* By removing the second element we preserve the original
* entry as a clue to what allocations steps have already been
* performed.
*/
if (zal->zal_size == metaslab_trace_max_entries) {
metaslab_alloc_trace_t *mat_next;
#ifdef DEBUG
panic("too many entries in allocation list");
#endif
atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
zal->zal_size--;
mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
list_remove(&zal->zal_list, mat_next);
kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
}
metaslab_alloc_trace_t *mat =
kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
list_link_init(&mat->mat_list_node);
mat->mat_mg = mg;
mat->mat_msp = msp;
mat->mat_size = psize;
mat->mat_dva_id = dva_id;
mat->mat_offset = offset;
mat->mat_weight = 0;
if (msp != NULL)
mat->mat_weight = msp->ms_weight;
/*
* The list is part of the zio so locking is not required. Only
* a single thread will perform allocations for a given zio.
*/
list_insert_tail(&zal->zal_list, mat);
zal->zal_size++;
ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
}
void
metaslab_trace_init(zio_alloc_list_t *zal)
{
list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
offsetof(metaslab_alloc_trace_t, mat_list_node));
zal->zal_size = 0;
}
void
metaslab_trace_fini(zio_alloc_list_t *zal)
{
metaslab_alloc_trace_t *mat;
while ((mat = list_remove_head(&zal->zal_list)) != NULL)
kmem_cache_free(metaslab_alloc_trace_cache, mat);
list_destroy(&zal->zal_list);
zal->zal_size = 0;
}
/*
* ==========================================================================
* Metaslab block operations
* ==========================================================================
*/
static void
metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
flags & METASLAB_DONT_THROTTLE)
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
}
void
metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
flags & METASLAB_DONT_THROTTLE)
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
}
void
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
{
#ifdef ZFS_DEBUG
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
for (int d = 0; d < ndvas; d++) {
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
}
#endif
}
static uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
{
uint64_t start;
range_tree_t *rt = msp->ms_tree;
metaslab_class_t *mc = msp->ms_group->mg_class;
VERIFY(!msp->ms_condensing);
start = mc->mc_ops->msop_alloc(msp, size);
if (start != -1ULL) {
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
range_tree_remove(rt, start, size);
if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
/* Track the last successful allocation */
msp->ms_alloc_txg = txg;
metaslab_verify_space(msp, txg);
}
/*
* Now that we've attempted the allocation we need to update the
* metaslab's maximum block size since it may have changed.
*/
msp->ms_max_size = metaslab_block_maxsize(msp);
return (start);
}
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
uint64_t activation_weight;
uint64_t target_distance;
int i;
activation_weight = METASLAB_WEIGHT_PRIMARY;
for (i = 0; i < d; i++) {
if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
activation_weight = METASLAB_WEIGHT_SECONDARY;
break;
}
}
metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
search->ms_weight = UINT64_MAX;
search->ms_start = 0;
for (;;) {
boolean_t was_active;
avl_tree_t *t = &mg->mg_metaslab_tree;
avl_index_t idx;
mutex_enter(&mg->mg_lock);
/*
* Find the metaslab with the highest weight that is less
* than what we've already tried. In the common case, this
* means that we will examine each metaslab at most once.
* Note that concurrent callers could reorder metaslabs
* by activation/passivation once we have dropped the mg_lock.
* If a metaslab is activated by another thread, and we fail
* to allocate from the metaslab we have selected, we may
* not try the newly-activated metaslab, and instead activate
* another metaslab. This is not optimal, but generally
* does not cause any problems (a possible exception being
* if every metaslab is completely full except for the
* the newly-activated metaslab which we fail to examine).
*/
msp = avl_find(t, search, &idx);
if (msp == NULL)
msp = avl_nearest(t, idx, AVL_AFTER);
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
if (!metaslab_should_allocate(msp, asize)) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL);
continue;
}
/*
* If the selected metaslab is condensing, skip it.
*/
if (msp->ms_condensing)
continue;
was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break;
target_distance = min_distance +
(space_map_allocated(msp->ms_sm) != 0 ? 0 :
min_distance >> 1);
for (i = 0; i < d; i++) {
if (metaslab_distance(msp, &dva[i]) <
target_distance)
break;
}
if (i == d)
break;
}
mutex_exit(&mg->mg_lock);
if (msp == NULL) {
kmem_free(search, sizeof (*search));
return (-1ULL);
}
search->ms_weight = msp->ms_weight;
search->ms_start = msp->ms_start + 1;
mutex_enter(&msp->ms_lock);
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
* another thread may have changed the weight while we
* were blocked on the metaslab lock. We check the
* active status first to see if we need to reselect
* a new metaslab.
*/
if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
mutex_exit(&msp->ms_lock);
continue;
}
if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
activation_weight == METASLAB_WEIGHT_PRIMARY) {
metaslab_passivate(msp,
msp->ms_weight & ~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
continue;
}
if (metaslab_activate(msp, activation_weight) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
msp->ms_selected_txg = txg;
/*
* Now that we have the lock, recheck to see if we should
* continue to use this metaslab for this allocation. The
* the metaslab is now loaded so metaslab_should_allocate() can
* accurately determine if the allocation attempt should
* proceed.
*/
if (!metaslab_should_allocate(msp, asize)) {
/* Passivate this metaslab and select a new one. */
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL);
goto next;
}
/*
* If this metaslab is currently condensing then pick again as
* we can't manipulate this metaslab until it's committed
* to disk.
*/
if (msp->ms_condensing) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_CONDENSING);
mutex_exit(&msp->ms_lock);
continue;
}
offset = metaslab_block_alloc(msp, asize, txg);
metaslab_trace_add(zal, mg, msp, asize, d, offset);
if (offset != -1ULL) {
/* Proactively passivate the metaslab, if needed */
metaslab_segment_may_passivate(msp);
break;
}
next:
ASSERT(msp->ms_loaded);
/*
* We were unable to allocate from this metaslab so determine
* a new weight for this metaslab. Now that we have loaded
* the metaslab we can provide a better hint to the metaslab
* selector.
*
* For space-based metaslabs, we use the maximum block size.
* This information is only available when the metaslab
* is loaded and is more accurate than the generic free
* space weight that was calculated by metaslab_weight().
* This information allows us to quickly compare the maximum
* available allocation in the metaslab to the allocation
* size being requested.
*
* For segment-based metaslabs, determine the new weight
* based on the highest bucket in the range tree. We
* explicitly use the loaded segment weight (i.e. the range
* tree histogram) since it contains the space that is
* currently available for allocation and is accurate
* even within a sync pass.
*/
if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
uint64_t weight = metaslab_block_maxsize(msp);
WEIGHT_SET_SPACEBASED(weight);
metaslab_passivate(msp, weight);
} else {
metaslab_passivate(msp,
metaslab_weight_from_range_tree(msp));
}
/*
* We have just failed an allocation attempt, check
* that metaslab_should_allocate() agrees. Otherwise,
* we may end up in an infinite loop retrying the same
* metaslab.
*/
ASSERT(!metaslab_should_allocate(msp, asize));
mutex_exit(&msp->ms_lock);
}
mutex_exit(&msp->ms_lock);
kmem_free(search, sizeof (*search));
return (offset);
}
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
min_distance, dva, d);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
mg->mg_failed_allocations++;
metaslab_trace_add(zal, mg, NULL, asize, d,
TRACE_GROUP_FAILURE);
if (asize == SPA_GANGBLOCKSIZE) {
/*
* This metaslab group was unable to allocate
* the minimum gang block size so it must be out of
* space. We must notify the allocation throttle
* to start skipping allocation attempts to this
* metaslab group until more space becomes available.
* Note: this failure cannot be caused by the
* allocation throttle since the allocation throttle
* is only responsible for skipping devices and
* not failing block allocations.
*/
mg->mg_no_free_space = B_TRUE;
}
}
mg->mg_allocations++;
mutex_exit(&mg->mg_lock);
return (offset);
}
/*
* If we have to write a ditto block (i.e. more than one DVA for a given BP)
* on the same vdev as an existing DVA of this BP, then try to allocate it
* at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
* existing DVAs.
*/
int ditto_same_vdev_distance_shift = 3;
/*
* Allocate a block for the specified i/o.
*/
-static int
+int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
zio_alloc_list_t *zal)
{
metaslab_group_t *mg, *rotor;
vdev_t *vd;
boolean_t try_hard = B_FALSE;
ASSERT(!DVA_IS_VALID(&dva[d]));
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
return (SET_ERROR(ENOSPC));
}
/*
* Start at the rotor and loop through all mgs until we find something.
* Note that there's no locking on mc_rotor or mc_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
* If we are doing ditto or log blocks, try to spread them across
* consecutive vdevs. If we're forced to reuse a vdev before we've
* allocated all of our ditto blocks, then try and spread them out on
* that vdev as much as possible. If it turns out to not be possible,
* gradually lower our standards until anything becomes acceptable.
* Also, allocating on consecutive vdevs (as opposed to random vdevs)
* gives us hope of containing our fault domains to something we're
* able to reason about. Otherwise, any two top-level vdev failures
* will guarantee the loss of data. With consecutive allocation,
* only two adjacent top-level vdev failures will result in data loss.
*
* If we are doing gang blocks (hintdva is non-NULL), try to keep
* ourselves on the same vdev as our gang block header. That
* way, we can hope for locality in vdev_cache, plus it makes our
* fault domains something tractable.
*/
if (hintdva) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
/*
* It's possible the vdev we're using as the hint no
- * longer exists (i.e. removed). Consult the rotor when
+ * longer exists or its mg has been closed (e.g. by
+ * device removal). Consult the rotor when
* all else fails.
*/
- if (vd != NULL) {
+ if (vd != NULL && vd->vdev_mg != NULL) {
mg = vd->vdev_mg;
if (flags & METASLAB_HINTBP_AVOID &&
mg->mg_next != NULL)
mg = mg->mg_next;
} else {
mg = mc->mc_rotor;
}
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
} else {
mg = mc->mc_rotor;
}
/*
* If the hint put us into the wrong metaslab class, or into a
* metaslab group that has been passivated, just follow the rotor.
*/
if (mg->mg_class != mc || mg->mg_activation_count <= 0)
mg = mc->mc_rotor;
rotor = mg;
top:
do {
boolean_t allocatable;
ASSERT(mg->mg_activation_count == 1);
vd = mg->mg_vd;
/*
* Don't allocate from faulted devices.
*/
if (try_hard) {
spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
allocatable = vdev_allocatable(vd);
spa_config_exit(spa, SCL_ZIO, FTAG);
} else {
allocatable = vdev_allocatable(vd);
}
/*
* Determine if the selected metaslab group is eligible
* for allocations. If we're ganging then don't allow
* this metaslab group to skip allocations since that would
* inadvertently return ENOSPC and suspend the pool
* even though space is still available.
*/
if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
allocatable = metaslab_group_allocatable(mg, rotor,
psize);
}
if (!allocatable) {
metaslab_trace_add(zal, mg, NULL, psize, d,
TRACE_NOT_ALLOCATABLE);
goto next;
}
ASSERT(mg->mg_initialized);
/*
* Avoid writing single-copy data to a failing,
* non-redundant vdev, unless we've already tried all
* other vdevs.
*/
if ((vd->vdev_stat.vs_write_errors > 0 ||
vd->vdev_state < VDEV_STATE_HEALTHY) &&
d == 0 && !try_hard && vd->vdev_children == 0) {
metaslab_trace_add(zal, mg, NULL, psize, d,
TRACE_VDEV_ERROR);
goto next;
}
ASSERT(mg->mg_class == mc);
/*
* If we don't need to try hard, then require that the
* block be 1/8th of the device away from any other DVAs
* in this BP. If we are trying hard, allow any offset
* to be used (distance=0).
*/
uint64_t distance = 0;
if (!try_hard) {
distance = vd->vdev_asize >>
ditto_same_vdev_distance_shift;
if (distance <= (1ULL << vd->vdev_ms_shift))
distance = 0;
}
uint64_t asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
distance, dva, d);
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
* figure out whether the corresponding vdev is
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vu, cu;
vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
/*
* Calculate how much more or less we should
* try to allocate from this device during
* this iteration around the rotor.
* For example, if a device is 80% full
* and the pool is 20% full then we should
* reduce allocations by 60% on this device.
*
* mg_bias = (20 - 80) * 512K / 100 = -307K
*
* This reduces allocations by 307K for this
* iteration.
*/
mg->mg_bias = ((cu - vu) *
(int64_t)mg->mg_aliquot) / 100;
} else if (!metaslab_bias_enabled) {
mg->mg_bias = 0;
}
if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
mc->mc_rotor = mg->mg_next;
mc->mc_aliquot = 0;
}
DVA_SET_VDEV(&dva[d], vd->vdev_id);
DVA_SET_OFFSET(&dva[d], offset);
DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
DVA_SET_ASIZE(&dva[d], asize);
return (0);
}
next:
mc->mc_rotor = mg->mg_next;
mc->mc_aliquot = 0;
} while ((mg = mg->mg_next) != rotor);
/*
* If we haven't tried hard, do so now.
*/
if (!try_hard) {
try_hard = B_TRUE;
goto top;
}
bzero(&dva[d], sizeof (dva_t));
metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
return (SET_ERROR(ENOSPC));
}
+void
+metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
+ uint64_t txg)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY3U(offset, >=, msp->ms_start);
+ VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
+
+ metaslab_check_free_impl(vd, offset, asize);
+ mutex_enter(&msp->ms_lock);
+ if (range_tree_space(msp->ms_freeingtree) == 0) {
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ }
+ range_tree_add(msp->ms_freeingtree, offset, asize);
+ mutex_exit(&msp->ms_lock);
+}
+
+/* ARGSUSED */
+void
+metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ uint64_t *txgp = arg;
+
+ if (vd->vdev_ops->vdev_op_remap != NULL)
+ vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
+ else
+ metaslab_free_impl(vd, offset, size, *txgp);
+}
+
+static void
+metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
+ uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (txg > spa_freeze_txg(spa))
+ return;
+
+ if (spa->spa_vdev_removal != NULL &&
+ spa->spa_vdev_removal->svr_vdev == vd &&
+ vdev_is_concrete(vd)) {
+ /*
+ * Note: we check if the vdev is concrete because when
+ * we complete the removal, we first change the vdev to be
+ * an indirect vdev (in open context), and then (in syncing
+ * context) clear spa_vdev_removal.
+ */
+ free_from_removing_vdev(vd, offset, size, txg);
+ } else if (vd->vdev_ops->vdev_op_remap != NULL) {
+ vdev_indirect_mark_obsolete(vd, offset, size, txg);
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_free_impl_cb, &txg);
+ } else {
+ metaslab_free_concrete(vd, offset, size, txg);
+ }
+}
+
+typedef struct remap_blkptr_cb_arg {
+ blkptr_t *rbca_bp;
+ spa_remap_cb_t rbca_cb;
+ vdev_t *rbca_remap_vd;
+ uint64_t rbca_remap_offset;
+ void *rbca_cb_arg;
+} remap_blkptr_cb_arg_t;
+
+void
+remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ remap_blkptr_cb_arg_t *rbca = arg;
+ blkptr_t *bp = rbca->rbca_bp;
+
+ /* We can not remap split blocks. */
+ if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
+ return;
+ ASSERT0(inner_offset);
+
+ if (rbca->rbca_cb != NULL) {
+ /*
+ * At this point we know that we are not handling split
+ * blocks and we invoke the callback on the previous
+ * vdev which must be indirect.
+ */
+ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
+ rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
+
+ /* set up remap_blkptr_cb_arg for the next call */
+ rbca->rbca_remap_vd = vd;
+ rbca->rbca_remap_offset = offset;
+ }
+
+ /*
+ * The phys birth time is that of dva[0]. This ensures that we know
+ * when each dva was written, so that resilver can determine which
+ * blocks need to be scrubbed (i.e. those written during the time
+ * the vdev was offline). It also ensures that the key used in
+ * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
+ * we didn't change the phys_birth, a lookup in the ARC for a
+ * remapped BP could find the data that was previously stored at
+ * this vdev + offset.
+ */
+ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]));
+ vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
+ bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
+ DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
+
+ DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&bp->blk_dva[0], offset);
+}
+
/*
- * Free the block represented by DVA in the context of the specified
- * transaction group.
+ * If the block pointer contains any indirect DVAs, modify them to refer to
+ * concrete DVAs. Note that this will sometimes not be possible, leaving
+ * the indirect DVA in place. This happens if the indirect DVA spans multiple
+ * segments in the mapping (i.e. it is a "split block").
+ *
+ * If the BP was remapped, calls the callback on the original dva (note the
+ * callback can be called multiple times if the original indirect DVA refers
+ * to another indirect DVA, etc).
+ *
+ * Returns TRUE if the BP was remapped.
*/
-static void
-metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
+boolean_t
+spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
{
- uint64_t vdev = DVA_GET_VDEV(dva);
+ remap_blkptr_cb_arg_t rbca;
+
+ if (!zfs_remap_blkptr_enable)
+ return (B_FALSE);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
+ return (B_FALSE);
+
+ /*
+ * Dedup BP's can not be remapped, because ddt_phys_select() depends
+ * on DVA[0] being the same in the BP as in the DDT (dedup table).
+ */
+ if (BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ /*
+ * Gang blocks can not be remapped, because
+ * zio_checksum_gang_verifier() depends on the DVA[0] that's in
+ * the BP used to read the gang block header (GBH) being the same
+ * as the DVA[0] that we allocated for the GBH.
+ */
+ if (BP_IS_GANG(bp))
+ return (B_FALSE);
+
+ /*
+ * Embedded BP's have no DVA to remap.
+ */
+ if (BP_GET_NDVAS(bp) < 1)
+ return (B_FALSE);
+
+ /*
+ * Note: we only remap dva[0]. If we remapped other dvas, we
+ * would no longer know what their phys birth txg is.
+ */
+ dva_t *dva = &bp->blk_dva[0];
+
uint64_t offset = DVA_GET_OFFSET(dva);
uint64_t size = DVA_GET_ASIZE(dva);
- vdev_t *vd;
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+ if (vd->vdev_ops->vdev_op_remap == NULL)
+ return (B_FALSE);
+
+ rbca.rbca_bp = bp;
+ rbca.rbca_cb = callback;
+ rbca.rbca_remap_vd = vd;
+ rbca.rbca_remap_offset = offset;
+ rbca.rbca_cb_arg = arg;
+
+ /*
+ * remap_blkptr_cb() will be called in order for each level of
+ * indirection, until a concrete vdev is reached or a split block is
+ * encountered. old_vd and old_offset are updated within the callback
+ * as we go from the one indirect vdev to the next one (either concrete
+ * or indirect again) in that order.
+ */
+ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
+
+ /* Check if the DVA wasn't remapped because it is a split block */
+ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Undo the allocation of a DVA which happened in the given transaction group.
+ */
+void
+metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
metaslab_t *msp;
+ vdev_t *vd;
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
ASSERT(DVA_IS_VALID(dva));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
if (txg > spa_freeze_txg(spa))
return;
if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
(offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
(u_longlong_t)vdev, (u_longlong_t)offset);
ASSERT(0);
return;
}
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ ASSERT(!vd->vdev_removing);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
if (DVA_GET_GANG(dva))
size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
mutex_enter(&msp->ms_lock);
+ range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
+ offset, size);
- if (now) {
- range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
- offset, size);
-
- VERIFY(!msp->ms_condensing);
- VERIFY3U(offset, >=, msp->ms_start);
- VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
- VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
- msp->ms_size);
- VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
- VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- range_tree_add(msp->ms_tree, offset, size);
- msp->ms_max_size = metaslab_block_maxsize(msp);
- } else {
- VERIFY3U(txg, ==, spa->spa_syncing_txg);
- if (range_tree_space(msp->ms_freeingtree) == 0)
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
- range_tree_add(msp->ms_freeingtree, offset, size);
- }
-
+ VERIFY(!msp->ms_condensing);
+ VERIFY3U(offset, >=, msp->ms_start);
+ VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
+ VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
+ msp->ms_size);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ range_tree_add(msp->ms_tree, offset, size);
mutex_exit(&msp->ms_lock);
}
/*
- * Intent log support: upon opening the pool after a crash, notify the SPA
- * of blocks that the intent log has allocated for immediate write, but
- * which are still considered free by the SPA because the last transaction
- * group didn't commit yet.
+ * Free the block represented by DVA in the context of the specified
+ * transaction group.
*/
-static int
-metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+void
+metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
uint64_t vdev = DVA_GET_VDEV(dva);
uint64_t offset = DVA_GET_OFFSET(dva);
uint64_t size = DVA_GET_ASIZE(dva);
- vdev_t *vd;
- metaslab_t *msp;
- int error = 0;
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
ASSERT(DVA_IS_VALID(dva));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
- if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
- (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
- return (SET_ERROR(ENXIO));
-
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- if (DVA_GET_GANG(dva))
+ if (DVA_GET_GANG(dva)) {
size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
- mutex_enter(&msp->ms_lock);
-
- if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
-
- if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
- error = SET_ERROR(ENOENT);
-
- if (error || txg == 0) { /* txg == 0 indicates dry run */
- mutex_exit(&msp->ms_lock);
- return (error);
}
- VERIFY(!msp->ms_condensing);
- VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
- VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
- range_tree_remove(msp->ms_tree, offset, size);
-
- if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
- if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
- range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
- }
-
- mutex_exit(&msp->ms_lock);
-
- return (0);
+ metaslab_free_impl(vd, offset, size, txg);
}
/*
* Reserve some allocation slots. The reservation system must be called
* before we call into the allocator. If there aren't any available slots
* then the I/O will be throttled until an I/O completes and its slots are
* freed up. The function returns true if it was successful in placing
* the reservation.
*/
boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
int flags)
{
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
if (reserved_slots < mc->mc_alloc_max_slots)
available_slots = mc->mc_alloc_max_slots - reserved_slots;
if (slots <= available_slots || GANG_ALLOCATION(flags)) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
for (int d = 0; d < slots; d++) {
reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
}
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
}
mutex_exit(&mc->mc_lock);
return (slot_reserved);
}
void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
{
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++) {
(void) refcount_remove(&mc->mc_alloc_slots, zio);
}
mutex_exit(&mc->mc_lock);
}
+static int
+metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
+ uint64_t txg)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
+ return (ENXIO);
+
+ ASSERT3P(vd->vdev_ms, !=, NULL);
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+
+ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+
+ if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
+ error = SET_ERROR(ENOENT);
+
+ if (error || txg == 0) { /* txg == 0 indicates dry run */
+ mutex_exit(&msp->ms_lock);
+ return (error);
+ }
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
+ range_tree_remove(msp->ms_tree, offset, size);
+
+ if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
+ if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
+ }
+
+ mutex_exit(&msp->ms_lock);
+
+ return (0);
+}
+
+typedef struct metaslab_claim_cb_arg_t {
+ uint64_t mcca_txg;
+ int mcca_error;
+} metaslab_claim_cb_arg_t;
+
+/* ARGSUSED */
+static void
+metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ metaslab_claim_cb_arg_t *mcca_arg = arg;
+
+ if (mcca_arg->mcca_error == 0) {
+ mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
+ size, mcca_arg->mcca_txg);
+ }
+}
+
int
+metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
+{
+ if (vd->vdev_ops->vdev_op_remap != NULL) {
+ metaslab_claim_cb_arg_t arg;
+
+ /*
+ * Only zdb(1M) can claim on indirect vdevs. This is used
+ * to detect leaks of mapped space (that are not accounted
+ * for in the obsolete counts, spacemap, or bpobj).
+ */
+ ASSERT(!spa_writeable(vd->vdev_spa));
+ arg.mcca_error = 0;
+ arg.mcca_txg = txg;
+
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_claim_impl_cb, &arg);
+
+ if (arg.mcca_error == 0) {
+ arg.mcca_error = metaslab_claim_concrete(vd,
+ offset, size, txg);
+ }
+ return (arg.mcca_error);
+ } else {
+ return (metaslab_claim_concrete(vd, offset, size, txg));
+ }
+}
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ ASSERT(DVA_IS_VALID(dva));
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ return (metaslab_claim_impl(vd, offset, size, txg));
+}
+
+int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
zio_alloc_list_t *zal, zio_t *zio)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
int error = 0;
ASSERT(bp->blk_birth == 0);
ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
if (mc->mc_rotor == NULL) { /* no vdevs in this class */
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (SET_ERROR(ENOSPC));
}
ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
ASSERT(BP_GET_NDVAS(bp) == 0);
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
ASSERT3P(zal, !=, NULL);
for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
txg, flags, zal);
if (error != 0) {
for (d--; d >= 0; d--) {
- metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+ metaslab_unalloc_dva(spa, &dva[d], txg);
metaslab_group_alloc_decrement(spa,
DVA_GET_VDEV(&dva[d]), zio, flags);
bzero(&dva[d], sizeof (dva_t));
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (error);
} else {
/*
* Update the metaslab group's queue depth
* based on the newly allocated dva.
*/
metaslab_group_alloc_increment(spa,
DVA_GET_VDEV(&dva[d]), zio, flags);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
spa_config_exit(spa, SCL_ALLOC, FTAG);
BP_SET_BIRTH(bp, txg, txg);
return (0);
}
void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
- for (int d = 0; d < ndvas; d++)
- metaslab_free_dva(spa, &dva[d], txg, now);
+ for (int d = 0; d < ndvas; d++) {
+ if (now) {
+ metaslab_unalloc_dva(spa, &dva[d], txg);
+ } else {
+ metaslab_free_dva(spa, &dva[d], txg);
+ }
+ }
spa_config_exit(spa, SCL_FREE, FTAG);
}
int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
int error = 0;
ASSERT(!BP_IS_HOLE(bp));
if (txg != 0) {
/*
* First do a dry run to make sure all DVAs are claimable,
* so we don't have to unwind from partial failures below.
*/
if ((error = metaslab_claim(spa, bp, 0)) != 0)
return (error);
}
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
for (int d = 0; d < ndvas; d++)
if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
break;
spa_config_exit(spa, SCL_ALLOC, FTAG);
ASSERT(error == 0 || txg == 0);
return (error);
}
+/* ARGSUSED */
+static void
+metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ return;
+
+ metaslab_check_free_impl(vd, offset, size);
+}
+
+static void
+metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+
+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
+ return;
+
+ if (vd->vdev_ops->vdev_op_remap != NULL) {
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_check_free_impl_cb, NULL);
+ return;
+ }
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+ if (msp->ms_loaded)
+ range_tree_verify(msp->ms_tree, offset, size);
+
+ range_tree_verify(msp->ms_freeingtree, offset, size);
+ range_tree_verify(msp->ms_freedtree, offset, size);
+ for (int j = 0; j < TXG_DEFER_SIZE; j++)
+ range_tree_verify(msp->ms_defertree[j], offset, size);
+ mutex_exit(&msp->ms_lock);
+}
+
void
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
{
if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
return;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
vdev_t *vd = vdev_lookup_top(spa, vdev);
uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
- metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
- if (msp->ms_loaded)
- range_tree_verify(msp->ms_tree, offset, size);
+ if (DVA_GET_GANG(&bp->blk_dva[i]))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
- range_tree_verify(msp->ms_freeingtree, offset, size);
- range_tree_verify(msp->ms_freedtree, offset, size);
- for (int j = 0; j < TXG_DEFER_SIZE; j++)
- range_tree_verify(msp->ms_defertree[j], offset, size);
+ ASSERT3P(vd, !=, NULL);
+
+ metaslab_check_free_impl(vd, offset, size);
}
spa_config_exit(spa, SCL_VDEV, FTAG);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c (revision 332525)
@@ -1,411 +1,403 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>
kmem_cache_t *range_seg_cache;
void
range_tree_init(void)
{
ASSERT(range_seg_cache == NULL);
range_seg_cache = kmem_cache_create("range_seg_cache",
sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
range_tree_fini(void)
{
kmem_cache_destroy(range_seg_cache);
range_seg_cache = NULL;
}
void
range_tree_stat_verify(range_tree_t *rt)
{
range_seg_t *rs;
uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
int i;
for (rs = avl_first(&rt->rt_root); rs != NULL;
rs = AVL_NEXT(&rt->rt_root, rs)) {
uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit64(size) - 1;
hist[idx]++;
ASSERT3U(hist[idx], !=, 0);
}
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
if (hist[i] != rt->rt_histogram[i]) {
zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
i, hist, hist[i], rt->rt_histogram[i]);
}
VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
}
}
static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit64(size) - 1;
ASSERT(size != 0);
ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
- ASSERT(MUTEX_HELD(rt->rt_lock));
rt->rt_histogram[idx]++;
ASSERT3U(rt->rt_histogram[idx], !=, 0);
}
static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit64(size) - 1;
ASSERT(size != 0);
ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
- ASSERT(MUTEX_HELD(rt->rt_lock));
ASSERT3U(rt->rt_histogram[idx], !=, 0);
rt->rt_histogram[idx]--;
}
/*
* NOTE: caller is responsible for all locking.
*/
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
const range_seg_t *r1 = x1;
const range_seg_t *r2 = x2;
if (r1->rs_start < r2->rs_start) {
if (r1->rs_end > r2->rs_start)
return (0);
return (-1);
}
if (r1->rs_start > r2->rs_start) {
if (r1->rs_start < r2->rs_end)
return (0);
return (1);
}
return (0);
}
range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
+range_tree_create(range_tree_ops_t *ops, void *arg)
{
range_tree_t *rt;
rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
avl_create(&rt->rt_root, range_tree_seg_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
- rt->rt_lock = lp;
rt->rt_ops = ops;
rt->rt_arg = arg;
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_create(rt, rt->rt_arg);
return (rt);
}
void
range_tree_destroy(range_tree_t *rt)
{
VERIFY0(rt->rt_space);
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
avl_destroy(&rt->rt_root);
kmem_free(rt, sizeof (*rt));
}
void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
range_tree_t *rt = arg;
avl_index_t where;
range_seg_t rsearch, *rs_before, *rs_after, *rs;
uint64_t end = start + size;
boolean_t merge_before, merge_after;
- ASSERT(MUTEX_HELD(rt->rt_lock));
VERIFY(size != 0);
rsearch.rs_start = start;
rsearch.rs_end = end;
rs = avl_find(&rt->rt_root, &rsearch, &where);
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
zfs_panic_recover("zfs: allocating allocated segment"
"(offset=%llu size=%llu)\n",
(longlong_t)start, (longlong_t)size);
return;
}
/* Make sure we don't overlap with either of our neighbors */
VERIFY(rs == NULL);
rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
merge_before = (rs_before != NULL && rs_before->rs_end == start);
merge_after = (rs_after != NULL && rs_after->rs_start == end);
if (merge_before && merge_after) {
avl_remove(&rt->rt_root, rs_before);
if (rt->rt_ops != NULL) {
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
}
range_tree_stat_decr(rt, rs_before);
range_tree_stat_decr(rt, rs_after);
rs_after->rs_start = rs_before->rs_start;
kmem_cache_free(range_seg_cache, rs_before);
rs = rs_after;
} else if (merge_before) {
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
range_tree_stat_decr(rt, rs_before);
rs_before->rs_end = end;
rs = rs_before;
} else if (merge_after) {
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
range_tree_stat_decr(rt, rs_after);
rs_after->rs_start = start;
rs = rs_after;
} else {
rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
rs->rs_start = start;
rs->rs_end = end;
avl_insert(&rt->rt_root, rs, where);
}
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
range_tree_stat_incr(rt, rs);
rt->rt_space += size;
}
void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
range_tree_t *rt = arg;
avl_index_t where;
range_seg_t rsearch, *rs, *newseg;
uint64_t end = start + size;
boolean_t left_over, right_over;
- ASSERT(MUTEX_HELD(rt->rt_lock));
VERIFY3U(size, !=, 0);
VERIFY3U(size, <=, rt->rt_space);
rsearch.rs_start = start;
rsearch.rs_end = end;
rs = avl_find(&rt->rt_root, &rsearch, &where);
/* Make sure we completely overlap with someone */
if (rs == NULL) {
zfs_panic_recover("zfs: freeing free segment "
"(offset=%llu size=%llu)",
(longlong_t)start, (longlong_t)size);
return;
}
VERIFY3U(rs->rs_start, <=, start);
VERIFY3U(rs->rs_end, >=, end);
left_over = (rs->rs_start != start);
right_over = (rs->rs_end != end);
range_tree_stat_decr(rt, rs);
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
if (left_over && right_over) {
newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
newseg->rs_start = end;
newseg->rs_end = rs->rs_end;
range_tree_stat_incr(rt, newseg);
rs->rs_end = start;
avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
} else if (left_over) {
rs->rs_end = start;
} else if (right_over) {
rs->rs_start = end;
} else {
avl_remove(&rt->rt_root, rs);
kmem_cache_free(range_seg_cache, rs);
rs = NULL;
}
if (rs != NULL) {
range_tree_stat_incr(rt, rs);
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}
rt->rt_space -= size;
}
static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
avl_index_t where;
range_seg_t rsearch;
uint64_t end = start + size;
- ASSERT(MUTEX_HELD(rt->rt_lock));
VERIFY(size != 0);
rsearch.rs_start = start;
rsearch.rs_end = end;
return (avl_find(&rt->rt_root, &rsearch, &where));
}
static range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
range_seg_t *rs = range_tree_find_impl(rt, start, size);
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
return (rs);
return (NULL);
}
void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
{
range_seg_t *rs;
- mutex_enter(rt->rt_lock);
rs = range_tree_find(rt, off, size);
if (rs != NULL)
panic("freeing free block; rs=%p", (void *)rs);
- mutex_exit(rt->rt_lock);
}
boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
return (range_tree_find(rt, start, size) != NULL);
}
/*
* Ensure that this range is not in the tree, regardless of whether
* it is currently in the tree.
*/
void
range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
{
range_seg_t *rs;
+ if (size == 0)
+ return;
+
while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
uint64_t free_start = MAX(rs->rs_start, start);
uint64_t free_end = MIN(rs->rs_end, start + size);
range_tree_remove(rt, free_start, free_end - free_start);
}
}
void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
range_tree_t *rt;
- ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
ASSERT0(range_tree_space(*rtdst));
ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
rt = *rtsrc;
*rtsrc = *rtdst;
*rtdst = rt;
}
void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
range_seg_t *rs;
void *cookie = NULL;
- ASSERT(MUTEX_HELD(rt->rt_lock));
if (rt->rt_ops != NULL)
rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
if (func != NULL)
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
kmem_cache_free(range_seg_cache, rs);
}
bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
rt->rt_space = 0;
}
void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
range_seg_t *rs;
- ASSERT(MUTEX_HELD(rt->rt_lock));
for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}
uint64_t
range_tree_space(range_tree_t *rt)
{
return (rt->rt_space);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 332525)
@@ -1,7411 +1,7339 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017 Datto Inc.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
/*
* SPA: Storage Pool Allocator
*
* This file contains all the routines used when modifying on-disk SPA state.
* This includes opening, importing, destroying, exporting a pool, and syncing a
* pool.
*/
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
+#include <sys/bpobj.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif /* _KERNEL */
#include "zfs_prop.h"
#include "zfs_comutil.h"
/* Check hostid on import? */
static int check_hostid = 1;
/*
* The interval, in seconds, at which failed configuration cache file writes
* should be retried.
*/
-static int zfs_ccw_retry_interval = 300;
+int zfs_ccw_retry_interval = 300;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
"Check hostid on import?");
TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
&zfs_ccw_retry_interval, 0,
"Configuration cache file write, retry after failure, interval (seconds)");
typedef enum zti_modes {
ZTI_MODE_FIXED, /* value is # of threads (min 1) */
ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
ZTI_MODE_NULL, /* don't create a taskq */
ZTI_NMODES
} zti_modes_t;
#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
#define ZTI_N(n) ZTI_P(n, 1)
#define ZTI_ONE ZTI_N(1)
typedef struct zio_taskq_info {
zti_modes_t zti_mode;
uint_t zti_value;
uint_t zti_count;
} zio_taskq_info_t;
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
"issue", "issue_high", "intr", "intr_high"
};
/*
* This table defines the taskq settings for each ZFS I/O type. When
* initializing a pool, we use this table to create an appropriately sized
* taskq. Some operations are low volume and therefore have a small, static
* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
* macros. Other operations process a large amount of data; the ZTI_BATCH
* macro causes us to create a taskq oriented for throughput. Some operations
* are so high frequency and short-lived that the taskq itself can become a a
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
* additional degree of parallelism specified by the number of threads per-
* taskq and the number of taskqs; when dispatching an event in this case, the
* particular taskq is chosen at random.
*
* The different taskq priorities are to handle the different contexts (issue
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
* need to be handled with minimum delay.
*/
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
{ ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */
{ ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
};
-static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl,
- const char *name);
-static void spa_event_post(sysevent_t *ev);
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
- spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+ spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);
uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
uint_t zio_taskq_basedc = 80; /* base duty cycle */
#endif
boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
extern int zfs_sync_pass_deferred_free;
/*
* This (illegal) pool name is used when temporarily importing a spa_t in order
* to get the vdev stats associated with the imported devices.
*/
#define TRYIMPORT_NAME "$import"
/*
* ==========================================================================
* SPA properties routines
* ==========================================================================
*/
/*
* Add a (source=src, propname=propval) list to an nvlist.
*/
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
uint64_t intval, zprop_source_t src)
{
const char *propname = zpool_prop_to_name(prop);
nvlist_t *propval;
VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
if (strval != NULL)
VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
else
VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
nvlist_free(propval);
}
/*
* Get property values from the spa configuration.
*/
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
vdev_t *rvd = spa->spa_root_vdev;
dsl_pool_t *pool = spa->spa_dsl_pool;
uint64_t size, alloc, cap, version;
zprop_source_t src = ZPROP_SRC_NONE;
spa_config_dirent_t *dp;
metaslab_class_t *mc = spa_normal_class(spa);
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
if (rvd != NULL) {
alloc = metaslab_class_get_alloc(spa_normal_class(spa));
size = metaslab_class_get_space(spa_normal_class(spa));
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
size - alloc, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
metaslab_class_fragmentation(mc), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
metaslab_class_expandable_space(mc), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
(spa_mode(spa) == FREAD), src);
cap = (size == 0) ? 0 : (alloc * 100 / size);
spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
ddt_get_pool_dedup_ratio(spa), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
rvd->vdev_state, src);
version = spa_version(spa);
if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
src = ZPROP_SRC_DEFAULT;
else
src = ZPROP_SRC_LOCAL;
spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
}
if (pool != NULL) {
/*
* The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
* when opening pools before this version freedir will be NULL.
*/
if (pool->dp_free_dir != NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
src);
} else {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
NULL, 0, src);
}
if (pool->dp_leak_dir != NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
src);
} else {
spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
NULL, 0, src);
}
}
spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
if (spa->spa_comment != NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
0, ZPROP_SRC_LOCAL);
}
if (spa->spa_root != NULL)
spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
0, ZPROP_SRC_LOCAL);
if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
} else {
spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
}
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
"none", 0, ZPROP_SRC_LOCAL);
} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
dp->scd_path, 0, ZPROP_SRC_LOCAL);
}
}
}
/*
* Get zpool property values.
*/
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
objset_t *mos = spa->spa_meta_objset;
zap_cursor_t zc;
zap_attribute_t za;
int err;
VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
mutex_enter(&spa->spa_props_lock);
/*
* Get properties from the spa config.
*/
spa_prop_get_config(spa, nvp);
/* If no pool property object, no more prop to get. */
if (mos == NULL || spa->spa_pool_props_object == 0) {
mutex_exit(&spa->spa_props_lock);
return (0);
}
/*
* Get properties from the MOS pool property object.
*/
for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
(err = zap_cursor_retrieve(&zc, &za)) == 0;
zap_cursor_advance(&zc)) {
uint64_t intval = 0;
char *strval = NULL;
zprop_source_t src = ZPROP_SRC_DEFAULT;
zpool_prop_t prop;
if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
continue;
switch (za.za_integer_length) {
case 8:
/* integer property */
if (za.za_first_integer !=
zpool_prop_default_numeric(prop))
src = ZPROP_SRC_LOCAL;
if (prop == ZPOOL_PROP_BOOTFS) {
dsl_pool_t *dp;
dsl_dataset_t *ds = NULL;
dp = spa_get_dsl(spa);
dsl_pool_config_enter(dp, FTAG);
if (err = dsl_dataset_hold_obj(dp,
za.za_first_integer, FTAG, &ds)) {
dsl_pool_config_exit(dp, FTAG);
break;
}
strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
KM_SLEEP);
dsl_dataset_name(ds, strval);
dsl_dataset_rele(ds, FTAG);
dsl_pool_config_exit(dp, FTAG);
} else {
strval = NULL;
intval = za.za_first_integer;
}
spa_prop_add_list(*nvp, prop, strval, intval, src);
if (strval != NULL)
kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
break;
case 1:
/* string property */
strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
err = zap_lookup(mos, spa->spa_pool_props_object,
za.za_name, 1, za.za_num_integers, strval);
if (err) {
kmem_free(strval, za.za_num_integers);
break;
}
spa_prop_add_list(*nvp, prop, strval, 0, src);
kmem_free(strval, za.za_num_integers);
break;
default:
break;
}
}
zap_cursor_fini(&zc);
mutex_exit(&spa->spa_props_lock);
out:
if (err && err != ENOENT) {
nvlist_free(*nvp);
*nvp = NULL;
return (err);
}
return (0);
}
/*
* Validate the given pool properties nvlist and modify the list
* for the property values to be set.
*/
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
nvpair_t *elem;
int error = 0, reset_bootfs = 0;
uint64_t objnum = 0;
boolean_t has_feature = B_FALSE;
elem = NULL;
while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
uint64_t intval;
char *strval, *slash, *check, *fname;
const char *propname = nvpair_name(elem);
zpool_prop_t prop = zpool_name_to_prop(propname);
switch (prop) {
case ZPOOL_PROP_INVAL:
if (!zpool_prop_feature(propname)) {
error = SET_ERROR(EINVAL);
break;
}
/*
* Sanitize the input.
*/
if (nvpair_type(elem) != DATA_TYPE_UINT64) {
error = SET_ERROR(EINVAL);
break;
}
if (nvpair_value_uint64(elem, &intval) != 0) {
error = SET_ERROR(EINVAL);
break;
}
if (intval != 0) {
error = SET_ERROR(EINVAL);
break;
}
fname = strchr(propname, '@') + 1;
if (zfeature_lookup_name(fname, NULL) != 0) {
error = SET_ERROR(EINVAL);
break;
}
has_feature = B_TRUE;
break;
case ZPOOL_PROP_VERSION:
error = nvpair_value_uint64(elem, &intval);
if (!error &&
(intval < spa_version(spa) ||
intval > SPA_VERSION_BEFORE_FEATURES ||
has_feature))
error = SET_ERROR(EINVAL);
break;
case ZPOOL_PROP_DELEGATION:
case ZPOOL_PROP_AUTOREPLACE:
case ZPOOL_PROP_LISTSNAPS:
case ZPOOL_PROP_AUTOEXPAND:
error = nvpair_value_uint64(elem, &intval);
if (!error && intval > 1)
error = SET_ERROR(EINVAL);
break;
case ZPOOL_PROP_BOOTFS:
/*
* If the pool version is less than SPA_VERSION_BOOTFS,
* or the pool is still being created (version == 0),
* the bootfs property cannot be set.
*/
if (spa_version(spa) < SPA_VERSION_BOOTFS) {
error = SET_ERROR(ENOTSUP);
break;
}
/*
* Make sure the vdev config is bootable
*/
if (!vdev_is_bootable(spa->spa_root_vdev)) {
error = SET_ERROR(ENOTSUP);
break;
}
reset_bootfs = 1;
error = nvpair_value_string(elem, &strval);
if (!error) {
objset_t *os;
uint64_t propval;
if (strval == NULL || strval[0] == '\0') {
objnum = zpool_prop_default_numeric(
ZPOOL_PROP_BOOTFS);
break;
}
if (error = dmu_objset_hold(strval, FTAG, &os))
break;
/*
* Must be ZPL, and its property settings
* must be supported by GRUB (compression
* is not gzip, and large blocks are not used).
*/
if (dmu_objset_type(os) != DMU_OST_ZFS) {
error = SET_ERROR(ENOTSUP);
} else if ((error =
dsl_prop_get_int_ds(dmu_objset_ds(os),
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
&propval)) == 0 &&
!BOOTFS_COMPRESS_VALID(propval)) {
error = SET_ERROR(ENOTSUP);
} else {
objnum = dmu_objset_id(os);
}
dmu_objset_rele(os, FTAG);
}
break;
case ZPOOL_PROP_FAILUREMODE:
error = nvpair_value_uint64(elem, &intval);
if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
intval > ZIO_FAILURE_MODE_PANIC))
error = SET_ERROR(EINVAL);
/*
* This is a special case which only occurs when
* the pool has completely failed. This allows
* the user to change the in-core failmode property
* without syncing it out to disk (I/Os might
* currently be blocked). We do this by returning
* EIO to the caller (spa_prop_set) to trick it
* into thinking we encountered a property validation
* error.
*/
if (!error && spa_suspended(spa)) {
spa->spa_failmode = intval;
error = SET_ERROR(EIO);
}
break;
case ZPOOL_PROP_CACHEFILE:
if ((error = nvpair_value_string(elem, &strval)) != 0)
break;
if (strval[0] == '\0')
break;
if (strcmp(strval, "none") == 0)
break;
if (strval[0] != '/') {
error = SET_ERROR(EINVAL);
break;
}
slash = strrchr(strval, '/');
ASSERT(slash != NULL);
if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
strcmp(slash, "/..") == 0)
error = SET_ERROR(EINVAL);
break;
case ZPOOL_PROP_COMMENT:
if ((error = nvpair_value_string(elem, &strval)) != 0)
break;
for (check = strval; *check != '\0'; check++) {
/*
* The kernel doesn't have an easy isprint()
* check. For this kernel check, we merely
* check ASCII apart from DEL. Fix this if
* there is an easy-to-use kernel isprint().
*/
if (*check >= 0x7f) {
error = SET_ERROR(EINVAL);
break;
}
}
if (strlen(strval) > ZPROP_MAX_COMMENT)
error = E2BIG;
break;
case ZPOOL_PROP_DEDUPDITTO:
if (spa_version(spa) < SPA_VERSION_DEDUP)
error = SET_ERROR(ENOTSUP);
else
error = nvpair_value_uint64(elem, &intval);
if (error == 0 &&
intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
error = SET_ERROR(EINVAL);
break;
}
if (error)
break;
}
if (!error && reset_bootfs) {
error = nvlist_remove(props,
zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
if (!error) {
error = nvlist_add_uint64(props,
zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
}
}
return (error);
}
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
char *cachefile;
spa_config_dirent_t *dp;
if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
&cachefile) != 0)
return;
dp = kmem_alloc(sizeof (spa_config_dirent_t),
KM_SLEEP);
if (cachefile[0] == '\0')
dp->scd_path = spa_strdup(spa_config_path);
else if (strcmp(cachefile, "none") == 0)
dp->scd_path = NULL;
else
dp->scd_path = spa_strdup(cachefile);
list_insert_head(&spa->spa_config_list, dp);
if (need_sync)
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
int error;
nvpair_t *elem = NULL;
boolean_t need_sync = B_FALSE;
if ((error = spa_prop_validate(spa, nvp)) != 0)
return (error);
while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
if (prop == ZPOOL_PROP_CACHEFILE ||
prop == ZPOOL_PROP_ALTROOT ||
prop == ZPOOL_PROP_READONLY)
continue;
if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
uint64_t ver;
if (prop == ZPOOL_PROP_VERSION) {
VERIFY(nvpair_value_uint64(elem, &ver) == 0);
} else {
ASSERT(zpool_prop_feature(nvpair_name(elem)));
ver = SPA_VERSION_FEATURES;
need_sync = B_TRUE;
}
/* Save time if the version is already set. */
if (ver == spa_version(spa))
continue;
/*
* In addition to the pool directory object, we might
* create the pool properties object, the features for
* read object, the features for write object, or the
* feature descriptions object.
*/
error = dsl_sync_task(spa->spa_name, NULL,
spa_sync_version, &ver,
6, ZFS_SPACE_CHECK_RESERVED);
if (error)
return (error);
continue;
}
need_sync = B_TRUE;
break;
}
if (need_sync) {
return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
nvp, 6, ZFS_SPACE_CHECK_RESERVED));
}
return (0);
}
/*
* If the bootfs property value is dsobj, clear it.
*/
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
VERIFY(zap_remove(spa->spa_meta_objset,
spa->spa_pool_props_object,
zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
spa->spa_bootfs = 0;
}
}
/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
uint64_t *newguid = arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
vdev_t *rvd = spa->spa_root_vdev;
uint64_t vdev_state;
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
vdev_state = rvd->vdev_state;
spa_config_exit(spa, SCL_STATE, FTAG);
if (vdev_state != VDEV_STATE_HEALTHY)
return (SET_ERROR(ENXIO));
ASSERT3U(spa_guid(spa), !=, *newguid);
return (0);
}
static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
uint64_t *newguid = arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
uint64_t oldguid;
vdev_t *rvd = spa->spa_root_vdev;
oldguid = spa_guid(spa);
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
rvd->vdev_guid = *newguid;
rvd->vdev_guid_sum += (*newguid - oldguid);
vdev_config_dirty(rvd);
spa_config_exit(spa, SCL_STATE, FTAG);
spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
oldguid, *newguid);
}
/*
* Change the GUID for the pool. This is done so that we can later
* re-import a pool built from a clone of our own vdevs. We will modify
* the root vdev's guid, our own pool guid, and then mark all of our
* vdevs dirty. Note that we must make sure that all our vdevs are
* online when we do this, or else any vdevs that weren't present
* would be orphaned from our pool. We are also going to issue a
* sysevent to update any watchers.
*/
int
spa_change_guid(spa_t *spa)
{
int error;
uint64_t guid;
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
guid = spa_generate_guid(NULL);
error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
if (error == 0) {
- spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
}
mutex_exit(&spa_namespace_lock);
mutex_exit(&spa->spa_vdev_top_lock);
return (error);
}
/*
* ==========================================================================
* SPA state manipulation (open/create/destroy/import/export)
* ==========================================================================
*/
static int
spa_error_entry_compare(const void *a, const void *b)
{
spa_error_entry_t *sa = (spa_error_entry_t *)a;
spa_error_entry_t *sb = (spa_error_entry_t *)b;
int ret;
ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
sizeof (zbookmark_phys_t));
if (ret < 0)
return (-1);
else if (ret > 0)
return (1);
else
return (0);
}
/*
* Utility function which retrieves copies of the current logs and
* re-initializes them in the process.
*/
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
avl_create(&spa->spa_errlist_scrub,
spa_error_entry_compare, sizeof (spa_error_entry_t),
offsetof(spa_error_entry_t, se_avl));
avl_create(&spa->spa_errlist_last,
spa_error_entry_compare, sizeof (spa_error_entry_t),
offsetof(spa_error_entry_t, se_avl));
}
static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
enum zti_modes mode = ztip->zti_mode;
uint_t value = ztip->zti_value;
uint_t count = ztip->zti_count;
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
char name[32];
uint_t flags = 0;
boolean_t batch = B_FALSE;
if (mode == ZTI_MODE_NULL) {
tqs->stqs_count = 0;
tqs->stqs_taskq = NULL;
return;
}
ASSERT3U(count, >, 0);
tqs->stqs_count = count;
tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
switch (mode) {
case ZTI_MODE_FIXED:
ASSERT3U(value, >=, 1);
value = MAX(value, 1);
break;
case ZTI_MODE_BATCH:
batch = B_TRUE;
flags |= TASKQ_THREADS_CPU_PCT;
value = zio_taskq_batch_pct;
break;
default:
panic("unrecognized mode for %s_%s taskq (%u:%u) in "
"spa_activate()",
zio_type_name[t], zio_taskq_types[q], mode, value);
break;
}
for (uint_t i = 0; i < count; i++) {
taskq_t *tq;
if (count > 1) {
(void) snprintf(name, sizeof (name), "%s_%s_%u",
zio_type_name[t], zio_taskq_types[q], i);
} else {
(void) snprintf(name, sizeof (name), "%s_%s",
zio_type_name[t], zio_taskq_types[q]);
}
#ifdef SYSDC
if (zio_taskq_sysdc && spa->spa_proc != &p0) {
if (batch)
flags |= TASKQ_DC_BATCH;
tq = taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags);
} else {
#endif
pri_t pri = maxclsyspri;
/*
* The write issue taskq can be extremely CPU
* intensive. Run it at slightly lower priority
* than the other taskqs.
* FreeBSD notes:
* - numerically higher priorities are lower priorities;
* - if priorities divided by four (RQ_PPQ) are equal
* then a difference between them is insignificant.
*/
if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
#ifdef illumos
pri--;
#else
pri += 4;
#endif
tq = taskq_create_proc(name, value, pri, 50,
INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
}
#endif
tqs->stqs_taskq[i] = tq;
}
}
static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
if (tqs->stqs_taskq == NULL) {
ASSERT0(tqs->stqs_count);
return;
}
for (uint_t i = 0; i < tqs->stqs_count; i++) {
ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
taskq_destroy(tqs->stqs_taskq[i]);
}
kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
tqs->stqs_taskq = NULL;
}
/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
* on the taskq itself. In that case we choose which taskq at random by using
* the low bits of gethrtime().
*/
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
taskq_t *tq;
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
} else {
#ifdef _KERNEL
tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
}
taskq_dispatch_ent(tq, func, arg, flags, ent);
}
static void
spa_create_zio_taskqs(spa_t *spa)
{
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
spa_taskqs_init(spa, t, q);
}
}
}
#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
callb_cpr_t cprinfo;
spa_t *spa = arg;
user_t *pu = PTOU(curproc);
CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
spa->spa_name);
ASSERT(curproc != &p0);
(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
"zpool-%s", spa->spa_name);
(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
#ifdef PSRSET_BIND
/* bind this thread to the requested psrset */
if (zio_taskq_psrset_bind != PS_NONE) {
pool_lock();
mutex_enter(&cpu_lock);
mutex_enter(&pidlock);
mutex_enter(&curproc->p_lock);
if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
0, NULL, NULL) == 0) {
curthread->t_bind_pset = zio_taskq_psrset_bind;
} else {
cmn_err(CE_WARN,
"Couldn't bind process for zfs pool \"%s\" to "
"pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
}
mutex_exit(&curproc->p_lock);
mutex_exit(&pidlock);
mutex_exit(&cpu_lock);
pool_unlock();
}
#endif
#ifdef SYSDC
if (zio_taskq_sysdc) {
sysdc_thread_enter(curthread, 100, 0);
}
#endif
spa->spa_proc = curproc;
spa->spa_did = curthread->t_did;
spa_create_zio_taskqs(spa);
mutex_enter(&spa->spa_proc_lock);
ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
spa->spa_proc_state = SPA_PROC_ACTIVE;
cv_broadcast(&spa->spa_proc_cv);
CALLB_CPR_SAFE_BEGIN(&cprinfo);
while (spa->spa_proc_state == SPA_PROC_ACTIVE)
cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
spa->spa_proc_state = SPA_PROC_GONE;
spa->spa_proc = &p0;
cv_broadcast(&spa->spa_proc_cv);
CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
mutex_enter(&curproc->p_lock);
lwp_exit();
}
#endif /* SPA_PROCESS */
#endif
/*
* Activate an uninitialized pool.
*/
static void
spa_activate(spa_t *spa, int mode)
{
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
/* Try to create a covering process */
mutex_enter(&spa->spa_proc_lock);
ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
ASSERT(spa->spa_proc == &p0);
spa->spa_did = 0;
#ifdef SPA_PROCESS
/* Only create a process if we're going to be around a while. */
if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
NULL, 0) == 0) {
spa->spa_proc_state = SPA_PROC_CREATED;
while (spa->spa_proc_state == SPA_PROC_CREATED) {
cv_wait(&spa->spa_proc_cv,
&spa->spa_proc_lock);
}
ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
ASSERT(spa->spa_proc != &p0);
ASSERT(spa->spa_did != 0);
} else {
#ifdef _KERNEL
cmn_err(CE_WARN,
"Couldn't create process for zfs pool \"%s\"\n",
spa->spa_name);
#endif
}
}
#endif /* SPA_PROCESS */
mutex_exit(&spa->spa_proc_lock);
/* If we didn't create a process, we need to create our taskqs. */
ASSERT(spa->spa_proc == &p0);
if (spa->spa_proc == &p0) {
spa_create_zio_taskqs(spa);
}
/*
* Start TRIM thread.
*/
trim_thread_create(spa);
+ for (size_t i = 0; i < TXG_SIZE; i++)
+ spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);
+
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
offsetof(objset_t, os_evicting_node));
list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_state_dirty_node));
txg_list_create(&spa->spa_vdev_txg_list, spa,
offsetof(struct vdev, vdev_txg_node));
avl_create(&spa->spa_errlist_scrub,
spa_error_entry_compare, sizeof (spa_error_entry_t),
offsetof(spa_error_entry_t, se_avl));
avl_create(&spa->spa_errlist_last,
spa_error_entry_compare, sizeof (spa_error_entry_t),
offsetof(spa_error_entry_t, se_avl));
}
/*
* Opposite of spa_activate().
*/
static void
spa_deactivate(spa_t *spa)
{
ASSERT(spa->spa_sync_on == B_FALSE);
ASSERT(spa->spa_dsl_pool == NULL);
ASSERT(spa->spa_root_vdev == NULL);
ASSERT(spa->spa_async_zio_root == NULL);
ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
/*
* Stop TRIM thread in case spa_unload() wasn't called directly
* before spa_deactivate().
*/
trim_thread_destroy(spa);
spa_evicting_os_wait(spa);
txg_list_destroy(&spa->spa_vdev_txg_list);
list_destroy(&spa->spa_config_dirty_list);
list_destroy(&spa->spa_evicting_os_list);
list_destroy(&spa->spa_state_dirty_list);
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
spa_taskqs_fini(spa, t, q);
}
}
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
+ VERIFY0(zio_wait(spa->spa_txg_zio[i]));
+ spa->spa_txg_zio[i] = NULL;
+ }
+
metaslab_class_destroy(spa->spa_normal_class);
spa->spa_normal_class = NULL;
metaslab_class_destroy(spa->spa_log_class);
spa->spa_log_class = NULL;
/*
* If this was part of an import or the open otherwise failed, we may
* still have errors left in the queues. Empty them just in case.
*/
spa_errlog_drain(spa);
avl_destroy(&spa->spa_errlist_scrub);
avl_destroy(&spa->spa_errlist_last);
spa->spa_state = POOL_STATE_UNINITIALIZED;
mutex_enter(&spa->spa_proc_lock);
if (spa->spa_proc_state != SPA_PROC_NONE) {
ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
spa->spa_proc_state = SPA_PROC_DEACTIVATE;
cv_broadcast(&spa->spa_proc_cv);
while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
ASSERT(spa->spa_proc != &p0);
cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
}
ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
spa->spa_proc_state = SPA_PROC_NONE;
}
ASSERT(spa->spa_proc == &p0);
mutex_exit(&spa->spa_proc_lock);
#ifdef SPA_PROCESS
/*
* We want to make sure spa_thread() has actually exited the ZFS
* module, so that the module can't be unloaded out from underneath
* it.
*/
if (spa->spa_did != 0) {
thread_join(spa->spa_did);
spa->spa_did = 0;
}
#endif /* SPA_PROCESS */
}
/*
* Verify a pool configuration, and construct the vdev tree appropriately. This
* will create all the necessary vdevs in the appropriate layout, with each vdev
* in the CLOSED state. This will prep the pool before open/creation/import.
* All vdev validation is done by the vdev_alloc() routine.
*/
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
uint_t id, int atype)
{
nvlist_t **child;
uint_t children;
int error;
if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
return (error);
if ((*vdp)->vdev_ops->vdev_op_leaf)
return (0);
error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children);
if (error == ENOENT)
return (0);
if (error) {
vdev_free(*vdp);
*vdp = NULL;
return (SET_ERROR(EINVAL));
}
for (int c = 0; c < children; c++) {
vdev_t *vd;
if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
atype)) != 0) {
vdev_free(*vdp);
*vdp = NULL;
return (error);
}
}
ASSERT(*vdp != NULL);
return (0);
}
/*
* Opposite of spa_load().
*/
static void
spa_unload(spa_t *spa)
{
int i;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
/*
* Stop TRIM thread.
*/
trim_thread_destroy(spa);
/*
* Stop async tasks.
*/
spa_async_suspend(spa);
/*
* Stop syncing.
*/
if (spa->spa_sync_on) {
txg_sync_stop(spa->spa_dsl_pool);
spa->spa_sync_on = B_FALSE;
}
/*
* Even though vdev_free() also calls vdev_metaslab_fini, we need
* to call it earlier, before we wait for async i/o to complete.
* This ensures that there is no async metaslab prefetching, by
* calling taskq_wait(mg_taskq).
*/
if (spa->spa_root_vdev != NULL) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
* Wait for any outstanding async I/O to complete.
*/
if (spa->spa_async_zio_root != NULL) {
for (int i = 0; i < max_ncpus; i++)
(void) zio_wait(spa->spa_async_zio_root[i]);
kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
spa->spa_async_zio_root = NULL;
}
+ if (spa->spa_vdev_removal != NULL) {
+ spa_vdev_removal_destroy(spa->spa_vdev_removal);
+ spa->spa_vdev_removal = NULL;
+ }
+
+ spa_condense_fini(spa);
+
bpobj_close(&spa->spa_deferred_bpobj);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
* Close all vdevs.
*/
if (spa->spa_root_vdev)
vdev_free(spa->spa_root_vdev);
ASSERT(spa->spa_root_vdev == NULL);
/*
* Close the dsl pool.
*/
if (spa->spa_dsl_pool) {
dsl_pool_close(spa->spa_dsl_pool);
spa->spa_dsl_pool = NULL;
spa->spa_meta_objset = NULL;
}
ddt_unload(spa);
/*
* Drop and purge level 2 cache
*/
spa_l2cache_drop(spa);
for (i = 0; i < spa->spa_spares.sav_count; i++)
vdev_free(spa->spa_spares.sav_vdevs[i]);
if (spa->spa_spares.sav_vdevs) {
kmem_free(spa->spa_spares.sav_vdevs,
spa->spa_spares.sav_count * sizeof (void *));
spa->spa_spares.sav_vdevs = NULL;
}
if (spa->spa_spares.sav_config) {
nvlist_free(spa->spa_spares.sav_config);
spa->spa_spares.sav_config = NULL;
}
spa->spa_spares.sav_count = 0;
for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
vdev_free(spa->spa_l2cache.sav_vdevs[i]);
}
if (spa->spa_l2cache.sav_vdevs) {
kmem_free(spa->spa_l2cache.sav_vdevs,
spa->spa_l2cache.sav_count * sizeof (void *));
spa->spa_l2cache.sav_vdevs = NULL;
}
if (spa->spa_l2cache.sav_config) {
nvlist_free(spa->spa_l2cache.sav_config);
spa->spa_l2cache.sav_config = NULL;
}
spa->spa_l2cache.sav_count = 0;
spa->spa_async_suspended = 0;
+ spa->spa_indirect_vdevs_loaded = B_FALSE;
+
if (spa->spa_comment != NULL) {
spa_strfree(spa->spa_comment);
spa->spa_comment = NULL;
}
spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
* Load (or re-load) the current list of vdevs describing the active spares for
* this pool. When this is called, we have some form of basic information in
* 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
* then re-generate a more complete list including status information.
*/
-static void
+void
spa_load_spares(spa_t *spa)
{
nvlist_t **spares;
uint_t nspares;
int i;
vdev_t *vd, *tvd;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
/*
* First, close and free any existing spare vdevs.
*/
for (i = 0; i < spa->spa_spares.sav_count; i++) {
vd = spa->spa_spares.sav_vdevs[i];
/* Undo the call to spa_activate() below */
if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
B_FALSE)) != NULL && tvd->vdev_isspare)
spa_spare_remove(tvd);
vdev_close(vd);
vdev_free(vd);
}
if (spa->spa_spares.sav_vdevs)
kmem_free(spa->spa_spares.sav_vdevs,
spa->spa_spares.sav_count * sizeof (void *));
if (spa->spa_spares.sav_config == NULL)
nspares = 0;
else
VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
spa->spa_spares.sav_count = (int)nspares;
spa->spa_spares.sav_vdevs = NULL;
if (nspares == 0)
return;
/*
* Construct the array of vdevs, opening them to get status in the
* process. For each spare, there is potentially two different vdev_t
* structures associated with it: one in the list of spares (used only
* for basic validation purposes) and one in the active vdev
* configuration (if it's spared in). During this phase we open and
* validate each vdev on the spare list. If the vdev also exists in the
* active configuration, then we also mark this vdev as an active spare.
*/
spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
KM_SLEEP);
for (i = 0; i < spa->spa_spares.sav_count; i++) {
VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
VDEV_ALLOC_SPARE) == 0);
ASSERT(vd != NULL);
spa->spa_spares.sav_vdevs[i] = vd;
if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
B_FALSE)) != NULL) {
if (!tvd->vdev_isspare)
spa_spare_add(tvd);
/*
* We only mark the spare active if we were successfully
* able to load the vdev. Otherwise, importing a pool
* with a bad active spare would result in strange
* behavior, because multiple pool would think the spare
* is actively in use.
*
* There is a vulnerability here to an equally bizarre
* circumstance, where a dead active spare is later
* brought back to life (onlined or otherwise). Given
* the rarity of this scenario, and the extra complexity
* it adds, we ignore the possibility.
*/
if (!vdev_is_dead(tvd))
spa_spare_activate(tvd);
}
vd->vdev_top = vd;
vd->vdev_aux = &spa->spa_spares;
if (vdev_open(vd) != 0)
continue;
if (vdev_validate_aux(vd) == 0)
spa_spare_add(vd);
}
/*
* Recompute the stashed list of spares, with status information
* this time.
*/
VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
DATA_TYPE_NVLIST_ARRAY) == 0);
spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
KM_SLEEP);
for (i = 0; i < spa->spa_spares.sav_count; i++)
spares[i] = vdev_config_generate(spa,
spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
for (i = 0; i < spa->spa_spares.sav_count; i++)
nvlist_free(spares[i]);
kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
/*
* Load (or re-load) the current list of vdevs describing the active l2cache for
* this pool. When this is called, we have some form of basic information in
* 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
* then re-generate a more complete list including status information.
* Devices which are already active have their details maintained, and are
* not re-opened.
*/
-static void
+void
spa_load_l2cache(spa_t *spa)
{
nvlist_t **l2cache;
uint_t nl2cache;
int i, j, oldnvdevs;
uint64_t guid;
vdev_t *vd, **oldvdevs, **newvdevs;
spa_aux_vdev_t *sav = &spa->spa_l2cache;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
if (sav->sav_config != NULL) {
VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
} else {
nl2cache = 0;
newvdevs = NULL;
}
oldvdevs = sav->sav_vdevs;
oldnvdevs = sav->sav_count;
sav->sav_vdevs = NULL;
sav->sav_count = 0;
/*
* Process new nvlist of vdevs.
*/
for (i = 0; i < nl2cache; i++) {
VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
&guid) == 0);
newvdevs[i] = NULL;
for (j = 0; j < oldnvdevs; j++) {
vd = oldvdevs[j];
if (vd != NULL && guid == vd->vdev_guid) {
/*
* Retain previous vdev for add/remove ops.
*/
newvdevs[i] = vd;
oldvdevs[j] = NULL;
break;
}
}
if (newvdevs[i] == NULL) {
/*
* Create new vdev
*/
VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
VDEV_ALLOC_L2CACHE) == 0);
ASSERT(vd != NULL);
newvdevs[i] = vd;
/*
* Commit this vdev as an l2cache device,
* even if it fails to open.
*/
spa_l2cache_add(vd);
vd->vdev_top = vd;
vd->vdev_aux = sav;
spa_l2cache_activate(vd);
if (vdev_open(vd) != 0)
continue;
(void) vdev_validate_aux(vd);
if (!vdev_is_dead(vd))
l2arc_add_vdev(spa, vd);
}
}
/*
* Purge vdevs that were dropped
*/
for (i = 0; i < oldnvdevs; i++) {
uint64_t pool;
vd = oldvdevs[i];
if (vd != NULL) {
ASSERT(vd->vdev_isl2cache);
if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
pool != 0ULL && l2arc_vdev_present(vd))
l2arc_remove_vdev(vd);
vdev_clear_stats(vd);
vdev_free(vd);
}
}
if (oldvdevs)
kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
if (sav->sav_config == NULL)
goto out;
sav->sav_vdevs = newvdevs;
sav->sav_count = (int)nl2cache;
/*
* Recompute the stashed list of l2cache devices, with status
* information this time.
*/
VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
DATA_TYPE_NVLIST_ARRAY) == 0);
l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
for (i = 0; i < sav->sav_count; i++)
l2cache[i] = vdev_config_generate(spa,
sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
VERIFY(nvlist_add_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
for (i = 0; i < sav->sav_count; i++)
nvlist_free(l2cache[i]);
if (sav->sav_count)
kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
dmu_buf_t *db;
char *packed = NULL;
size_t nvsize = 0;
int error;
*value = NULL;
error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
if (error != 0)
return (error);
nvsize = *(uint64_t *)db->db_data;
dmu_buf_rele(db, FTAG);
packed = kmem_alloc(nvsize, KM_SLEEP);
error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
DMU_READ_PREFETCH);
if (error == 0)
error = nvlist_unpack(packed, nvsize, value, 0);
kmem_free(packed, nvsize);
return (error);
}
/*
* Checks to see if the given vdev could not be opened, in which case we post a
* sysevent to notify the autoreplace code that the device has been removed.
*/
static void
spa_check_removed(vdev_t *vd)
{
for (int c = 0; c < vd->vdev_children; c++)
spa_check_removed(vd->vdev_child[c]);
if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
- !vd->vdev_ishole) {
+ vdev_is_concrete(vd)) {
zfs_post_autoreplace(vd->vdev_spa, vd);
spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
}
}
static void
spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
{
ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
vd->vdev_top_zap = mvd->vdev_top_zap;
vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
for (uint64_t i = 0; i < vd->vdev_children; i++) {
spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
}
}
/*
* Validate the current config against the MOS config
*/
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
vdev_t *mrvd, *rvd = spa->spa_root_vdev;
nvlist_t *nv;
VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
/*
* If we're doing a normal import, then build up any additional
* diagnostic information about missing devices in this config.
* We'll pass this up to the user for further processing.
*/
if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
nvlist_t **child, *nv;
uint64_t idx = 0;
child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
KM_SLEEP);
VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
vdev_t *mtvd = mrvd->vdev_child[c];
if (tvd->vdev_ops == &vdev_missing_ops &&
mtvd->vdev_ops != &vdev_missing_ops &&
mtvd->vdev_islog)
child[idx++] = vdev_config_generate(spa, mtvd,
B_FALSE, 0);
}
if (idx) {
VERIFY(nvlist_add_nvlist_array(nv,
ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
VERIFY(nvlist_add_nvlist(spa->spa_load_info,
ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
for (int i = 0; i < idx; i++)
nvlist_free(child[i]);
}
nvlist_free(nv);
kmem_free(child, rvd->vdev_children * sizeof (char **));
}
/*
* Compare the root vdev tree with the information we have
* from the MOS config (mrvd). Check each top-level vdev
* with the corresponding MOS config top-level (mtvd).
*/
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
vdev_t *mtvd = mrvd->vdev_child[c];
/*
* Resolve any "missing" vdevs in the current configuration.
+ * Also trust the MOS config about any "indirect" vdevs.
* If we find that the MOS config has more accurate information
* about the top-level vdev then use that vdev instead.
*/
- if (tvd->vdev_ops == &vdev_missing_ops &&
- mtvd->vdev_ops != &vdev_missing_ops) {
+ if ((tvd->vdev_ops == &vdev_missing_ops &&
+ mtvd->vdev_ops != &vdev_missing_ops) ||
+ (mtvd->vdev_ops == &vdev_indirect_ops &&
+ tvd->vdev_ops != &vdev_indirect_ops)) {
- if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
- continue;
-
/*
* Device specific actions.
*/
if (mtvd->vdev_islog) {
+ if (!(spa->spa_import_flags &
+ ZFS_IMPORT_MISSING_LOG)) {
+ continue;
+ }
+
spa_set_log_state(spa, SPA_LOG_CLEAR);
- } else {
- /*
- * XXX - once we have 'readonly' pool
- * support we should be able to handle
- * missing data devices by transitioning
- * the pool to readonly.
- */
+ } else if (mtvd->vdev_ops != &vdev_indirect_ops) {
continue;
}
/*
* Swap the missing vdev with the data we were
* able to obtain from the MOS config.
*/
vdev_remove_child(rvd, tvd);
vdev_remove_child(mrvd, mtvd);
vdev_add_child(rvd, mtvd);
vdev_add_child(mrvd, tvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
- vdev_load(mtvd);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-
vdev_reopen(rvd);
} else {
if (mtvd->vdev_islog) {
/*
* Load the slog device's state from the MOS
* config since it's possible that the label
* does not contain the most up-to-date
* information.
*/
vdev_load_log_state(tvd, mtvd);
vdev_reopen(tvd);
}
/*
* Per-vdev ZAP info is stored exclusively in the MOS.
*/
spa_config_valid_zaps(tvd, mtvd);
}
+
+ /*
+ * Never trust this info from userland; always use what's
+ * in the MOS. This prevents it from getting out of sync
+ * with the rest of the info in the MOS.
+ */
+ tvd->vdev_removing = mtvd->vdev_removing;
+ tvd->vdev_indirect_config = mtvd->vdev_indirect_config;
}
vdev_free(mrvd);
spa_config_exit(spa, SCL_ALL, FTAG);
/*
* Ensure we were able to validate the config.
*/
return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}
/*
* Check for missing log devices
*/
static boolean_t
spa_check_logs(spa_t *spa)
{
boolean_t rv = B_FALSE;
dsl_pool_t *dp = spa_get_dsl(spa);
switch (spa->spa_log_state) {
case SPA_LOG_MISSING:
/* need to recheck in case slog has been restored */
case SPA_LOG_UNKNOWN:
rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
if (rv)
spa_set_log_state(spa, SPA_LOG_MISSING);
break;
}
return (rv);
}
static boolean_t
spa_passivate_log(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
boolean_t slog_found = B_FALSE;
ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
if (!spa_has_slogs(spa))
return (B_FALSE);
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
if (tvd->vdev_islog) {
metaslab_group_passivate(mg);
slog_found = B_TRUE;
}
}
return (slog_found);
}
static void
spa_activate_log(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
if (tvd->vdev_islog)
metaslab_group_activate(mg);
}
}
int
-spa_offline_log(spa_t *spa)
+spa_reset_logs(spa_t *spa)
{
int error;
- error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+ error = dmu_objset_find(spa_name(spa), zil_reset,
NULL, DS_FIND_CHILDREN);
if (error == 0) {
/*
* We successfully offlined the log device, sync out the
* current txg so that the "stubby" block can be removed
* by zil_sync().
*/
txg_wait_synced(spa->spa_dsl_pool, 0);
}
return (error);
}
static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
int i;
for (i = 0; i < sav->sav_count; i++)
spa_check_removed(sav->sav_vdevs[i]);
}
void
spa_claim_notify(zio_t *zio)
{
spa_t *spa = zio->io_spa;
if (zio->io_error)
return;
mutex_enter(&spa->spa_props_lock); /* any mutex will do */
if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
spa->spa_claim_max_txg = zio->io_bp->blk_birth;
mutex_exit(&spa->spa_props_lock);
}
typedef struct spa_load_error {
uint64_t sle_meta_count;
uint64_t sle_data_count;
} spa_load_error_t;
static void
spa_load_verify_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
spa_load_error_t *sle = zio->io_private;
dmu_object_type_t type = BP_GET_TYPE(bp);
int error = zio->io_error;
spa_t *spa = zio->io_spa;
abd_free(zio->io_abd);
if (error) {
if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
type != DMU_OT_INTENT_LOG)
atomic_inc_64(&sle->sle_meta_count);
else
atomic_inc_64(&sle->sle_data_count);
}
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
cv_broadcast(&spa->spa_scrub_io_cv);
mutex_exit(&spa->spa_scrub_lock);
}
/*
* Maximum number of concurrent scrub i/os to create while verifying
* a pool while importing it.
*/
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
&spa_load_verify_maxinflight, 0,
"Maximum number of concurrent scrub I/Os to create while verifying a "
"pool while importing it");
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
&spa_load_verify_metadata, 0,
"Check metadata on import?");
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
&spa_load_verify_data, 0,
"Check user data on import?");
/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
/*
* Note: normally this routine will not be called if
* spa_load_verify_metadata is not set. However, it may be useful
* to manually set the flag after the traversal has begun.
*/
if (!spa_load_verify_metadata)
return (0);
if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
return (0);
zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
return (0);
}
/* ARGSUSED */
int
verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
return (0);
}
static int
spa_load_verify(spa_t *spa)
{
zio_t *rio;
spa_load_error_t sle = { 0 };
zpool_rewind_policy_t policy;
boolean_t verify_ok = B_FALSE;
int error = 0;
zpool_get_rewind_policy(spa->spa_config, &policy);
if (policy.zrp_request & ZPOOL_NEVER_REWIND)
return (0);
dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
error = dmu_objset_find_dp(spa->spa_dsl_pool,
spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
DS_FIND_CHILDREN);
dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
if (error != 0)
return (error);
rio = zio_root(spa, NULL, &sle,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
if (spa_load_verify_metadata) {
error = traverse_pool(spa, spa->spa_verify_min_txg,
TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
spa_load_verify_cb, rio);
}
(void) zio_wait(rio);
spa->spa_load_meta_errors = sle.sle_meta_count;
spa->spa_load_data_errors = sle.sle_data_count;
if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
sle.sle_data_count <= policy.zrp_maxdata) {
int64_t loss = 0;
verify_ok = B_TRUE;
spa->spa_load_txg = spa->spa_uberblock.ub_txg;
spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
VERIFY(nvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
VERIFY(nvlist_add_int64(spa->spa_load_info,
ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
VERIFY(nvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
} else {
spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
}
if (error) {
if (error != ENXIO && error != EIO)
error = SET_ERROR(EIO);
return (error);
}
return (verify_ok ? 0 : EIO);
}
/*
* Find a value in the pool props object.
*/
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}
/*
* Find a value in the pool directory object.
*/
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
name, sizeof (uint64_t), 1, val));
}
static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
- return (err);
+ return (SET_ERROR(err));
}
/*
* Fix up config after a partly-completed split. This is done with the
* ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
* pool have that entry in their config, but only the splitting one contains
* a list of all the guids of the vdevs that are being split off.
*
* This function determines what to do with that list: either rejoin
* all the disks to the pool, or complete the splitting process. To attempt
* the rejoin, each disk that is offlined is marked online again, and
* we do a reopen() call. If the vdev label for every disk that was
* marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
* then we call vdev_split() on each disk, and complete the split.
*
* Otherwise we leave the config alone, with all the vdevs in place in
* the original pool.
*/
static void
spa_try_repair(spa_t *spa, nvlist_t *config)
{
uint_t extracted;
uint64_t *glist;
uint_t i, gcount;
nvlist_t *nvl;
vdev_t **vd;
boolean_t attempt_reopen;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
return;
/* check that the config is complete */
if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
&glist, &gcount) != 0)
return;
vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
/* attempt to online all the vdevs & validate */
attempt_reopen = B_TRUE;
for (i = 0; i < gcount; i++) {
if (glist[i] == 0) /* vdev is hole */
continue;
vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
if (vd[i] == NULL) {
/*
* Don't bother attempting to reopen the disks;
* just do the split.
*/
attempt_reopen = B_FALSE;
} else {
/* attempt to re-online it */
vd[i]->vdev_offline = B_FALSE;
}
}
if (attempt_reopen) {
vdev_reopen(spa->spa_root_vdev);
/* check each device to see what state it's in */
for (extracted = 0, i = 0; i < gcount; i++) {
if (vd[i] != NULL &&
vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
break;
++extracted;
}
}
/*
* If every disk has been moved to the new pool, or if we never
* even attempted to look at them, then we split them off for
* good.
*/
if (!attempt_reopen || gcount == extracted) {
for (i = 0; i < gcount; i++)
if (vd[i] != NULL)
vdev_split(vd[i]);
vdev_reopen(spa->spa_root_vdev);
}
kmem_free(vd, gcount * sizeof (vdev_t *));
}
static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
boolean_t mosconfig)
{
nvlist_t *config = spa->spa_config;
char *ereport = FM_EREPORT_ZFS_POOL;
char *comment;
int error;
uint64_t pool_guid;
nvlist_t *nvl;
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
return (SET_ERROR(EINVAL));
ASSERT(spa->spa_comment == NULL);
if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
spa->spa_comment = spa_strdup(comment);
/*
* Versioning wasn't explicitly added to the label until later, so if
* it's not present treat it as the initial version.
*/
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&spa->spa_ubsync.ub_version) != 0)
spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
&spa->spa_config_txg);
if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
spa_guid_exists(pool_guid, 0)) {
error = SET_ERROR(EEXIST);
} else {
spa->spa_config_guid = pool_guid;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
&nvl) == 0) {
VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
KM_SLEEP) == 0);
}
nvlist_free(spa->spa_load_info);
spa->spa_load_info = fnvlist_alloc();
gethrestime(&spa->spa_loaded_ts);
error = spa_load_impl(spa, pool_guid, config, state, type,
mosconfig, &ereport);
}
/*
* Don't count references from objsets that are already closed
* and are making their way through the eviction process.
*/
spa_evicting_os_wait(spa);
spa->spa_minref = refcount_count(&spa->spa_refcount);
if (error) {
if (error != EEXIST) {
spa->spa_loaded_ts.tv_sec = 0;
spa->spa_loaded_ts.tv_nsec = 0;
}
if (error != EBADF) {
zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
}
}
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
spa->spa_ena = 0;
return (error);
}
/*
* Count the number of per-vdev ZAPs associated with all of the vdevs in the
* vdev tree rooted in the given vd, and ensure that each ZAP is present in the
* spa's per-vdev ZAP list.
*/
static uint64_t
vdev_count_verify_zaps(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
uint64_t total = 0;
if (vd->vdev_top_zap != 0) {
total++;
ASSERT0(zap_lookup_int(spa->spa_meta_objset,
spa->spa_all_vdev_zaps, vd->vdev_top_zap));
}
if (vd->vdev_leaf_zap != 0) {
total++;
ASSERT0(zap_lookup_int(spa->spa_meta_objset,
spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
}
for (uint64_t i = 0; i < vd->vdev_children; i++) {
total += vdev_count_verify_zaps(vd->vdev_child[i]);
}
return (total);
}
/*
* Load an existing storage pool, using the pool's builtin spa_config as a
* source of configuration information.
*/
static int
spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
- spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+ spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
char **ereport)
{
int error = 0;
nvlist_t *nvroot = NULL;
nvlist_t *label;
vdev_t *rvd;
uberblock_t *ub = &spa->spa_uberblock;
uint64_t children, config_cache_txg = spa->spa_config_txg;
int orig_mode = spa->spa_mode;
int parse;
uint64_t obj;
boolean_t missing_feat_write = B_FALSE;
/*
* If this is an untrusted config, access the pool in read-only mode.
* This prevents things like resilvering recently removed devices.
*/
- if (!mosconfig)
+ if (!trust_config)
spa->spa_mode = FREAD;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa->spa_load_state = state;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
return (SET_ERROR(EINVAL));
parse = (type == SPA_IMPORT_EXISTING ?
VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
/*
* Create "The Godfather" zio to hold all async IOs
*/
spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
KM_SLEEP);
for (int i = 0; i < max_ncpus; i++) {
spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
ZIO_FLAG_GODFATHER);
}
/*
* Parse the configuration into a vdev tree. We explicitly set the
* value that will be returned by spa_version() since parsing the
* configuration requires knowing the version number.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0)
return (error);
ASSERT(spa->spa_root_vdev == rvd);
ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
if (type != SPA_IMPORT_ASSEMBLE) {
ASSERT(spa_guid(spa) == pool_guid);
}
/*
* Try to open all vdevs, loading each label in the process.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
error = vdev_open(rvd);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0)
return (error);
/*
* We need to validate the vdev labels against the configuration that
* we have in hand, which is dependent on the setting of mosconfig. If
* mosconfig is true then we're validating the vdev labels based on
* that config. Otherwise, we're validating against the cached config
* (zpool.cache) that was read when we loaded the zfs module, and then
* later we will recursively call spa_load() and validate against
* the vdev config.
*
* If we're assembling a new pool that's been split off from an
* existing pool, the labels haven't yet been updated so we skip
* validation for now.
*/
if (type != SPA_IMPORT_ASSEMBLE) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_validate(rvd, mosconfig);
+ error = vdev_validate(rvd, trust_config);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0)
return (error);
if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
return (SET_ERROR(ENXIO));
}
/*
* Find the best uberblock.
*/
vdev_uberblock_load(rvd, ub, &label);
/*
* If we weren't able to find a single valid uberblock, return failure.
*/
if (ub->ub_txg == 0) {
nvlist_free(label);
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
}
/*
* If the pool has an unsupported version we can't open it.
*/
if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
nvlist_free(label);
return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
}
if (ub->ub_version >= SPA_VERSION_FEATURES) {
nvlist_t *features;
/*
* If we weren't able to find what's necessary for reading the
* MOS in the label, return failure.
*/
if (label == NULL || nvlist_lookup_nvlist(label,
ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
nvlist_free(label);
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
ENXIO));
}
/*
* Update our in-core representation with the definitive values
* from the label.
*/
nvlist_free(spa->spa_label_features);
VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
}
nvlist_free(label);
/*
* Look through entries in the label nvlist's features_for_read. If
* there is a feature listed there which we don't understand then we
* cannot open a pool.
*/
if (ub->ub_version >= SPA_VERSION_FEATURES) {
nvlist_t *unsup_feat;
VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
0);
for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
NULL); nvp != NULL;
nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
if (!zfeature_is_supported(nvpair_name(nvp))) {
VERIFY(nvlist_add_string(unsup_feat,
nvpair_name(nvp), "") == 0);
}
}
if (!nvlist_empty(unsup_feat)) {
VERIFY(nvlist_add_nvlist(spa->spa_load_info,
ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
nvlist_free(unsup_feat);
return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
ENOTSUP));
}
nvlist_free(unsup_feat);
}
/*
* If the vdev guid sum doesn't match the uberblock, we have an
* incomplete configuration. We first check to see if the pool
* is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
* If it is, defer the vdev_guid_sum check till later so we
* can handle missing vdevs.
*/
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
- &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+ &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE &&
rvd->vdev_guid_sum != ub->ub_guid_sum)
return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_try_repair(spa, config);
spa_config_exit(spa, SCL_ALL, FTAG);
nvlist_free(spa->spa_config_splitting);
spa->spa_config_splitting = NULL;
}
/*
* Initialize internal SPA structures.
*/
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
spa->spa_first_txg = spa->spa_last_ubsync_txg ?
spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
spa->spa_claim_max_txg = spa->spa_first_txg;
spa->spa_prev_software_version = ub->ub_software_version;
+ /*
+ * Everything that we read before we do spa_remove_init() must
+ * have been rewritten after the last device removal was initiated.
+ * Otherwise we could be reading from indirect vdevs before
+ * we have loaded their mappings.
+ */
+
error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
if (error)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ /*
+ * Validate the config, using the MOS config to fill in any
+ * information which might be missing. If we fail to validate
+ * the config then declare the pool unfit for use. If we're
+ * assembling a pool from a split, the log is not transferred
+ * over.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ nvlist_t *mos_config;
+ if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (!spa_config_valid(spa, mos_config)) {
+ nvlist_free(mos_config);
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+ ENXIO));
+ }
+ nvlist_free(mos_config);
+
+ /*
+ * Now that we've validated the config, check the state of the
+ * root vdev. If it can't be opened, it indicates one or
+ * more toplevel vdevs are faulted.
+ */
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (SET_ERROR(ENXIO));
+ }
+
+ /*
+ * Everything that we read before spa_remove_init() must be stored
+ * on concreted vdevs. Therefore we do this as early as possible.
+ */
+ if (spa_remove_init(spa) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
if (spa_version(spa) >= SPA_VERSION_FEATURES) {
boolean_t missing_feat_read = B_FALSE;
nvlist_t *unsup_feat, *enabled_feat;
if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
&spa->spa_feat_for_read_obj) != 0) {
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
}
if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
&spa->spa_feat_for_write_obj) != 0) {
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
}
if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
&spa->spa_feat_desc_obj) != 0) {
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
}
enabled_feat = fnvlist_alloc();
unsup_feat = fnvlist_alloc();
if (!spa_features_check(spa, B_FALSE,
unsup_feat, enabled_feat))
missing_feat_read = B_TRUE;
if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
if (!spa_features_check(spa, B_TRUE,
unsup_feat, enabled_feat)) {
missing_feat_write = B_TRUE;
}
}
fnvlist_add_nvlist(spa->spa_load_info,
ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
if (!nvlist_empty(unsup_feat)) {
fnvlist_add_nvlist(spa->spa_load_info,
ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
}
fnvlist_free(enabled_feat);
fnvlist_free(unsup_feat);
if (!missing_feat_read) {
fnvlist_add_boolean(spa->spa_load_info,
ZPOOL_CONFIG_CAN_RDONLY);
}
/*
* If the state is SPA_LOAD_TRYIMPORT, our objective is
* twofold: to determine whether the pool is available for
* import in read-write mode and (if it is not) whether the
* pool is available for import in read-only mode. If the pool
* is available for import in read-write mode, it is displayed
* as available in userland; if it is not available for import
* in read-only mode, it is displayed as unavailable in
* userland. If the pool is available for import in read-only
* mode but not read-write mode, it is displayed as unavailable
* in userland with a special note that the pool is actually
* available for open in read-only mode.
*
* As a result, if the state is SPA_LOAD_TRYIMPORT and we are
* missing a feature for write, we must first determine whether
* the pool can be opened read-only before returning to
* userland in order to know whether to display the
* abovementioned note.
*/
if (missing_feat_read || (missing_feat_write &&
spa_writeable(spa))) {
return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
ENOTSUP));
}
/*
* Load refcounts for ZFS features from disk into an in-memory
* cache during SPA initialization.
*/
for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
uint64_t refcount;
error = feature_get_refcount_from_disk(spa,
&spa_feature_table[i], &refcount);
if (error == 0) {
spa->spa_feat_refcount_cache[i] = refcount;
} else if (error == ENOTSUP) {
spa->spa_feat_refcount_cache[i] =
SPA_FEATURE_DISABLED;
} else {
return (spa_vdev_err(rvd,
VDEV_AUX_CORRUPT_DATA, EIO));
}
}
}
if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
&spa->spa_feat_enabled_txg_obj) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
}
spa->spa_is_initializing = B_TRUE;
error = dsl_pool_open(spa->spa_dsl_pool);
spa->spa_is_initializing = B_FALSE;
if (error != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- if (!mosconfig) {
+ if (!trust_config) {
uint64_t hostid;
- nvlist_t *policy = NULL, *nvconfig;
+ nvlist_t *policy = NULL;
+ nvlist_t *mos_config;
- if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
char *hostname;
unsigned long myhostid = 0;
- VERIFY(nvlist_lookup_string(nvconfig,
+ VERIFY(nvlist_lookup_string(mos_config,
ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
#ifdef _KERNEL
myhostid = zone_get_hostid(NULL);
#else /* _KERNEL */
/*
* We're emulating the system's hostid in userland, so
* we can't use zone_get_hostid().
*/
(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif /* _KERNEL */
if (check_hostid && hostid != 0 && myhostid != 0 &&
hostid != myhostid) {
- nvlist_free(nvconfig);
+ nvlist_free(mos_config);
cmn_err(CE_WARN, "pool '%s' could not be "
"loaded as it was last accessed by "
"another system (host: %s hostid: 0x%lx). "
"See: http://illumos.org/msg/ZFS-8000-EY",
spa_name(spa), hostname,
(unsigned long)hostid);
return (SET_ERROR(EBADF));
}
}
if (nvlist_lookup_nvlist(spa->spa_config,
ZPOOL_REWIND_POLICY, &policy) == 0)
- VERIFY(nvlist_add_nvlist(nvconfig,
+ VERIFY(nvlist_add_nvlist(mos_config,
ZPOOL_REWIND_POLICY, policy) == 0);
- spa_config_set(spa, nvconfig);
+ spa_config_set(spa, mos_config);
spa_unload(spa);
spa_deactivate(spa);
spa_activate(spa, orig_mode);
return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
}
/* Grab the secret checksum salt from the MOS. */
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_CHECKSUM_SALT, 1,
sizeof (spa->spa_cksum_salt.zcs_bytes),
spa->spa_cksum_salt.zcs_bytes);
if (error == ENOENT) {
/* Generate a new salt for subsequent use */
(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
sizeof (spa->spa_cksum_salt.zcs_bytes));
} else if (error != 0) {
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
}
if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
if (error != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the bit that tells us to use the new accounting function
* (raid-z deflation). If we have an older pool, this will not
* be present.
*/
error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
&spa->spa_creation_version);
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the persistent error log. If we have an older pool, this will
* not be present.
*/
error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
&spa->spa_errlog_scrub);
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the history object. If we have an older pool, this
* will not be present.
*/
error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the per-vdev ZAP map. If we have an older pool, this will not
* be present; in this case, defer its creation to a later time to
* avoid dirtying the MOS this early / out of sync context. See
* spa_sync_config_object.
*/
/* The sentinel is only available in the MOS config. */
nvlist_t *mos_config;
if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
&spa->spa_all_vdev_zaps);
if (error == ENOENT) {
VERIFY(!nvlist_exists(mos_config,
ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
} else if (error != 0) {
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
/*
* An older version of ZFS overwrote the sentinel value, so
* we have orphaned per-vdev ZAPs in the MOS. Defer their
* destruction to later; see spa_sync_config_object.
*/
spa->spa_avz_action = AVZ_ACTION_DESTROY;
/*
* We're assuming that no vdevs have had their ZAPs created
* before this. Better be sure of it.
*/
ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
}
nvlist_free(mos_config);
/*
* If we're assembling the pool from the split-off vdevs of
* an existing pool, we don't want to attach the spares & cache
* devices.
*/
/*
* Load any hot spares for this pool.
*/
error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
if (load_nvlist(spa, spa->spa_spares.sav_object,
&spa->spa_spares.sav_config) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_spares(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
} else if (error == 0) {
spa->spa_spares.sav_sync = B_TRUE;
}
/*
* Load any level 2 ARC devices for this pool.
*/
error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
&spa->spa_l2cache.sav_object);
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
if (load_nvlist(spa, spa->spa_l2cache.sav_object,
&spa->spa_l2cache.sav_config) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_l2cache(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
} else if (error == 0) {
spa->spa_l2cache.sav_sync = B_TRUE;
}
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
if (error && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
if (error == 0) {
uint64_t autoreplace;
spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
spa_prop_find(spa, ZPOOL_PROP_BOOTSIZE, &spa->spa_bootsize);
spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
&spa->spa_dedup_ditto);
spa->spa_autoreplace = (autoreplace != 0);
}
/*
* If the 'autoreplace' property is set, then post a resource notifying
* the ZFS DE that it should not issue any faults for unopenable
* devices. We also iterate over the vdevs, and post a sysevent for any
* unopenable vdevs so that the normal autoreplace handler can take
* over.
*/
if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
spa_check_removed(spa->spa_root_vdev);
/*
* For the import case, this is done in spa_import(), because
* at this point we're using the spare definitions from
* the MOS config, not necessarily from the userland config.
*/
if (state != SPA_LOAD_IMPORT) {
spa_aux_check_removed(&spa->spa_spares);
spa_aux_check_removed(&spa->spa_l2cache);
}
}
/*
* Load the vdev state for all toplevel vdevs.
*/
- vdev_load(rvd);
+ error = vdev_load(rvd);
+ if (error != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+ error = spa_condense_init(spa);
+ if (error != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
/*
* Propagate the leaf DTLs we just loaded all the way up the tree.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
spa_config_exit(spa, SCL_ALL, FTAG);
/*
* Load the DDTs (dedup tables).
*/
error = ddt_load(spa);
if (error != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa_update_dspace(spa);
- /*
- * Validate the config, using the MOS config to fill in any
- * information which might be missing. If we fail to validate
- * the config then declare the pool unfit for use. If we're
- * assembling a pool from a split, the log is not transferred
- * over.
- */
- if (type != SPA_IMPORT_ASSEMBLE) {
- nvlist_t *nvconfig;
-
- if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-
- if (!spa_config_valid(spa, nvconfig)) {
- nvlist_free(nvconfig);
- return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
- ENXIO));
- }
- nvlist_free(nvconfig);
-
- /*
- * Now that we've validated the config, check the state of the
- * root vdev. If it can't be opened, it indicates one or
- * more toplevel vdevs are faulted.
- */
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
- return (SET_ERROR(ENXIO));
-
- if (spa_writeable(spa) && spa_check_logs(spa)) {
- *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
- return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
- }
+ if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa) &&
+ spa_check_logs(spa)) {
+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
}
if (missing_feat_write) {
ASSERT(state == SPA_LOAD_TRYIMPORT);
/*
* At this point, we know that we can open the pool in
* read-only mode but not read-write mode. We now have enough
* information and can return to userland.
*/
return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
}
/*
* We've successfully opened the pool, verify that we're ready
* to start pushing transactions.
*/
if (state != SPA_LOAD_TRYIMPORT) {
if (error = spa_load_verify(spa))
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
error));
}
if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
spa->spa_load_max_txg == UINT64_MAX)) {
dmu_tx_t *tx;
int need_update = B_FALSE;
dsl_pool_t *dp = spa_get_dsl(spa);
+ /*
+ * We must check this before we start the sync thread, because
+ * we only want to start a condense thread for condense
+ * operations that were in progress when the pool was
+ * imported. Once we start syncing, spa_sync() could
+ * initiate a condense (and start a thread for it). In
+ * that case it would be wrong to start a second
+ * condense thread.
+ */
+ boolean_t condense_in_progress =
+ (spa->spa_condensing_indirect != NULL);
+
ASSERT(state != SPA_LOAD_TRYIMPORT);
/*
* Claim log blocks that haven't been committed yet.
* This must all happen in a single txg.
* Note: spa_claim_max_txg is updated by spa_claim_notify(),
* invoked from zil_claim_log_block()'s i/o done callback.
* Price of rollback is that we abandon the log.
*/
spa->spa_claiming = B_TRUE;
tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
zil_claim, tx, DS_FIND_CHILDREN);
dmu_tx_commit(tx);
spa->spa_claiming = B_FALSE;
spa_set_log_state(spa, SPA_LOG_GOOD);
spa->spa_sync_on = B_TRUE;
txg_sync_start(spa->spa_dsl_pool);
/*
* Wait for all claims to sync. We sync up to the highest
* claimed log block birth time so that claimed log blocks
* don't appear to be from the future. spa_claim_max_txg
* will have been set for us by either zil_check_log_chain()
* (invoked from spa_check_logs()) or zil_claim() above.
*/
txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
/*
* If the config cache is stale, or we have uninitialized
* metaslabs (see spa_vdev_add()), then update the config.
*
* If this is a verbatim import, trust the current
* in-core spa_config and update the disk labels.
*/
if (config_cache_txg != spa->spa_config_txg ||
state == SPA_LOAD_IMPORT ||
state == SPA_LOAD_RECOVER ||
(spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
need_update = B_TRUE;
for (int c = 0; c < rvd->vdev_children; c++)
if (rvd->vdev_child[c]->vdev_ms_array == 0)
need_update = B_TRUE;
/*
* Update the config cache asychronously in case we're the
* root pool, in which case the config cache isn't writable yet.
*/
if (need_update)
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
/*
* Check all DTLs to see if anything needs resilvering.
*/
if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
vdev_resilver_needed(rvd, NULL, NULL))
spa_async_request(spa, SPA_ASYNC_RESILVER);
/*
* Log the fact that we booted up (so that we can detect if
* we rebooted in the middle of an operation).
*/
spa_history_log_version(spa, "open");
/*
* Delete any inconsistent datasets.
*/
(void) dmu_objset_find(spa_name(spa),
dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
/*
* Clean up any stale temporary dataset userrefs.
*/
dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+
+ /*
+ * Note: unlike condensing, we don't need an analogous
+ * "removal_in_progress" dance because no other thread
+ * can start a removal while we hold the spa_namespace_lock.
+ */
+ spa_restart_removal(spa);
+
+ if (condense_in_progress)
+ spa_condense_indirect_restart(spa);
}
return (0);
}
static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
int mode = spa->spa_mode;
spa_unload(spa);
spa_deactivate(spa);
spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
spa_activate(spa, mode);
spa_async_suspend(spa);
return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
}
/*
* If spa_load() fails this function will try loading prior txg's. If
* 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
* will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
* function will not rewind the pool and will return the same error as
* spa_load().
*/
static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
uint64_t max_request, int rewind_flags)
{
nvlist_t *loadinfo = NULL;
nvlist_t *config = NULL;
int load_error, rewind_error;
uint64_t safe_rewind_txg;
uint64_t min_txg;
if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
spa->spa_load_max_txg = spa->spa_load_txg;
spa_set_log_state(spa, SPA_LOG_CLEAR);
} else {
spa->spa_load_max_txg = max_request;
if (max_request != UINT64_MAX)
spa->spa_extreme_rewind = B_TRUE;
}
load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
mosconfig);
if (load_error == 0)
return (0);
if (spa->spa_root_vdev != NULL)
config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
if (rewind_flags & ZPOOL_NEVER_REWIND) {
nvlist_free(config);
return (load_error);
}
if (state == SPA_LOAD_RECOVER) {
/* Price of rolling back is discarding txgs, including log */
spa_set_log_state(spa, SPA_LOG_CLEAR);
} else {
/*
* If we aren't rolling back save the load info from our first
* import attempt so that we can restore it after attempting
* to rewind.
*/
loadinfo = spa->spa_load_info;
spa->spa_load_info = fnvlist_alloc();
}
spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
TXG_INITIAL : safe_rewind_txg;
/*
* Continue as long as we're finding errors, we're still within
* the acceptable rewind range, and we're still finding uberblocks
*/
while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
if (spa->spa_load_max_txg < safe_rewind_txg)
spa->spa_extreme_rewind = B_TRUE;
rewind_error = spa_load_retry(spa, state, mosconfig);
}
spa->spa_extreme_rewind = B_FALSE;
spa->spa_load_max_txg = UINT64_MAX;
if (config && (rewind_error || state != SPA_LOAD_RECOVER))
spa_config_set(spa, config);
else
nvlist_free(config);
if (state == SPA_LOAD_RECOVER) {
ASSERT3P(loadinfo, ==, NULL);
return (rewind_error);
} else {
/* Store the rewind info as part of the initial load info */
fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
spa->spa_load_info);
/* Restore the initial load info */
fnvlist_free(spa->spa_load_info);
spa->spa_load_info = loadinfo;
return (load_error);
}
}
/*
* Pool Open/Import
*
* The import case is identical to an open except that the configuration is sent
* down from userland, instead of grabbed from the configuration cache. For the
* case of an open, the pool configuration will exist in the
* POOL_STATE_UNINITIALIZED state.
*
* The stats information (gen/count/ustats) is used to gather vdev statistics at
* the same time open the pool, without having to keep around the spa_t in some
* ambiguous state.
*/
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
nvlist_t **config)
{
spa_t *spa;
spa_load_state_t state = SPA_LOAD_OPEN;
int error;
int locked = B_FALSE;
int firstopen = B_FALSE;
*spapp = NULL;
/*
* As disgusting as this is, we need to support recursive calls to this
* function because dsl_dir_open() is called during spa_load(), and ends
* up calling spa_open() again. The real fix is to figure out how to
* avoid dsl_dir_open() calling this in the first place.
*/
if (mutex_owner(&spa_namespace_lock) != curthread) {
mutex_enter(&spa_namespace_lock);
locked = B_TRUE;
}
if ((spa = spa_lookup(pool)) == NULL) {
if (locked)
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(ENOENT));
}
if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
zpool_rewind_policy_t policy;
firstopen = B_TRUE;
zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
&policy);
if (policy.zrp_request & ZPOOL_DO_REWIND)
state = SPA_LOAD_RECOVER;
spa_activate(spa, spa_mode_global);
if (state != SPA_LOAD_RECOVER)
spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
policy.zrp_request);
if (error == EBADF) {
/*
* If vdev_validate() returns failure (indicated by
* EBADF), it indicates that one of the vdevs indicates
* that the pool has been exported or destroyed. If
* this is the case, the config cache is out of sync and
* we should remove the pool from the namespace.
*/
spa_unload(spa);
spa_deactivate(spa);
- spa_config_sync(spa, B_TRUE, B_TRUE);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
spa_remove(spa);
if (locked)
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(ENOENT));
}
if (error) {
/*
* We can't open the pool, but we still have useful
* information: the state of each vdev after the
* attempted vdev_open(). Return this to the user.
*/
if (config != NULL && spa->spa_config) {
VERIFY(nvlist_dup(spa->spa_config, config,
KM_SLEEP) == 0);
VERIFY(nvlist_add_nvlist(*config,
ZPOOL_CONFIG_LOAD_INFO,
spa->spa_load_info) == 0);
}
spa_unload(spa);
spa_deactivate(spa);
spa->spa_last_open_failed = error;
if (locked)
mutex_exit(&spa_namespace_lock);
*spapp = NULL;
return (error);
}
}
spa_open_ref(spa, tag);
if (config != NULL)
*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
/*
* If we've recovered the pool, pass back any information we
* gathered while doing the load.
*/
if (state == SPA_LOAD_RECOVER) {
VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
spa->spa_load_info) == 0);
}
if (locked) {
spa->spa_last_open_failed = 0;
spa->spa_last_ubsync_txg = 0;
spa->spa_load_txg = 0;
mutex_exit(&spa_namespace_lock);
#ifdef __FreeBSD__
#ifdef _KERNEL
if (firstopen)
zvol_create_minors(spa->spa_name);
#endif
#endif
}
*spapp = spa;
return (0);
}
int
spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
nvlist_t **config)
{
return (spa_open_common(name, spapp, tag, policy, config));
}
int
spa_open(const char *name, spa_t **spapp, void *tag)
{
return (spa_open_common(name, spapp, tag, NULL, NULL));
}
/*
* Lookup the given spa_t, incrementing the inject count in the process,
* preventing it from being exported or destroyed.
*/
spa_t *
spa_inject_addref(char *name)
{
spa_t *spa;
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(name)) == NULL) {
mutex_exit(&spa_namespace_lock);
return (NULL);
}
spa->spa_inject_ref++;
mutex_exit(&spa_namespace_lock);
return (spa);
}
void
spa_inject_delref(spa_t *spa)
{
mutex_enter(&spa_namespace_lock);
spa->spa_inject_ref--;
mutex_exit(&spa_namespace_lock);
}
/*
* Add spares device information to the nvlist.
*/
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
nvlist_t **spares;
uint_t i, nspares;
nvlist_t *nvroot;
uint64_t guid;
vdev_stat_t *vs;
uint_t vsc;
uint64_t pool;
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
if (spa->spa_spares.sav_count == 0)
return;
VERIFY(nvlist_lookup_nvlist(config,
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
if (nspares != 0) {
VERIFY(nvlist_add_nvlist_array(nvroot,
ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
VERIFY(nvlist_lookup_nvlist_array(nvroot,
ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
/*
* Go through and find any spares which have since been
* repurposed as an active spare. If this is the case, update
* their status appropriately.
*/
for (i = 0; i < nspares; i++) {
VERIFY(nvlist_lookup_uint64(spares[i],
ZPOOL_CONFIG_GUID, &guid) == 0);
if (spa_spare_exists(guid, &pool, NULL) &&
pool != 0ULL) {
VERIFY(nvlist_lookup_uint64_array(
spares[i], ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
vs->vs_state = VDEV_STATE_CANT_OPEN;
vs->vs_aux = VDEV_AUX_SPARED;
}
}
}
}
/*
* Add l2cache device information to the nvlist, including vdev stats.
*/
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
nvlist_t **l2cache;
uint_t i, j, nl2cache;
nvlist_t *nvroot;
uint64_t guid;
vdev_t *vd;
vdev_stat_t *vs;
uint_t vsc;
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
if (spa->spa_l2cache.sav_count == 0)
return;
VERIFY(nvlist_lookup_nvlist(config,
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
if (nl2cache != 0) {
VERIFY(nvlist_add_nvlist_array(nvroot,
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
VERIFY(nvlist_lookup_nvlist_array(nvroot,
ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
/*
* Update level 2 cache device stats.
*/
for (i = 0; i < nl2cache; i++) {
VERIFY(nvlist_lookup_uint64(l2cache[i],
ZPOOL_CONFIG_GUID, &guid) == 0);
vd = NULL;
for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
if (guid ==
spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
vd = spa->spa_l2cache.sav_vdevs[j];
break;
}
}
ASSERT(vd != NULL);
VERIFY(nvlist_lookup_uint64_array(l2cache[i],
ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
== 0);
vdev_get_stats(vd, vs);
}
}
}
static void
spa_add_feature_stats(spa_t *spa, nvlist_t *config)
{
nvlist_t *features;
zap_cursor_t zc;
zap_attribute_t za;
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
/* We may be unable to read features if pool is suspended. */
if (spa_suspended(spa))
goto out;
if (spa->spa_feat_for_read_obj != 0) {
for (zap_cursor_init(&zc, spa->spa_meta_objset,
spa->spa_feat_for_read_obj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
ASSERT(za.za_integer_length == sizeof (uint64_t) &&
za.za_num_integers == 1);
VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
za.za_first_integer));
}
zap_cursor_fini(&zc);
}
if (spa->spa_feat_for_write_obj != 0) {
for (zap_cursor_init(&zc, spa->spa_meta_objset,
spa->spa_feat_for_write_obj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
ASSERT(za.za_integer_length == sizeof (uint64_t) &&
za.za_num_integers == 1);
VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
za.za_first_integer));
}
zap_cursor_fini(&zc);
}
out:
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
features) == 0);
nvlist_free(features);
}
int
spa_get_stats(const char *name, nvlist_t **config,
char *altroot, size_t buflen)
{
int error;
spa_t *spa;
*config = NULL;
error = spa_open_common(name, &spa, FTAG, NULL, config);
if (spa != NULL) {
/*
* This still leaves a window of inconsistency where the spares
* or l2cache devices could change and the config would be
* self-inconsistent.
*/
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
if (*config != NULL) {
uint64_t loadtimes[2];
loadtimes[0] = spa->spa_loaded_ts.tv_sec;
loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
VERIFY(nvlist_add_uint64_array(*config,
ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
VERIFY(nvlist_add_uint64(*config,
ZPOOL_CONFIG_ERRCOUNT,
spa_get_errlog_size(spa)) == 0);
if (spa_suspended(spa))
VERIFY(nvlist_add_uint64(*config,
ZPOOL_CONFIG_SUSPENDED,
spa->spa_failmode) == 0);
spa_add_spares(spa, *config);
spa_add_l2cache(spa, *config);
spa_add_feature_stats(spa, *config);
}
}
/*
* We want to get the alternate root even for faulted pools, so we cheat
* and call spa_lookup() directly.
*/
if (altroot) {
if (spa == NULL) {
mutex_enter(&spa_namespace_lock);
spa = spa_lookup(name);
if (spa)
spa_altroot(spa, altroot, buflen);
else
altroot[0] = '\0';
spa = NULL;
mutex_exit(&spa_namespace_lock);
} else {
spa_altroot(spa, altroot, buflen);
}
}
if (spa != NULL) {
spa_config_exit(spa, SCL_CONFIG, FTAG);
spa_close(spa, FTAG);
}
return (error);
}
/*
* Validate that the auxiliary device array is well formed. We must have an
* array of nvlists, each which describes a valid leaf vdev. If this is an
* import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
* specified, as long as they are well-formed.
*/
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
spa_aux_vdev_t *sav, const char *config, uint64_t version,
vdev_labeltype_t label)
{
nvlist_t **dev;
uint_t i, ndev;
vdev_t *vd;
int error;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
/*
* It's acceptable to have no devs specified.
*/
if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
return (0);
if (ndev == 0)
return (SET_ERROR(EINVAL));
/*
* Make sure the pool is formatted with a version that supports this
* device type.
*/
if (spa_version(spa) < version)
return (SET_ERROR(ENOTSUP));
/*
* Set the pending device list so we correctly handle device in-use
* checking.
*/
sav->sav_pending = dev;
sav->sav_npending = ndev;
for (i = 0; i < ndev; i++) {
if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
mode)) != 0)
goto out;
if (!vd->vdev_ops->vdev_op_leaf) {
vdev_free(vd);
error = SET_ERROR(EINVAL);
goto out;
}
/*
* The L2ARC currently only supports disk devices in
* kernel context. For user-level testing, we allow it.
*/
#ifdef _KERNEL
if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
error = SET_ERROR(ENOTBLK);
vdev_free(vd);
goto out;
}
#endif
vd->vdev_top = vd;
if ((error = vdev_open(vd)) == 0 &&
(error = vdev_label_init(vd, crtxg, label)) == 0) {
VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
}
vdev_free(vd);
if (error &&
(mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
goto out;
else
error = 0;
}
out:
sav->sav_pending = NULL;
sav->sav_npending = 0;
return (error);
}
static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
int error;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
&spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
VDEV_LABEL_SPARE)) != 0) {
return (error);
}
return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
&spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
VDEV_LABEL_L2CACHE));
}
static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
const char *config)
{
int i;
if (sav->sav_config != NULL) {
nvlist_t **olddevs;
uint_t oldndevs;
nvlist_t **newdevs;
/*
* Generate new dev list by concatentating with the
* current dev list.
*/
VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
&olddevs, &oldndevs) == 0);
newdevs = kmem_alloc(sizeof (void *) *
(ndevs + oldndevs), KM_SLEEP);
for (i = 0; i < oldndevs; i++)
VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
KM_SLEEP) == 0);
for (i = 0; i < ndevs; i++)
VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
KM_SLEEP) == 0);
VERIFY(nvlist_remove(sav->sav_config, config,
DATA_TYPE_NVLIST_ARRAY) == 0);
VERIFY(nvlist_add_nvlist_array(sav->sav_config,
config, newdevs, ndevs + oldndevs) == 0);
for (i = 0; i < oldndevs + ndevs; i++)
nvlist_free(newdevs[i]);
kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
} else {
/*
* Generate a new dev list.
*/
VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
KM_SLEEP) == 0);
VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
devs, ndevs) == 0);
}
}
/*
* Stop and drop level 2 ARC devices
*/
void
spa_l2cache_drop(spa_t *spa)
{
vdev_t *vd;
int i;
spa_aux_vdev_t *sav = &spa->spa_l2cache;
for (i = 0; i < sav->sav_count; i++) {
uint64_t pool;
vd = sav->sav_vdevs[i];
ASSERT(vd != NULL);
if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
pool != 0ULL && l2arc_vdev_present(vd))
l2arc_remove_vdev(vd);
}
}
/*
* Pool Creation
*/
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
nvlist_t *zplprops)
{
spa_t *spa;
char *altroot = NULL;
vdev_t *rvd;
dsl_pool_t *dp;
dmu_tx_t *tx;
int error = 0;
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
uint64_t version, obj;
boolean_t has_features;
/*
* If this pool already exists, return failure.
*/
mutex_enter(&spa_namespace_lock);
if (spa_lookup(pool) != NULL) {
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EEXIST));
}
/*
* Allocate a new spa_t structure.
*/
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
spa = spa_add(pool, NULL, altroot);
spa_activate(spa, spa_mode_global);
if (props && (error = spa_prop_validate(spa, props))) {
spa_deactivate(spa);
spa_remove(spa);
mutex_exit(&spa_namespace_lock);
return (error);
}
has_features = B_FALSE;
for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
if (zpool_prop_feature(nvpair_name(elem)))
has_features = B_TRUE;
}
if (has_features || nvlist_lookup_uint64(props,
zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
version = SPA_VERSION;
}
ASSERT(SPA_VERSION_IS_SUPPORTED(version));
spa->spa_first_txg = txg;
spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_load_state = SPA_LOAD_CREATE;
+ spa->spa_removing_phys.sr_state = DSS_NONE;
+ spa->spa_removing_phys.sr_removing_vdev = -1;
+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
/*
* Create "The Godfather" zio to hold all async IOs
*/
spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
KM_SLEEP);
for (int i = 0; i < max_ncpus; i++) {
spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
ZIO_FLAG_GODFATHER);
}
/*
* Create the root vdev.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
ASSERT(error != 0 || rvd != NULL);
ASSERT(error != 0 || spa->spa_root_vdev == rvd);
if (error == 0 && !zfs_allocatable_devs(nvroot))
error = SET_ERROR(EINVAL);
if (error == 0 &&
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_ashift_optimize(rvd->vdev_child[c]);
vdev_metaslab_set_size(rvd->vdev_child[c]);
vdev_expand(rvd->vdev_child[c], txg);
}
}
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0) {
spa_unload(spa);
spa_deactivate(spa);
spa_remove(spa);
mutex_exit(&spa_namespace_lock);
return (error);
}
/*
* Get the list of spares, if specified.
*/
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) == 0) {
VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
KM_SLEEP) == 0);
VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_spares(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
spa->spa_spares.sav_sync = B_TRUE;
}
/*
* Get the list of level 2 cache devices, if specified.
*/
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache) == 0) {
VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_l2cache(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
spa->spa_l2cache.sav_sync = B_TRUE;
}
spa->spa_is_initializing = B_TRUE;
spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
spa->spa_meta_objset = dp->dp_meta_objset;
spa->spa_is_initializing = B_FALSE;
/*
* Create DDTs (dedup tables).
*/
ddt_create(spa);
spa_update_dspace(spa);
tx = dmu_tx_create_assigned(dp, txg);
/*
* Create the pool config object.
*/
spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
cmn_err(CE_PANIC, "failed to add pool config");
}
if (spa_version(spa) >= SPA_VERSION_FEATURES)
spa_feature_create_zap_objects(spa, tx);
if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
sizeof (uint64_t), 1, &version, tx) != 0) {
cmn_err(CE_PANIC, "failed to add pool version");
}
/* Newly created pools with the right version are always deflated. */
if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
spa->spa_deflate = TRUE;
if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
cmn_err(CE_PANIC, "failed to add deflate");
}
}
/*
* Create the deferred-free bpobj. Turn off compression
* because sync-to-convergence takes longer if the blocksize
* keeps changing.
*/
obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
dmu_object_set_compress(spa->spa_meta_objset, obj,
ZIO_COMPRESS_OFF, tx);
if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
sizeof (uint64_t), 1, &obj, tx) != 0) {
cmn_err(CE_PANIC, "failed to add bpobj");
}
VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
spa->spa_meta_objset, obj));
/*
* Create the pool's history object.
*/
if (version >= SPA_VERSION_ZPOOL_HISTORY)
spa_history_create_obj(spa, tx);
/*
* Generate some random noise for salted checksums to operate on.
*/
(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
sizeof (spa->spa_cksum_salt.zcs_bytes));
/*
* Set pool properties.
*/
spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
spa_sync_props(props, tx);
}
dmu_tx_commit(tx);
spa->spa_sync_on = B_TRUE;
txg_sync_start(spa->spa_dsl_pool);
/*
* We explicitly wait for the first transaction to complete so that our
* bean counters are appropriately updated.
*/
txg_wait_synced(spa->spa_dsl_pool, txg);
- spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
spa_history_log_version(spa, "create");
/*
* Don't count references from objsets that are already closed
* and are making their way through the eviction process.
*/
spa_evicting_os_wait(spa);
spa->spa_minref = refcount_count(&spa->spa_refcount);
spa->spa_load_state = SPA_LOAD_NONE;
mutex_exit(&spa_namespace_lock);
return (0);
}
#ifdef _KERNEL
#ifdef illumos
/*
* Get the root pool information from the root disk, then import the root pool
* during the system boot up time.
*/
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
nvlist_t *config;
nvlist_t *nvtop, *nvroot;
uint64_t pgid;
if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
return (NULL);
/*
* Add this top-level vdev to the child array.
*/
VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvtop) == 0);
VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
&pgid) == 0);
VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
/*
* Put this pool's top-level vdevs into a root vdev.
*/
VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_ROOT) == 0);
VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&nvtop, 1) == 0);
/*
* Replace the existing vdev_tree with the new root vdev in
* this pool's configuration (remove the old, add the new).
*/
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
return (config);
}
/*
* Walk the vdev tree and see if we can find a device with "better"
* configuration. A configuration is "better" if the label on that
* device has a more recent txg.
*/
static void
spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
for (int c = 0; c < vd->vdev_children; c++)
spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
if (vd->vdev_ops->vdev_op_leaf) {
nvlist_t *label;
uint64_t label_txg;
if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
&label) != 0)
return;
VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
&label_txg) == 0);
/*
* Do we have a better boot device?
*/
if (label_txg > *txg) {
*txg = label_txg;
*avd = vd;
}
nvlist_free(label);
}
}
/*
* Import a root pool.
*
* For x86. devpath_list will consist of devid and/or physpath name of
* the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
* The GRUB "findroot" command will return the vdev we should boot.
*
* For Sparc, devpath_list consists the physpath name of the booting device
* no matter the rootpool is a single device pool or a mirrored pool.
* e.g.
* "/pci@1f,0/ide@d/disk@0,0:a"
*/
int
spa_import_rootpool(char *devpath, char *devid)
{
spa_t *spa;
vdev_t *rvd, *bvd, *avd = NULL;
nvlist_t *config, *nvtop;
uint64_t guid, txg;
char *pname;
int error;
/*
* Read the label from the boot device and generate a configuration.
*/
config = spa_generate_rootconf(devpath, devid, &guid);
#if defined(_OBP) && defined(_KERNEL)
if (config == NULL) {
if (strstr(devpath, "/iscsi/ssd") != NULL) {
/* iscsi boot */
get_iscsi_bootpath_phy(devpath);
config = spa_generate_rootconf(devpath, devid, &guid);
}
}
#endif
if (config == NULL) {
cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
devpath);
return (SET_ERROR(EIO));
}
VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
&pname) == 0);
VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(pname)) != NULL) {
/*
* Remove the existing root pool from the namespace so that we
* can replace it with the correct config we just read in.
*/
spa_remove(spa);
}
spa = spa_add(pname, config, NULL);
spa->spa_is_root = B_TRUE;
spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&spa->spa_ubsync.ub_version) != 0)
spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
/*
* Build up a vdev tree based on the boot device's label config.
*/
VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvtop) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
VDEV_ALLOC_ROOTPOOL);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error) {
mutex_exit(&spa_namespace_lock);
nvlist_free(config);
cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
pname);
return (error);
}
/*
* Get the boot vdev.
*/
if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
(u_longlong_t)guid);
error = SET_ERROR(ENOENT);
goto out;
}
/*
* Determine if there is a better boot device.
*/
avd = bvd;
spa_alt_rootvdev(rvd, &avd, &txg);
if (avd != bvd) {
cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
"try booting from '%s'", avd->vdev_path);
error = SET_ERROR(EINVAL);
goto out;
}
/*
* If the boot device is part of a spare vdev then ensure that
* we're booting off the active spare.
*/
if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
!bvd->vdev_isspare) {
cmn_err(CE_NOTE, "The boot device is currently spared. Please "
"try booting from '%s'",
bvd->vdev_parent->
vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
error = SET_ERROR(EINVAL);
goto out;
}
error = 0;
out:
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
vdev_free(rvd);
spa_config_exit(spa, SCL_ALL, FTAG);
mutex_exit(&spa_namespace_lock);
nvlist_free(config);
return (error);
}
#else /* !illumos */
extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
uint64_t *count);
static nvlist_t *
spa_generate_rootconf(const char *name)
{
nvlist_t **configs, **tops;
nvlist_t *config;
nvlist_t *best_cfg, *nvtop, *nvroot;
uint64_t *holes;
uint64_t best_txg;
uint64_t nchildren;
uint64_t pgid;
uint64_t count;
uint64_t i;
uint_t nholes;
if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
return (NULL);
ASSERT3U(count, !=, 0);
best_txg = 0;
for (i = 0; i < count; i++) {
uint64_t txg;
VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
&txg) == 0);
if (txg > best_txg) {
best_txg = txg;
best_cfg = configs[i];
}
}
nchildren = 1;
nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
holes = NULL;
nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
&holes, &nholes);
tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
for (i = 0; i < nchildren; i++) {
if (i >= count)
break;
if (configs[i] == NULL)
continue;
VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
&nvtop) == 0);
nvlist_dup(nvtop, &tops[i], KM_SLEEP);
}
for (i = 0; holes != NULL && i < nholes; i++) {
if (i >= nchildren)
continue;
if (tops[holes[i]] != NULL)
continue;
nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
VDEV_TYPE_HOLE) == 0);
VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
holes[i]) == 0);
VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
0) == 0);
}
for (i = 0; i < nchildren; i++) {
if (tops[i] != NULL)
continue;
nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
VDEV_TYPE_MISSING) == 0);
VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
i) == 0);
VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
0) == 0);
}
/*
* Create pool config based on the best vdev config.
*/
nvlist_dup(best_cfg, &config, KM_SLEEP);
/*
* Put this pool's top-level vdevs into a root vdev.
*/
VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
&pgid) == 0);
VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_ROOT) == 0);
VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
tops, nchildren) == 0);
/*
* Replace the existing vdev_tree with the new root vdev in
* this pool's configuration (remove the old, add the new).
*/
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
/*
* Drop vdev config elements that should not be present at pool level.
*/
nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
for (i = 0; i < count; i++)
nvlist_free(configs[i]);
kmem_free(configs, count * sizeof(void *));
for (i = 0; i < nchildren; i++)
nvlist_free(tops[i]);
kmem_free(tops, nchildren * sizeof(void *));
nvlist_free(nvroot);
return (config);
}
int
spa_import_rootpool(const char *name)
{
spa_t *spa;
vdev_t *rvd, *bvd, *avd = NULL;
nvlist_t *config, *nvtop;
uint64_t txg;
char *pname;
int error;
/*
* Read the label from the boot device and generate a configuration.
*/
config = spa_generate_rootconf(name);
mutex_enter(&spa_namespace_lock);
if (config != NULL) {
VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
&pname) == 0 && strcmp(name, pname) == 0);
VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
== 0);
if ((spa = spa_lookup(pname)) != NULL) {
/*
* The pool could already be imported,
* e.g., after reboot -r.
*/
if (spa->spa_state == POOL_STATE_ACTIVE) {
mutex_exit(&spa_namespace_lock);
nvlist_free(config);
return (0);
}
/*
* Remove the existing root pool from the namespace so
* that we can replace it with the correct config
* we just read in.
*/
spa_remove(spa);
}
spa = spa_add(pname, config, NULL);
/*
* Set spa_ubsync.ub_version as it can be used in vdev_alloc()
* via spa_version().
*/
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&spa->spa_ubsync.ub_version) != 0)
spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
} else if ((spa = spa_lookup(name)) == NULL) {
mutex_exit(&spa_namespace_lock);
nvlist_free(config);
cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
name);
return (EIO);
} else {
VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
}
spa->spa_is_root = B_TRUE;
spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
/*
* Build up a vdev tree based on the boot device's label config.
*/
VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvtop) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
VDEV_ALLOC_ROOTPOOL);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error) {
mutex_exit(&spa_namespace_lock);
nvlist_free(config);
cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
pname);
return (error);
}
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
vdev_free(rvd);
spa_config_exit(spa, SCL_ALL, FTAG);
mutex_exit(&spa_namespace_lock);
nvlist_free(config);
return (0);
}
#endif /* illumos */
#endif /* _KERNEL */
/*
* Import a non-root pool into the system.
*/
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
{
spa_t *spa;
char *altroot = NULL;
spa_load_state_t state = SPA_LOAD_IMPORT;
zpool_rewind_policy_t policy;
uint64_t mode = spa_mode_global;
uint64_t readonly = B_FALSE;
int error;
nvlist_t *nvroot;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
/*
* If a pool with this name exists, return failure.
*/
mutex_enter(&spa_namespace_lock);
if (spa_lookup(pool) != NULL) {
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EEXIST));
}
/*
* Create and initialize the spa structure.
*/
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
(void) nvlist_lookup_uint64(props,
zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
if (readonly)
mode = FREAD;
spa = spa_add(pool, config, altroot);
spa->spa_import_flags = flags;
/*
* Verbatim import - Take a pool and insert it into the namespace
* as if it had been loaded at boot.
*/
if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
if (props != NULL)
spa_configfile_set(spa, props, B_FALSE);
- spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
mutex_exit(&spa_namespace_lock);
return (0);
}
spa_activate(spa, mode);
/*
* Don't start async tasks until we know everything is healthy.
*/
spa_async_suspend(spa);
zpool_get_rewind_policy(config, &policy);
if (policy.zrp_request & ZPOOL_DO_REWIND)
state = SPA_LOAD_RECOVER;
/*
* Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
* because the user-supplied config is actually the one to trust when
* doing an import.
*/
if (state != SPA_LOAD_RECOVER)
spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
policy.zrp_request);
/*
* Propagate anything learned while loading the pool and pass it
* back to caller (i.e. rewind info, missing devices, etc).
*/
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
spa->spa_load_info) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
* Toss any existing sparelist, as it doesn't have any validity
* anymore, and conflicts with spa_has_spare().
*/
if (spa->spa_spares.sav_config) {
nvlist_free(spa->spa_spares.sav_config);
spa->spa_spares.sav_config = NULL;
spa_load_spares(spa);
}
if (spa->spa_l2cache.sav_config) {
nvlist_free(spa->spa_l2cache.sav_config);
spa->spa_l2cache.sav_config = NULL;
spa_load_l2cache(spa);
}
VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
if (error == 0)
error = spa_validate_aux(spa, nvroot, -1ULL,
VDEV_ALLOC_SPARE);
if (error == 0)
error = spa_validate_aux(spa, nvroot, -1ULL,
VDEV_ALLOC_L2CACHE);
spa_config_exit(spa, SCL_ALL, FTAG);
if (props != NULL)
spa_configfile_set(spa, props, B_FALSE);
if (error != 0 || (props && spa_writeable(spa) &&
(error = spa_prop_set(spa, props)))) {
spa_unload(spa);
spa_deactivate(spa);
spa_remove(spa);
mutex_exit(&spa_namespace_lock);
return (error);
}
spa_async_resume(spa);
/*
* Override any spares and level 2 cache devices as specified by
* the user, as these may have correct device names/devids, etc.
*/
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) == 0) {
if (spa->spa_spares.sav_config)
VERIFY(nvlist_remove(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
else
VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_spares(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
spa->spa_spares.sav_sync = B_TRUE;
}
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache) == 0) {
if (spa->spa_l2cache.sav_config)
VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
else
VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_l2cache(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
spa->spa_l2cache.sav_sync = B_TRUE;
}
/*
* Check for any removed devices.
*/
if (spa->spa_autoreplace) {
spa_aux_check_removed(&spa->spa_spares);
spa_aux_check_removed(&spa->spa_l2cache);
}
if (spa_writeable(spa)) {
/*
* Update the config cache to include the newly-imported pool.
*/
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
}
/*
* It's possible that the pool was expanded while it was exported.
* We kick off an async task to handle this for us.
*/
spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
spa_history_log_version(spa, "import");
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
mutex_exit(&spa_namespace_lock);
#ifdef __FreeBSD__
#ifdef _KERNEL
zvol_create_minors(pool);
#endif
#endif
return (0);
}
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
nvlist_t *config = NULL;
char *poolname;
spa_t *spa;
uint64_t state;
int error;
if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
return (NULL);
if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
return (NULL);
/*
* Create and initialize the spa structure.
*/
mutex_enter(&spa_namespace_lock);
spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
spa_activate(spa, FREAD);
/*
* Pass off the heavy lifting to spa_load().
* Pass TRUE for mosconfig because the user-supplied config
* is actually the one to trust when doing an import.
*/
error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
/*
* If 'tryconfig' was at least parsable, return the current config.
*/
if (spa->spa_root_vdev != NULL) {
config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
poolname) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
state) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
spa->spa_uberblock.ub_timestamp) == 0);
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
spa->spa_load_info) == 0);
/*
* If the bootfs property exists on this pool then we
* copy it out so that external consumers can tell which
* pools are bootable.
*/
if ((!error || error == EEXIST) && spa->spa_bootfs) {
char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
/*
* We have to play games with the name since the
* pool was opened as TRYIMPORT_NAME.
*/
if (dsl_dsobj_to_dsname(spa_name(spa),
spa->spa_bootfs, tmpname) == 0) {
char *cp;
char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
cp = strchr(tmpname, '/');
if (cp == NULL) {
(void) strlcpy(dsname, tmpname,
MAXPATHLEN);
} else {
(void) snprintf(dsname, MAXPATHLEN,
"%s/%s", poolname, ++cp);
}
VERIFY(nvlist_add_string(config,
ZPOOL_CONFIG_BOOTFS, dsname) == 0);
kmem_free(dsname, MAXPATHLEN);
}
kmem_free(tmpname, MAXPATHLEN);
}
/*
* Add the list of hot spares and level 2 cache devices.
*/
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
spa_add_spares(spa, config);
spa_add_l2cache(spa, config);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
spa_unload(spa);
spa_deactivate(spa);
spa_remove(spa);
mutex_exit(&spa_namespace_lock);
return (config);
}
/*
* Pool export/destroy
*
* The act of destroying or exporting a pool is very simple. We make sure there
* is no more pending I/O and any references to the pool are gone. Then, we
* update the pool state and sync all the labels to disk, removing the
* configuration from the cache afterwards. If the 'hardforce' flag is set, then
* we don't sync the labels or remove the configuration cache.
*/
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
boolean_t force, boolean_t hardforce)
{
spa_t *spa;
if (oldconfig)
*oldconfig = NULL;
if (!(spa_mode_global & FWRITE))
return (SET_ERROR(EROFS));
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(pool)) == NULL) {
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(ENOENT));
}
/*
* Put a hold on the pool, drop the namespace lock, stop async tasks,
* reacquire the namespace lock, and see if we can export.
*/
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
spa_async_suspend(spa);
mutex_enter(&spa_namespace_lock);
spa_close(spa, FTAG);
/*
* The pool will be in core if it's openable,
* in which case we can modify its state.
*/
if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
/*
* Objsets may be open only because they're dirty, so we
* have to force it to sync before checking spa_refcnt.
*/
txg_wait_synced(spa->spa_dsl_pool, 0);
spa_evicting_os_wait(spa);
/*
* A pool cannot be exported or destroyed if there are active
* references. If we are resetting a pool, allow references by
* fault injection handlers.
*/
if (!spa_refcount_zero(spa) ||
(spa->spa_inject_ref != 0 &&
new_state != POOL_STATE_UNINITIALIZED)) {
spa_async_resume(spa);
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EBUSY));
}
/*
* A pool cannot be exported if it has an active shared spare.
* This is to prevent other pools stealing the active spare
* from an exported pool. At user's own will, such pool can
* be forcedly exported.
*/
if (!force && new_state == POOL_STATE_EXPORTED &&
spa_has_active_shared_spare(spa)) {
spa_async_resume(spa);
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EXDEV));
}
/*
* We want this to be reflected on every label,
* so mark them all dirty. spa_unload() will do the
* final sync that pushes these changes out.
*/
if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa->spa_state = new_state;
spa->spa_final_txg = spa_last_synced_txg(spa) +
TXG_DEFER_SIZE + 1;
vdev_config_dirty(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG);
}
}
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
spa_unload(spa);
spa_deactivate(spa);
}
if (oldconfig && spa->spa_config)
VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
if (new_state != POOL_STATE_UNINITIALIZED) {
if (!hardforce)
- spa_config_sync(spa, B_TRUE, B_TRUE);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
spa_remove(spa);
}
mutex_exit(&spa_namespace_lock);
return (0);
}
/*
* Destroy a storage pool.
*/
int
spa_destroy(char *pool)
{
return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
B_FALSE, B_FALSE));
}
/*
* Export a storage pool.
*/
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
boolean_t hardforce)
{
return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
force, hardforce));
}
/*
* Similar to spa_export(), this unloads the spa_t without actually removing it
* from the namespace in any way.
*/
int
spa_reset(char *pool)
{
return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
B_FALSE, B_FALSE));
}
/*
* ==========================================================================
* Device manipulation
* ==========================================================================
*/
/*
* Add a device to a storage pool.
*/
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
uint64_t txg, id;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
VDEV_ALLOC_ADD)) != 0)
return (spa_vdev_exit(spa, NULL, txg, error));
spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
&nspares) != 0)
nspares = 0;
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
&nl2cache) != 0)
nl2cache = 0;
if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
return (spa_vdev_exit(spa, vd, txg, EINVAL));
if (vd->vdev_children != 0 &&
(error = vdev_create(vd, txg, B_FALSE)) != 0)
return (spa_vdev_exit(spa, vd, txg, error));
/*
* We must validate the spares and l2cache devices after checking the
* children. Otherwise, vdev_inuse() will blindly overwrite the spare.
*/
if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
return (spa_vdev_exit(spa, vd, txg, error));
/*
- * Transfer each new top-level vdev from vd to rvd.
+ * If we are in the middle of a device removal, we can only add
+ * devices which match the existing devices in the pool.
+ * If we are in the middle of a removal, or have some indirect
+ * vdevs, we can not add raidz toplevels.
*/
+ if (spa->spa_vdev_removal != NULL ||
+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ if (spa->spa_vdev_removal != NULL &&
+ tvd->vdev_ashift !=
+ spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+ /* Fail if top level vdev is raidz */
+ if (tvd->vdev_ops == &vdev_raidz_ops) {
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+ /*
+ * Need the top level mirror to be
+ * a mirror of leaf vdevs only
+ */
+ if (tvd->vdev_ops == &vdev_mirror_ops) {
+ for (uint64_t cid = 0;
+ cid < tvd->vdev_children; cid++) {
+ vdev_t *cvd = tvd->vdev_child[cid];
+ if (!cvd->vdev_ops->vdev_op_leaf) {
+ return (spa_vdev_exit(spa, vd,
+ txg, EINVAL));
+ }
+ }
+ }
+ }
+ }
+
for (int c = 0; c < vd->vdev_children; c++) {
/*
* Set the vdev id to the first hole, if one exists.
*/
for (id = 0; id < rvd->vdev_children; id++) {
if (rvd->vdev_child[id]->vdev_ishole) {
vdev_free(rvd->vdev_child[id]);
break;
}
}
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
tvd->vdev_id = id;
vdev_add_child(rvd, tvd);
vdev_config_dirty(tvd);
}
if (nspares != 0) {
spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
ZPOOL_CONFIG_SPARES);
spa_load_spares(spa);
spa->spa_spares.sav_sync = B_TRUE;
}
if (nl2cache != 0) {
spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
ZPOOL_CONFIG_L2CACHE);
spa_load_l2cache(spa);
spa->spa_l2cache.sav_sync = B_TRUE;
}
/*
* We have to be careful when adding new vdevs to an existing pool.
* If other threads start allocating from these vdevs before we
* sync the config cache, and we lose power, then upon reboot we may
* fail to open the pool because there are DVAs that the config cache
* can't translate. Therefore, we first add the vdevs without
* initializing metaslabs; sync the config cache (via spa_vdev_exit());
* and then let spa_config_update() initialize the new metaslabs.
*
* spa_load() checks for added-but-not-initialized vdevs, so that
* if we lose power at any point in this sequence, the remaining
* steps will be completed the next time we load the pool.
*/
(void) spa_vdev_exit(spa, vd, txg, 0);
mutex_enter(&spa_namespace_lock);
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
mutex_exit(&spa_namespace_lock);
return (0);
}
/*
* Attach a device to a mirror. The arguments are the path to any device
* in the mirror, and the nvroot for the new device. If the path specifies
* a device that is not mirrored, we automatically insert the mirror vdev.
*
* If 'replacing' is specified, the new device is intended to replace the
* existing device; in this case the two devices are made into their own
* mirror using the 'replacing' vdev, which is functionally identical to
* the mirror vdev (it actually reuses all the same ops) but has a few
* extra rules: you can't attach to it after it's been created, and upon
* completion of resilvering, the first disk (the one being replaced)
* is automatically detached.
*/
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
uint64_t txg, dtl_max_txg;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
int newvd_isspare;
int error;
ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (spa->spa_vdev_removal != NULL ||
+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+ }
+
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
if (!oldvd->vdev_ops->vdev_op_leaf)
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
pvd = oldvd->vdev_parent;
if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
VDEV_ALLOC_ATTACH)) != 0)
return (spa_vdev_exit(spa, NULL, txg, EINVAL));
if (newrootvd->vdev_children != 1)
return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
newvd = newrootvd->vdev_child[0];
if (!newvd->vdev_ops->vdev_op_leaf)
return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
return (spa_vdev_exit(spa, newrootvd, txg, error));
/*
* Spares can't replace logs
*/
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
if (!replacing) {
/*
* For attach, the only allowable parent is a mirror or the root
* vdev.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_root_ops)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
pvops = &vdev_mirror_ops;
} else {
/*
* Active hot spares can only be replaced by inactive hot
* spares.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
oldvd->vdev_isspare &&
!spa_has_spare(spa, newvd->vdev_guid))
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
* If the source is a hot spare, and the parent isn't already a
* spare, then we want to create a new hot spare. Otherwise, we
* want to create a replacing vdev. The user is not allowed to
* attach to a spared vdev child unless the 'isspare' state is
* the same (spare replaces spare, non-spare replaces
* non-spare).
*/
if (pvd->vdev_ops == &vdev_replacing_ops &&
spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
} else if (pvd->vdev_ops == &vdev_spare_ops &&
newvd->vdev_isspare != oldvd->vdev_isspare) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
if (newvd->vdev_isspare)
pvops = &vdev_spare_ops;
else
pvops = &vdev_replacing_ops;
}
/*
* Make sure the new device is big enough.
*/
if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
/*
* The new device cannot have a higher alignment requirement
* than the top-level vdev.
*/
if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
/*
* If this is an in-place replacement, update oldvd's path and devid
* to make it distinguishable from newvd, and unopenable from now on.
*/
if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
spa_strfree(oldvd->vdev_path);
oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
KM_SLEEP);
(void) sprintf(oldvd->vdev_path, "%s/%s",
newvd->vdev_path, "old");
if (oldvd->vdev_devid != NULL) {
spa_strfree(oldvd->vdev_devid);
oldvd->vdev_devid = NULL;
}
}
/* mark the device being resilvered */
newvd->vdev_resilver_txg = txg;
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
*/
if (pvd->vdev_ops != pvops)
pvd = vdev_add_parent(oldvd, pvops);
ASSERT(pvd->vdev_top->vdev_parent == rvd);
ASSERT(pvd->vdev_ops == pvops);
ASSERT(oldvd->vdev_parent == pvd);
/*
* Extract the new device from its root and add it to pvd.
*/
vdev_remove_child(newrootvd, newvd);
newvd->vdev_id = pvd->vdev_children;
newvd->vdev_crtxg = oldvd->vdev_crtxg;
vdev_add_child(pvd, newvd);
tvd = newvd->vdev_top;
ASSERT(pvd->vdev_top == tvd);
ASSERT(tvd->vdev_parent == rvd);
vdev_config_dirty(tvd);
/*
* Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
* for any dmu_sync-ed blocks. It will propagate upward when
* spa_vdev_exit() calls vdev_dtl_reassess().
*/
dtl_max_txg = txg + TXG_CONCURRENT_STATES;
vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
dtl_max_txg - TXG_INITIAL);
if (newvd->vdev_isspare) {
spa_spare_activate(newvd);
spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
}
oldvdpath = spa_strdup(oldvd->vdev_path);
newvdpath = spa_strdup(newvd->vdev_path);
newvd_isspare = newvd->vdev_isspare;
/*
* Mark newvd's DTL dirty in this txg.
*/
vdev_dirty(tvd, VDD_DTL, newvd, txg);
/*
* Schedule the resilver to restart in the future. We do this to
* ensure that dmu_sync-ed blocks have been stitched into the
* respective datasets.
*/
dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
/*
* Commit the config
*/
(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
spa_history_log_internal(spa, "vdev attach", NULL,
"%s vdev=%s %s vdev=%s",
replacing && newvd_isspare ? "spare in" :
replacing ? "replace" : "attach", newvdpath,
replacing ? "for" : "to", oldvdpath);
spa_strfree(oldvdpath);
spa_strfree(newvdpath);
return (0);
}
/*
* Detach a device from a mirror or replacing vdev.
*
* If 'replace_done' is specified, only detach if the parent
* is a replacing vdev.
*/
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
uint64_t txg;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *pvd, *cvd, *tvd;
boolean_t unspare = B_FALSE;
uint64_t unspare_guid = 0;
char *vdpath;
ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (vd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
pvd = vd->vdev_parent;
/*
* If the parent/child relationship is not as expected, don't do it.
* Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
* vdev that's replacing B with C. The user's intent in replacing
* is to go from M(A,B) to M(A,C). If the user decides to cancel
* the replace by detaching C, the expected behavior is to end up
* M(A,B). But suppose that right after deciding to detach C,
* the replacement of B completes. We would have M(A,C), and then
* ask to detach C, which would leave us with just A -- not what
* the user wanted. To prevent this, we make sure that the
* parent/child relationship hasn't changed -- in this example,
* that C's parent is still the replacing vdev R.
*/
if (pvd->vdev_guid != pguid && pguid != 0)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
/*
* Only 'replacing' or 'spare' vdevs can be replaced.
*/
if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
pvd->vdev_ops != &vdev_spare_ops)
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
spa_version(spa) >= SPA_VERSION_SPARES);
/*
* Only mirror, replacing, and spare vdevs support detach.
*/
if (pvd->vdev_ops != &vdev_replacing_ops &&
pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_spare_ops)
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
/*
* If this device has the only valid copy of some data,
* we cannot safely detach it.
*/
if (vdev_dtl_required(vd))
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
ASSERT(pvd->vdev_children >= 2);
/*
* If we are detaching the second disk from a replacing vdev, then
* check to see if we changed the original vdev's path to have "/old"
* at the end in spa_vdev_attach(). If so, undo that change now.
*/
if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
vd->vdev_path != NULL) {
size_t len = strlen(vd->vdev_path);
for (int c = 0; c < pvd->vdev_children; c++) {
cvd = pvd->vdev_child[c];
if (cvd == vd || cvd->vdev_path == NULL)
continue;
if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
strcmp(cvd->vdev_path + len, "/old") == 0) {
spa_strfree(cvd->vdev_path);
cvd->vdev_path = spa_strdup(vd->vdev_path);
break;
}
}
}
/*
* If we are detaching the original disk from a spare, then it implies
* that the spare should become a real disk, and be removed from the
* active spare list for the pool.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
vd->vdev_id == 0 &&
pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
unspare = B_TRUE;
/*
* Erase the disk labels so the disk can be used for other things.
* This must be done after all other error cases are handled,
* but before we disembowel vd (so we can still do I/O to it).
* But if we can't do it, don't treat the error as fatal --
* it may be that the unwritability of the disk is the reason
* it's being detached!
*/
error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
/*
* Remove vd from its parent and compact the parent's children.
*/
vdev_remove_child(pvd, vd);
vdev_compact_children(pvd);
/*
* Remember one of the remaining children so we can get tvd below.
*/
cvd = pvd->vdev_child[pvd->vdev_children - 1];
/*
* If we need to remove the remaining child from the list of hot spares,
* do it now, marking the vdev as no longer a spare in the process.
* We must do this before vdev_remove_parent(), because that can
* change the GUID if it creates a new toplevel GUID. For a similar
* reason, we must remove the spare now, in the same txg as the detach;
* otherwise someone could attach a new sibling, change the GUID, and
* the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
*/
if (unspare) {
ASSERT(cvd->vdev_isspare);
spa_spare_remove(cvd);
unspare_guid = cvd->vdev_guid;
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
cvd->vdev_unspare = B_TRUE;
}
/*
* If the parent mirror/replacing vdev only has one child,
* the parent is no longer needed. Remove it from the tree.
*/
if (pvd->vdev_children == 1) {
if (pvd->vdev_ops == &vdev_spare_ops)
cvd->vdev_unspare = B_FALSE;
vdev_remove_parent(cvd);
}
/*
* We don't set tvd until now because the parent we just removed
* may have been the previous top-level vdev.
*/
tvd = cvd->vdev_top;
ASSERT(tvd->vdev_parent == rvd);
/*
* Reevaluate the parent vdev state.
*/
vdev_propagate_state(cvd);
/*
* If the 'autoexpand' property is set on the pool then automatically
* try to expand the size of the pool. For example if the device we
* just detached was smaller than the others, it may be possible to
* add metaslabs (i.e. grow the pool). We need to reopen the vdev
* first so that we can obtain the updated sizes of the leaf vdevs.
*/
if (spa->spa_autoexpand) {
vdev_reopen(tvd);
vdev_expand(tvd, txg);
}
vdev_config_dirty(tvd);
/*
* Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
* vd->vdev_detached is set and free vd's DTL object in syncing context.
* But first make sure we're not on any *other* txg's DTL list, to
* prevent vd from being accessed after it's freed.
*/
vdpath = spa_strdup(vd->vdev_path);
for (int t = 0; t < TXG_SIZE; t++)
(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
vd->vdev_detached = B_TRUE;
vdev_dirty(tvd, VDD_DTL, vd, txg);
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
/* hang on to the spa before we release the lock */
spa_open_ref(spa, FTAG);
error = spa_vdev_exit(spa, vd, txg, 0);
spa_history_log_internal(spa, "detach", NULL,
"vdev=%s", vdpath);
spa_strfree(vdpath);
/*
* If this was the removal of the original device in a hot spare vdev,
* then we want to go through and remove the device from the hot spare
* list of every other pool.
*/
if (unspare) {
spa_t *altspa = NULL;
mutex_enter(&spa_namespace_lock);
while ((altspa = spa_next(altspa)) != NULL) {
if (altspa->spa_state != POOL_STATE_ACTIVE ||
altspa == spa)
continue;
spa_open_ref(altspa, FTAG);
mutex_exit(&spa_namespace_lock);
(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
mutex_enter(&spa_namespace_lock);
spa_close(altspa, FTAG);
}
mutex_exit(&spa_namespace_lock);
/* search the rest of the vdevs for spares to remove */
spa_vdev_resilver_done(spa);
}
/* all done with the spa; OK to release */
mutex_enter(&spa_namespace_lock);
spa_close(spa, FTAG);
mutex_exit(&spa_namespace_lock);
return (error);
}
/*
* Split a set of devices from their mirrors, and create a new pool from them.
*/
int
spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
nvlist_t *props, boolean_t exp)
{
int error = 0;
uint64_t txg, *glist;
spa_t *newspa;
uint_t c, children, lastlog;
nvlist_t **child, *nvl, *tmp;
dmu_tx_t *tx;
char *altroot = NULL;
vdev_t *rvd, **vml = NULL; /* vdev modify list */
boolean_t activate_slog;
ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
/* clear the log and flush everything up to now */
activate_slog = spa_passivate_log(spa);
(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
- error = spa_offline_log(spa);
+ error = spa_reset_logs(spa);
txg = spa_vdev_config_enter(spa);
if (activate_slog)
spa_activate_log(spa);
if (error != 0)
return (spa_vdev_exit(spa, NULL, txg, error));
/* check new spa name before going any further */
if (spa_lookup(newname) != NULL)
return (spa_vdev_exit(spa, NULL, txg, EEXIST));
/*
* scan through all the children to ensure they're all mirrors
*/
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
&children) != 0)
return (spa_vdev_exit(spa, NULL, txg, EINVAL));
/* first, check to ensure we've got the right child count */
rvd = spa->spa_root_vdev;
lastlog = 0;
for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
/* don't count the holes & logs as children */
- if (vd->vdev_islog || vd->vdev_ishole) {
+ if (vd->vdev_islog || !vdev_is_concrete(vd)) {
if (lastlog == 0)
lastlog = c;
continue;
}
lastlog = 0;
}
if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
return (spa_vdev_exit(spa, NULL, txg, EINVAL));
/* next, ensure no spare or cache devices are part of the split */
if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
return (spa_vdev_exit(spa, NULL, txg, EINVAL));
vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
/* then, loop over each vdev and validate it */
for (c = 0; c < children; c++) {
uint64_t is_hole = 0;
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
&is_hole);
if (is_hole != 0) {
if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
continue;
} else {
error = SET_ERROR(EINVAL);
break;
}
}
/* which disk is going to be split? */
if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
&glist[c]) != 0) {
error = SET_ERROR(EINVAL);
break;
}
/* look it up in the spa */
vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
if (vml[c] == NULL) {
error = SET_ERROR(ENODEV);
break;
}
/* make sure there's nothing stopping the split */
if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
vml[c]->vdev_islog ||
- vml[c]->vdev_ishole ||
+ !vdev_is_concrete(vml[c]) ||
vml[c]->vdev_isspare ||
vml[c]->vdev_isl2cache ||
!vdev_writeable(vml[c]) ||
vml[c]->vdev_children != 0 ||
vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
error = SET_ERROR(EINVAL);
break;
}
if (vdev_dtl_required(vml[c])) {
error = SET_ERROR(EBUSY);
break;
}
/* we need certain info from the top level */
VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
vml[c]->vdev_top->vdev_ms_array) == 0);
VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
vml[c]->vdev_top->vdev_ms_shift) == 0);
VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
vml[c]->vdev_top->vdev_asize) == 0);
VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
vml[c]->vdev_top->vdev_ashift) == 0);
/* transfer per-vdev ZAPs */
ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
VERIFY0(nvlist_add_uint64(child[c],
ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
VERIFY0(nvlist_add_uint64(child[c],
ZPOOL_CONFIG_VDEV_TOP_ZAP,
vml[c]->vdev_parent->vdev_top_zap));
}
if (error != 0) {
kmem_free(vml, children * sizeof (vdev_t *));
kmem_free(glist, children * sizeof (uint64_t));
return (spa_vdev_exit(spa, NULL, txg, error));
}
/* stop writers from using the disks */
for (c = 0; c < children; c++) {
if (vml[c] != NULL)
vml[c]->vdev_offline = B_TRUE;
}
vdev_reopen(spa->spa_root_vdev);
/*
* Temporarily record the splitting vdevs in the spa config. This
* will disappear once the config is regenerated.
*/
VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
glist, children) == 0);
kmem_free(glist, children * sizeof (uint64_t));
mutex_enter(&spa->spa_props_lock);
VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
nvl) == 0);
mutex_exit(&spa->spa_props_lock);
spa->spa_config_splitting = nvl;
vdev_config_dirty(spa->spa_root_vdev);
/* configure and create the new pool */
VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
spa_version(spa)) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
spa->spa_config_txg) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
spa_generate_guid(NULL)) == 0);
VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
/* add the new pool to the namespace */
newspa = spa_add(newname, config, altroot);
newspa->spa_avz_action = AVZ_ACTION_REBUILD;
newspa->spa_config_txg = spa->spa_config_txg;
spa_set_log_state(newspa, SPA_LOG_CLEAR);
/* release the spa config lock, retaining the namespace lock */
spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
if (zio_injection_enabled)
zio_handle_panic_injection(spa, FTAG, 1);
spa_activate(newspa, spa_mode_global);
spa_async_suspend(newspa);
#ifndef illumos
/* mark that we are creating new spa by splitting */
newspa->spa_splitting_newspa = B_TRUE;
#endif
/* create the new pool from the disks of the original pool */
error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
#ifndef illumos
newspa->spa_splitting_newspa = B_FALSE;
#endif
if (error)
goto out;
/* if that worked, generate a real config for the new pool */
if (newspa->spa_root_vdev != NULL) {
VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
B_TRUE));
}
/* set the props */
if (props != NULL) {
spa_configfile_set(newspa, props, B_FALSE);
error = spa_prop_set(newspa, props);
if (error)
goto out;
}
/* flush everything */
txg = spa_vdev_config_enter(newspa);
vdev_config_dirty(newspa->spa_root_vdev);
(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
if (zio_injection_enabled)
zio_handle_panic_injection(spa, FTAG, 2);
spa_async_resume(newspa);
/* finally, update the original pool's config */
txg = spa_vdev_config_enter(spa);
tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error != 0)
dmu_tx_abort(tx);
for (c = 0; c < children; c++) {
if (vml[c] != NULL) {
vdev_split(vml[c]);
if (error == 0)
spa_history_log_internal(spa, "detach", tx,
"vdev=%s", vml[c]->vdev_path);
vdev_free(vml[c]);
}
}
spa->spa_avz_action = AVZ_ACTION_REBUILD;
vdev_config_dirty(spa->spa_root_vdev);
spa->spa_config_splitting = NULL;
nvlist_free(nvl);
if (error == 0)
dmu_tx_commit(tx);
(void) spa_vdev_exit(spa, NULL, txg, 0);
if (zio_injection_enabled)
zio_handle_panic_injection(spa, FTAG, 3);
/* split is complete; log a history record */
spa_history_log_internal(newspa, "split", NULL,
"from pool %s", spa_name(spa));
kmem_free(vml, children * sizeof (vdev_t *));
/* if we're not going to mount the filesystems in userland, export */
if (exp)
error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
B_FALSE, B_FALSE);
return (error);
out:
spa_unload(newspa);
spa_deactivate(newspa);
spa_remove(newspa);
txg = spa_vdev_config_enter(spa);
/* re-online all offlined disks */
for (c = 0; c < children; c++) {
if (vml[c] != NULL)
vml[c]->vdev_offline = B_FALSE;
}
vdev_reopen(spa->spa_root_vdev);
nvlist_free(spa->spa_config_splitting);
spa->spa_config_splitting = NULL;
(void) spa_vdev_exit(spa, NULL, txg, error);
kmem_free(vml, children * sizeof (vdev_t *));
return (error);
}
-static nvlist_t *
-spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
-{
- for (int i = 0; i < count; i++) {
- uint64_t guid;
-
- VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
- &guid) == 0);
-
- if (guid == target_guid)
- return (nvpp[i]);
- }
-
- return (NULL);
-}
-
-static void
-spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
- nvlist_t *dev_to_remove)
-{
- nvlist_t **newdev = NULL;
-
- if (count > 1)
- newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
-
- for (int i = 0, j = 0; i < count; i++) {
- if (dev[i] == dev_to_remove)
- continue;
- VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
- }
-
- VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
- VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
-
- for (int i = 0; i < count - 1; i++)
- nvlist_free(newdev[i]);
-
- if (count > 1)
- kmem_free(newdev, (count - 1) * sizeof (void *));
-}
-
/*
- * Evacuate the device.
- */
-static int
-spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
-{
- uint64_t txg;
- int error = 0;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
- ASSERT(vd == vd->vdev_top);
-
- /*
- * Evacuate the device. We don't hold the config lock as writer
- * since we need to do I/O but we do keep the
- * spa_namespace_lock held. Once this completes the device
- * should no longer have any blocks allocated on it.
- */
- if (vd->vdev_islog) {
- if (vd->vdev_stat.vs_alloc != 0)
- error = spa_offline_log(spa);
- } else {
- error = SET_ERROR(ENOTSUP);
- }
-
- if (error)
- return (error);
-
- /*
- * The evacuation succeeded. Remove any remaining MOS metadata
- * associated with this vdev, and wait for these changes to sync.
- */
- ASSERT0(vd->vdev_stat.vs_alloc);
- txg = spa_vdev_config_enter(spa);
- vd->vdev_removing = B_TRUE;
- vdev_dirty_leaves(vd, VDD_DTL, txg);
- vdev_config_dirty(vd);
- spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
-
- return (0);
-}
-
-/*
- * Complete the removal by cleaning up the namespace.
- */
-static void
-spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- uint64_t id = vd->vdev_id;
- boolean_t last_vdev = (id == (rvd->vdev_children - 1));
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
- ASSERT(vd == vd->vdev_top);
-
- /*
- * Only remove any devices which are empty.
- */
- if (vd->vdev_stat.vs_alloc != 0)
- return;
-
- (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
-
- if (list_link_active(&vd->vdev_state_dirty_node))
- vdev_state_clean(vd);
- if (list_link_active(&vd->vdev_config_dirty_node))
- vdev_config_clean(vd);
-
- vdev_free(vd);
-
- if (last_vdev) {
- vdev_compact_children(rvd);
- } else {
- vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
- vdev_add_child(rvd, vd);
- }
- vdev_config_dirty(rvd);
-
- /*
- * Reassess the health of our root vdev.
- */
- vdev_reopen(rvd);
-}
-
-/*
- * Remove a device from the pool -
- *
- * Removing a device from the vdev namespace requires several steps
- * and can take a significant amount of time. As a result we use
- * the spa_vdev_config_[enter/exit] functions which allow us to
- * grab and release the spa_config_lock while still holding the namespace
- * lock. During each step the configuration is synced out.
- *
- * Currently, this supports removing only hot spares, slogs, and level 2 ARC
- * devices.
- */
-int
-spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
-{
- vdev_t *vd;
- sysevent_t *ev = NULL;
- metaslab_group_t *mg;
- nvlist_t **spares, **l2cache, *nv;
- uint64_t txg = 0;
- uint_t nspares, nl2cache;
- int error = 0;
- boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
-
- ASSERT(spa_writeable(spa));
-
- if (!locked)
- txg = spa_vdev_enter(spa);
-
- vd = spa_lookup_by_guid(spa, guid, B_FALSE);
-
- if (spa->spa_spares.sav_vdevs != NULL &&
- nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
- (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
- /*
- * Only remove the hot spare if it's not currently in use
- * in this pool.
- */
- if (vd == NULL || unspare) {
- if (vd == NULL)
- vd = spa_lookup_by_guid(spa, guid, B_TRUE);
- ev = spa_event_create(spa, vd, NULL,
- ESC_ZFS_VDEV_REMOVE_AUX);
- spa_vdev_remove_aux(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, spares, nspares, nv);
- spa_load_spares(spa);
- spa->spa_spares.sav_sync = B_TRUE;
- } else {
- error = SET_ERROR(EBUSY);
- }
- } else if (spa->spa_l2cache.sav_vdevs != NULL &&
- nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
- (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
- /*
- * Cache devices can always be removed.
- */
- vd = spa_lookup_by_guid(spa, guid, B_TRUE);
- ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
- spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
- spa_load_l2cache(spa);
- spa->spa_l2cache.sav_sync = B_TRUE;
- } else if (vd != NULL && vd->vdev_islog) {
- ASSERT(!locked);
- ASSERT(vd == vd->vdev_top);
-
- mg = vd->vdev_mg;
-
- /*
- * Stop allocating from this vdev.
- */
- metaslab_group_passivate(mg);
-
- /*
- * Wait for the youngest allocations and frees to sync,
- * and then wait for the deferral of those frees to finish.
- */
- spa_vdev_config_exit(spa, NULL,
- txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
-
- /*
- * Attempt to evacuate the vdev.
- */
- error = spa_vdev_remove_evacuate(spa, vd);
-
- txg = spa_vdev_config_enter(spa);
-
- /*
- * If we couldn't evacuate the vdev, unwind.
- */
- if (error) {
- metaslab_group_activate(mg);
- return (spa_vdev_exit(spa, NULL, txg, error));
- }
-
- /*
- * Clean up the vdev namespace.
- */
- ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV);
- spa_vdev_remove_from_namespace(spa, vd);
-
- } else if (vd != NULL) {
- /*
- * Normal vdevs cannot be removed (yet).
- */
- error = SET_ERROR(ENOTSUP);
- } else {
- /*
- * There is no vdev of any kind with the specified guid.
- */
- error = SET_ERROR(ENOENT);
- }
-
- if (!locked)
- error = spa_vdev_exit(spa, NULL, txg, error);
-
- if (ev)
- spa_event_post(ev);
-
- return (error);
-}
-
-/*
* Find any device that's done replacing, or a vdev marked 'unspare' that's
* currently spared, so we can detach it.
*/
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
vdev_t *newvd, *oldvd;
for (int c = 0; c < vd->vdev_children; c++) {
oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
if (oldvd != NULL)
return (oldvd);
}
/*
* Check for a completed replacement. We always consider the first
* vdev in the list to be the oldest vdev, and the last one to be
* the newest (see spa_vdev_attach() for how that works). In
* the case where the newest vdev is faulted, we will not automatically
* remove it after a resilver completes. This is OK as it will require
* user intervention to determine which disk the admin wishes to keep.
*/
if (vd->vdev_ops == &vdev_replacing_ops) {
ASSERT(vd->vdev_children > 1);
newvd = vd->vdev_child[vd->vdev_children - 1];
oldvd = vd->vdev_child[0];
if (vdev_dtl_empty(newvd, DTL_MISSING) &&
vdev_dtl_empty(newvd, DTL_OUTAGE) &&
!vdev_dtl_required(oldvd))
return (oldvd);
}
/*
* Check for a completed resilver with the 'unspare' flag set.
*/
if (vd->vdev_ops == &vdev_spare_ops) {
vdev_t *first = vd->vdev_child[0];
vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
if (last->vdev_unspare) {
oldvd = first;
newvd = last;
} else if (first->vdev_unspare) {
oldvd = last;
newvd = first;
} else {
oldvd = NULL;
}
if (oldvd != NULL &&
vdev_dtl_empty(newvd, DTL_MISSING) &&
vdev_dtl_empty(newvd, DTL_OUTAGE) &&
!vdev_dtl_required(oldvd))
return (oldvd);
/*
* If there are more than two spares attached to a disk,
* and those spares are not required, then we want to
* attempt to free them up now so that they can be used
* by other pools. Once we're back down to a single
* disk+spare, we stop removing them.
*/
if (vd->vdev_children > 2) {
newvd = vd->vdev_child[1];
if (newvd->vdev_isspare && last->vdev_isspare &&
vdev_dtl_empty(last, DTL_MISSING) &&
vdev_dtl_empty(last, DTL_OUTAGE) &&
!vdev_dtl_required(newvd))
return (newvd);
}
}
return (NULL);
}
static void
spa_vdev_resilver_done(spa_t *spa)
{
vdev_t *vd, *pvd, *ppvd;
uint64_t guid, sguid, pguid, ppguid;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
pvd = vd->vdev_parent;
ppvd = pvd->vdev_parent;
guid = vd->vdev_guid;
pguid = pvd->vdev_guid;
ppguid = ppvd->vdev_guid;
sguid = 0;
/*
* If we have just finished replacing a hot spared device, then
* we need to detach the parent's first child (the original hot
* spare) as well.
*/
if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
ppvd->vdev_children == 2) {
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
sguid = ppvd->vdev_child[1]->vdev_guid;
}
ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
spa_config_exit(spa, SCL_ALL, FTAG);
if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
return;
if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
return;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
}
spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
* Update the stored path or FRU for this vdev.
*/
int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
boolean_t ispath)
{
vdev_t *vd;
boolean_t sync = B_FALSE;
ASSERT(spa_writeable(spa));
spa_vdev_state_enter(spa, SCL_ALL);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENOENT));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
if (ispath) {
if (strcmp(value, vd->vdev_path) != 0) {
spa_strfree(vd->vdev_path);
vd->vdev_path = spa_strdup(value);
sync = B_TRUE;
}
} else {
if (vd->vdev_fru == NULL) {
vd->vdev_fru = spa_strdup(value);
sync = B_TRUE;
} else if (strcmp(value, vd->vdev_fru) != 0) {
spa_strfree(vd->vdev_fru);
vd->vdev_fru = spa_strdup(value);
sync = B_TRUE;
}
}
return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
}
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}
int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}
/*
* ==========================================================================
* SPA Scanning
* ==========================================================================
*/
int
spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
{
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
if (dsl_scan_resilvering(spa->spa_dsl_pool))
return (SET_ERROR(EBUSY));
return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
}
int
spa_scan_stop(spa_t *spa)
{
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
if (dsl_scan_resilvering(spa->spa_dsl_pool))
return (SET_ERROR(EBUSY));
return (dsl_scan_cancel(spa->spa_dsl_pool));
}
int
spa_scan(spa_t *spa, pool_scan_func_t func)
{
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
return (SET_ERROR(ENOTSUP));
/*
* If a resilver was requested, but there is no DTL on a
* writeable leaf device, we have nothing to do.
*/
if (func == POOL_SCAN_RESILVER &&
!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
return (0);
}
return (dsl_scan(spa->spa_dsl_pool, func));
}
/*
* ==========================================================================
* SPA async task processing
* ==========================================================================
*/
static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_remove_wanted) {
vd->vdev_remove_wanted = B_FALSE;
vd->vdev_delayed_close = B_FALSE;
vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
/*
* We want to clear the stats, but we don't want to do a full
* vdev_clear() as that will cause us to throw away
* degraded/faulted state as well as attempt to reopen the
* device, all of which is a waste.
*/
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
vdev_state_dirty(vd->vdev_top);
/* Tell userspace that the vdev is gone. */
zfs_post_remove(spa, vd);
}
for (int c = 0; c < vd->vdev_children; c++)
spa_async_remove(spa, vd->vdev_child[c]);
}
static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_probe_wanted) {
vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
}
for (int c = 0; c < vd->vdev_children; c++)
spa_async_probe(spa, vd->vdev_child[c]);
}
static void
spa_async_autoexpand(spa_t *spa, vdev_t *vd)
{
sysevent_id_t eid;
nvlist_t *attr;
char *physpath;
if (!spa->spa_autoexpand)
return;
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
spa_async_autoexpand(spa, cvd);
}
if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
return;
physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
nvlist_free(attr);
kmem_free(physpath, MAXPATHLEN);
}
static void
spa_async_thread(void *arg)
{
spa_t *spa = (spa_t *)arg;
int tasks;
ASSERT(spa->spa_sync_on);
mutex_enter(&spa->spa_async_lock);
tasks = spa->spa_async_tasks;
spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
mutex_exit(&spa->spa_async_lock);
/*
* See if the config needs to be updated.
*/
if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
uint64_t old_space, new_space;
mutex_enter(&spa_namespace_lock);
old_space = metaslab_class_get_space(spa_normal_class(spa));
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
new_space = metaslab_class_get_space(spa_normal_class(spa));
mutex_exit(&spa_namespace_lock);
/*
* If the pool grew as a result of the config update,
* then log an internal history event.
*/
if (new_space != old_space) {
spa_history_log_internal(spa, "vdev online", NULL,
"pool '%s' size: %llu(+%llu)",
spa_name(spa), new_space, new_space - old_space);
}
}
if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
spa_async_autoexpand(spa, spa->spa_root_vdev);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
/*
* See if any devices need to be probed.
*/
if (tasks & SPA_ASYNC_PROBE) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
/*
* If any devices are done replacing, detach them.
*/
if (tasks & SPA_ASYNC_RESILVER_DONE)
spa_vdev_resilver_done(spa);
/*
* Kick off a resilver.
*/
if (tasks & SPA_ASYNC_RESILVER)
dsl_resilver_restart(spa->spa_dsl_pool, 0);
/*
* Let the world know that we're done.
*/
mutex_enter(&spa->spa_async_lock);
spa->spa_async_thread = NULL;
cv_broadcast(&spa->spa_async_cv);
mutex_exit(&spa->spa_async_lock);
thread_exit();
}
static void
spa_async_thread_vd(void *arg)
{
spa_t *spa = arg;
int tasks;
mutex_enter(&spa->spa_async_lock);
tasks = spa->spa_async_tasks;
retry:
spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
mutex_exit(&spa->spa_async_lock);
/*
* See if any devices need to be marked REMOVED.
*/
if (tasks & SPA_ASYNC_REMOVE) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_remove(spa, spa->spa_root_vdev);
for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
for (int i = 0; i < spa->spa_spares.sav_count; i++)
spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
/*
* Let the world know that we're done.
*/
mutex_enter(&spa->spa_async_lock);
tasks = spa->spa_async_tasks;
if ((tasks & SPA_ASYNC_REMOVE) != 0)
goto retry;
spa->spa_async_thread_vd = NULL;
cv_broadcast(&spa->spa_async_cv);
mutex_exit(&spa->spa_async_lock);
thread_exit();
}
void
spa_async_suspend(spa_t *spa)
{
mutex_enter(&spa->spa_async_lock);
spa->spa_async_suspended++;
- while (spa->spa_async_thread != NULL &&
- spa->spa_async_thread_vd != NULL)
+ while (spa->spa_async_thread != NULL ||
+ spa->spa_async_thread_vd != NULL ||
+ spa->spa_condense_thread != NULL)
cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
mutex_exit(&spa->spa_async_lock);
+
+ spa_vdev_remove_suspend(spa);
}
void
spa_async_resume(spa_t *spa)
{
mutex_enter(&spa->spa_async_lock);
ASSERT(spa->spa_async_suspended != 0);
spa->spa_async_suspended--;
mutex_exit(&spa->spa_async_lock);
+ spa_restart_removal(spa);
}
static boolean_t
spa_async_tasks_pending(spa_t *spa)
{
uint_t non_config_tasks;
uint_t config_task;
boolean_t config_task_suspended;
non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
SPA_ASYNC_REMOVE);
config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
if (spa->spa_ccw_fail_time == 0) {
config_task_suspended = B_FALSE;
} else {
config_task_suspended =
(gethrtime() - spa->spa_ccw_fail_time) <
(zfs_ccw_retry_interval * NANOSEC);
}
return (non_config_tasks || (config_task && !config_task_suspended));
}
static void
spa_async_dispatch(spa_t *spa)
{
mutex_enter(&spa->spa_async_lock);
if (spa_async_tasks_pending(spa) &&
!spa->spa_async_suspended &&
spa->spa_async_thread == NULL &&
rootdir != NULL)
spa->spa_async_thread = thread_create(NULL, 0,
spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
mutex_exit(&spa->spa_async_lock);
}
static void
spa_async_dispatch_vd(spa_t *spa)
{
mutex_enter(&spa->spa_async_lock);
if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
!spa->spa_async_suspended &&
spa->spa_async_thread_vd == NULL &&
rootdir != NULL)
spa->spa_async_thread_vd = thread_create(NULL, 0,
spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
mutex_exit(&spa->spa_async_lock);
}
void
spa_async_request(spa_t *spa, int task)
{
zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
mutex_enter(&spa->spa_async_lock);
spa->spa_async_tasks |= task;
mutex_exit(&spa->spa_async_lock);
spa_async_dispatch_vd(spa);
}
/*
* ==========================================================================
* SPA syncing routines
* ==========================================================================
*/
static int
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
bpobj_t *bpo = arg;
bpobj_enqueue(bpo, bp, tx);
return (0);
}
static int
spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
zio_t *zio = arg;
zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
BP_GET_PSIZE(bp), zio->io_flags));
return (0);
}
/*
* Note: this simple function is not inlined to make it easier to dtrace the
* amount of time spent syncing frees.
*/
static void
spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
{
zio_t *zio = zio_root(spa, NULL, NULL, 0);
bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
VERIFY(zio_wait(zio) == 0);
}
/*
* Note: this simple function is not inlined to make it easier to dtrace the
* amount of time spent syncing deferred frees.
*/
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
zio_t *zio = zio_root(spa, NULL, NULL, 0);
VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
spa_free_sync_cb, zio, tx), ==, 0);
VERIFY0(zio_wait(zio));
}
static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
char *packed = NULL;
size_t bufsize;
size_t nvsize = 0;
dmu_buf_t *db;
VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
/*
* Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
* information. This avoids the dmu_buf_will_dirty() path and
* saves us a pre-read to get data we don't actually care about.
*/
bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
packed = kmem_alloc(bufsize, KM_SLEEP);
VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
KM_SLEEP) == 0);
bzero(packed + nvsize, bufsize - nvsize);
dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
kmem_free(packed, bufsize);
VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
dmu_buf_will_dirty(db, tx);
*(uint64_t *)db->db_data = nvsize;
dmu_buf_rele(db, FTAG);
}
static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
const char *config, const char *entry)
{
nvlist_t *nvroot;
nvlist_t **list;
int i;
if (!sav->sav_sync)
return;
/*
* Update the MOS nvlist describing the list of available devices.
* spa_validate_aux() will have already made sure this nvlist is
* valid and the vdevs are labeled appropriately.
*/
if (sav->sav_object == 0) {
sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
sizeof (uint64_t), tx);
VERIFY(zap_update(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
&sav->sav_object, tx) == 0);
}
VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
if (sav->sav_count == 0) {
VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
} else {
list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
for (i = 0; i < sav->sav_count; i++)
list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
B_FALSE, VDEV_CONFIG_L2CACHE);
VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
sav->sav_count) == 0);
for (i = 0; i < sav->sav_count; i++)
nvlist_free(list[i]);
kmem_free(list, sav->sav_count * sizeof (void *));
}
spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
nvlist_free(nvroot);
sav->sav_sync = B_FALSE;
}
/*
* Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
* The all-vdev ZAP must be empty.
*/
static void
spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
{
spa_t *spa = vd->vdev_spa;
if (vd->vdev_top_zap != 0) {
VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
vd->vdev_top_zap, tx));
}
if (vd->vdev_leaf_zap != 0) {
VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
vd->vdev_leaf_zap, tx));
}
for (uint64_t i = 0; i < vd->vdev_children; i++) {
spa_avz_build(vd->vdev_child[i], avz, tx);
}
}
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
nvlist_t *config;
/*
* If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
* its config may not be dirty but we still need to build per-vdev ZAPs.
* Similarly, if the pool is being assembled (e.g. after a split), we
* need to rebuild the AVZ although the config may not be dirty.
*/
if (list_is_empty(&spa->spa_config_dirty_list) &&
spa->spa_avz_action == AVZ_ACTION_NONE)
return;
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
spa->spa_all_vdev_zaps != 0);
if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
/* Make and build the new AVZ */
uint64_t new_avz = zap_create(spa->spa_meta_objset,
DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
spa_avz_build(spa->spa_root_vdev, new_avz, tx);
/* Diff old AVZ with new one */
zap_cursor_t zc;
zap_attribute_t za;
for (zap_cursor_init(&zc, spa->spa_meta_objset,
spa->spa_all_vdev_zaps);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
uint64_t vdzap = za.za_first_integer;
if (zap_lookup_int(spa->spa_meta_objset, new_avz,
vdzap) == ENOENT) {
/*
* ZAP is listed in old AVZ but not in new one;
* destroy it
*/
VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
tx));
}
}
zap_cursor_fini(&zc);
/* Destroy the old AVZ */
VERIFY0(zap_destroy(spa->spa_meta_objset,
spa->spa_all_vdev_zaps, tx));
/* Replace the old AVZ in the dir obj with the new one */
VERIFY0(zap_update(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
sizeof (new_avz), 1, &new_avz, tx));
spa->spa_all_vdev_zaps = new_avz;
} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
zap_cursor_t zc;
zap_attribute_t za;
/* Walk through the AVZ and destroy all listed ZAPs */
for (zap_cursor_init(&zc, spa->spa_meta_objset,
spa->spa_all_vdev_zaps);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
uint64_t zap = za.za_first_integer;
VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
}
zap_cursor_fini(&zc);
/* Destroy and unlink the AVZ itself */
VERIFY0(zap_destroy(spa->spa_meta_objset,
spa->spa_all_vdev_zaps, tx));
VERIFY0(zap_remove(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
spa->spa_all_vdev_zaps = 0;
}
if (spa->spa_all_vdev_zaps == 0) {
spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_VDEV_ZAP_MAP, tx);
}
spa->spa_avz_action = AVZ_ACTION_NONE;
/* Create ZAPs for vdevs that don't have them. */
vdev_construct_zaps(spa->spa_root_vdev, tx);
config = spa_config_generate(spa, spa->spa_root_vdev,
dmu_tx_get_txg(tx), B_FALSE);
/*
* If we're upgrading the spa version then make sure that
* the config object gets updated with the correct version.
*/
if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
spa->spa_uberblock.ub_version);
spa_config_exit(spa, SCL_STATE, FTAG);
nvlist_free(spa->spa_config_syncing);
spa->spa_config_syncing = config;
spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}
static void
spa_sync_version(void *arg, dmu_tx_t *tx)
{
uint64_t *versionp = arg;
uint64_t version = *versionp;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
/*
* Setting the version is special cased when first creating the pool.
*/
ASSERT(tx->tx_txg != TXG_INITIAL);
ASSERT(SPA_VERSION_IS_SUPPORTED(version));
ASSERT(version >= spa_version(spa));
spa->spa_uberblock.ub_version = version;
vdev_config_dirty(spa->spa_root_vdev);
spa_history_log_internal(spa, "set", tx, "version=%lld", version);
}
/*
* Set zpool properties.
*/
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
nvlist_t *nvp = arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
objset_t *mos = spa->spa_meta_objset;
nvpair_t *elem = NULL;
mutex_enter(&spa->spa_props_lock);
while ((elem = nvlist_next_nvpair(nvp, elem))) {
uint64_t intval;
char *strval, *fname;
zpool_prop_t prop;
const char *propname;
zprop_type_t proptype;
spa_feature_t fid;
switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
case ZPOOL_PROP_INVAL:
/*
* We checked this earlier in spa_prop_validate().
*/
ASSERT(zpool_prop_feature(nvpair_name(elem)));
fname = strchr(nvpair_name(elem), '@') + 1;
VERIFY0(zfeature_lookup_name(fname, &fid));
spa_feature_enable(spa, fid, tx);
spa_history_log_internal(spa, "set", tx,
"%s=enabled", nvpair_name(elem));
break;
case ZPOOL_PROP_VERSION:
intval = fnvpair_value_uint64(elem);
/*
* The version is synced seperatly before other
* properties and should be correct by now.
*/
ASSERT3U(spa_version(spa), >=, intval);
break;
case ZPOOL_PROP_ALTROOT:
/*
* 'altroot' is a non-persistent property. It should
* have been set temporarily at creation or import time.
*/
ASSERT(spa->spa_root != NULL);
break;
case ZPOOL_PROP_READONLY:
case ZPOOL_PROP_CACHEFILE:
/*
* 'readonly' and 'cachefile' are also non-persisitent
* properties.
*/
break;
case ZPOOL_PROP_COMMENT:
strval = fnvpair_value_string(elem);
if (spa->spa_comment != NULL)
spa_strfree(spa->spa_comment);
spa->spa_comment = spa_strdup(strval);
/*
* We need to dirty the configuration on all the vdevs
* so that their labels get updated. It's unnecessary
* to do this for pool creation since the vdev's
* configuratoin has already been dirtied.
*/
if (tx->tx_txg != TXG_INITIAL)
vdev_config_dirty(spa->spa_root_vdev);
spa_history_log_internal(spa, "set", tx,
"%s=%s", nvpair_name(elem), strval);
break;
default:
/*
* Set pool property values in the poolprops mos object.
*/
if (spa->spa_pool_props_object == 0) {
spa->spa_pool_props_object =
zap_create_link(mos, DMU_OT_POOL_PROPS,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
tx);
}
/* normalize the property name */
propname = zpool_prop_to_name(prop);
proptype = zpool_prop_get_type(prop);
if (nvpair_type(elem) == DATA_TYPE_STRING) {
ASSERT(proptype == PROP_TYPE_STRING);
strval = fnvpair_value_string(elem);
VERIFY0(zap_update(mos,
spa->spa_pool_props_object, propname,
1, strlen(strval) + 1, strval, tx));
spa_history_log_internal(spa, "set", tx,
"%s=%s", nvpair_name(elem), strval);
} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
intval = fnvpair_value_uint64(elem);
if (proptype == PROP_TYPE_INDEX) {
const char *unused;
VERIFY0(zpool_prop_index_to_string(
prop, intval, &unused));
}
VERIFY0(zap_update(mos,
spa->spa_pool_props_object, propname,
8, 1, &intval, tx));
spa_history_log_internal(spa, "set", tx,
"%s=%lld", nvpair_name(elem), intval);
} else {
ASSERT(0); /* not allowed */
}
switch (prop) {
case ZPOOL_PROP_DELEGATION:
spa->spa_delegation = intval;
break;
case ZPOOL_PROP_BOOTFS:
spa->spa_bootfs = intval;
break;
case ZPOOL_PROP_FAILUREMODE:
spa->spa_failmode = intval;
break;
case ZPOOL_PROP_AUTOEXPAND:
spa->spa_autoexpand = intval;
if (tx->tx_txg != TXG_INITIAL)
spa_async_request(spa,
SPA_ASYNC_AUTOEXPAND);
break;
case ZPOOL_PROP_DEDUPDITTO:
spa->spa_dedup_ditto = intval;
break;
default:
break;
}
}
}
mutex_exit(&spa->spa_props_lock);
}
/*
* Perform one-time upgrade on-disk changes. spa_version() does not
* reflect the new version this txg, so there must be no changes this
* txg to anything that the upgrade code depends on after it executes.
* Therefore this must be called after dsl_pool_sync() does the sync
* tasks.
*/
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
dsl_pool_t *dp = spa->spa_dsl_pool;
ASSERT(spa->spa_sync_pass == 1);
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
dsl_pool_create_origin(dp, tx);
/* Keeping the origin open increases spa_minref */
spa->spa_minref += 3;
}
if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
dsl_pool_upgrade_clones(dp, tx);
}
if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
dsl_pool_upgrade_dir_clones(dp, tx);
/* Keeping the freedir open increases spa_minref */
spa->spa_minref += 3;
}
if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
spa_feature_create_zap_objects(spa, tx);
}
/*
* LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
* when possibility to use lz4 compression for metadata was added
* Old pools that have this feature enabled must be upgraded to have
* this feature active
*/
if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
boolean_t lz4_en = spa_feature_is_enabled(spa,
SPA_FEATURE_LZ4_COMPRESS);
boolean_t lz4_ac = spa_feature_is_active(spa,
SPA_FEATURE_LZ4_COMPRESS);
if (lz4_en && !lz4_ac)
spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
}
/*
* If we haven't written the salt, do so now. Note that the
* feature may not be activated yet, but that's fine since
* the presence of this ZAP entry is backwards compatible.
*/
if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_CHECKSUM_SALT) == ENOENT) {
VERIFY0(zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
sizeof (spa->spa_cksum_salt.zcs_bytes),
spa->spa_cksum_salt.zcs_bytes, tx));
}
rrw_exit(&dp->dp_config_rwlock, FTAG);
}
+static void
+vdev_indirect_state_sync_verify(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
+
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ ASSERT(vim != NULL);
+ ASSERT(vib != NULL);
+ }
+
+ if (vdev_obsolete_sm_object(vd) != 0) {
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
+ ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
+
+ ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ space_map_object(vd->vdev_obsolete_sm));
+ ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
+ space_map_allocated(vd->vdev_obsolete_sm));
+ }
+ ASSERT(vd->vdev_obsolete_segments != NULL);
+
+ /*
+ * Since frees / remaps to an indirect vdev can only
+ * happen in syncing context, the obsolete segments
+ * tree must be empty when we start syncing.
+ */
+ ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
+}
+
/*
* Sync the specified transaction group. New blocks may be dirtied as
* part of the process, so we iterate until it converges.
*/
void
spa_sync(spa_t *spa, uint64_t txg)
{
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
int error;
uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
zfs_vdev_queue_depth_pct / 100;
VERIFY(spa_writeable(spa));
/*
+ * Wait for i/os issued in open context that need to complete
+ * before this txg syncs.
+ */
+ VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK]));
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0);
+
+ /*
* Lock out configuration changes.
*/
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);
/*
* If there are any pending vdev state changes, convert them
* into config changes that go out with this transaction group.
*/
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
while (list_head(&spa->spa_state_dirty_list) != NULL) {
/*
* We need the write lock here because, for aux vdevs,
* calling vdev_config_dirty() modifies sav_config.
* This is ugly and will become unnecessary when we
* eliminate the aux vdev wart by integrating all vdevs
* into the root vdev tree.
*/
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
vdev_state_clean(vd);
vdev_config_dirty(vd);
}
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
}
spa_config_exit(spa, SCL_STATE, FTAG);
tx = dmu_tx_create_assigned(dp, txg);
spa->spa_sync_starttime = gethrtime();
#ifdef illumos
VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
spa->spa_sync_starttime + spa->spa_deadman_synctime));
#else /* !illumos */
#ifdef _KERNEL
callout_schedule(&spa->spa_deadman_cycid,
hz * spa->spa_deadman_synctime / NANOSEC);
#endif
#endif /* illumos */
/*
* If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
* set spa_deflate if we have no raid-z vdevs.
*/
if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
int i;
for (i = 0; i < rvd->vdev_children; i++) {
vd = rvd->vdev_child[i];
if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
break;
}
if (i == rvd->vdev_children) {
spa->spa_deflate = TRUE;
VERIFY(0 == zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
sizeof (uint64_t), 1, &spa->spa_deflate, tx));
}
}
/*
* Set the top-level vdev's max queue depth. Evaluate each
* top-level's async write queue depth in case it changed.
* The max queue depth will not change in the middle of syncing
* out this txg.
*/
uint64_t queue_depth_total = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
!metaslab_group_initialized(mg))
continue;
/*
* It is safe to do a lock-free check here because only async
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
mg->mg_max_alloc_queue_depth = max_queue_depth;
queue_depth_total += mg->mg_max_alloc_queue_depth;
}
metaslab_class_t *mc = spa_normal_class(spa);
ASSERT0(refcount_count(&mc->mc_alloc_slots));
mc->mc_alloc_max_slots = queue_depth_total;
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
ASSERT3U(mc->mc_alloc_max_slots, <=,
max_queue_depth * rvd->vdev_children);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ vdev_indirect_state_sync_verify(vd);
+
+ if (vdev_indirect_should_condense(vd)) {
+ spa_condense_indirect_start_sync(vd, tx);
+ break;
+ }
+ }
+
/*
* Iterate to convergence.
*/
do {
int pass = ++spa->spa_sync_pass;
spa_sync_config_object(spa, tx);
spa_sync_aux_dev(spa, &spa->spa_spares, tx,
ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
spa_errlog_sync(spa, txg);
dsl_pool_sync(dp, txg);
if (pass < zfs_sync_pass_deferred_free) {
spa_sync_frees(spa, free_bpl, tx);
} else {
/*
* We can not defer frees in pass 1, because
* we sync the deferred frees later in pass 1.
*/
ASSERT3U(pass, >, 1);
bplist_iterate(free_bpl, bpobj_enqueue_cb,
&spa->spa_deferred_bpobj, tx);
}
ddt_sync(spa, txg);
dsl_scan_sync(dp, tx);
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+ if (spa->spa_vdev_removal != NULL)
+ svr_sync(spa, tx);
+
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+ != NULL)
vdev_sync(vd, txg);
if (pass == 1) {
spa_sync_upgrades(spa, tx);
ASSERT3U(txg, >=,
spa->spa_uberblock.ub_rootbp.blk_birth);
/*
* Note: We need to check if the MOS is dirty
* because we could have marked the MOS dirty
* without updating the uberblock (e.g. if we
* have sync tasks but no dirty user data). We
* need to check the uberblock's rootbp because
* it is updated if we have synced out dirty
* data (though in this case the MOS will most
* likely also be dirty due to second order
* effects, we don't want to rely on that here).
*/
if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
!dmu_objset_is_dirty(mos, txg)) {
/*
* Nothing changed on the first pass,
* therefore this TXG is a no-op. Avoid
* syncing deferred frees, so that we
* can keep this TXG as a no-op.
*/
ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
txg));
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
break;
}
spa_sync_deferred_frees(spa, tx);
}
} while (dmu_objset_is_dirty(mos, txg));
if (!list_is_empty(&spa->spa_config_dirty_list)) {
/*
* Make sure that the number of ZAPs for all the vdevs matches
* the number of ZAPs in the per-vdev ZAP list. This only gets
* called if the config is dirty; otherwise there may be
* outstanding AVZ operations that weren't completed in
* spa_sync_config_object.
*/
uint64_t all_vdev_zap_entry_count;
ASSERT0(zap_count(spa->spa_meta_objset,
spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
all_vdev_zap_entry_count);
}
+ if (spa->spa_vdev_removal != NULL) {
+ ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+ }
+
/*
* Rewrite the vdev configuration (which includes the uberblock)
* to commit the transaction group.
*
* If there are no dirty vdevs, we sync the uberblock to a few
* random top-level vdevs that are known to be visible in the
* config cache (see spa_vdev_add() for a complete description).
* If there *are* dirty vdevs, sync the uberblock to all vdevs.
*/
for (;;) {
/*
* We hold SCL_STATE to prevent vdev open/close/etc.
* while we're attempting to write the vdev labels.
*/
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
if (list_is_empty(&spa->spa_config_dirty_list)) {
vdev_t *svd[SPA_DVAS_PER_BP];
int svdcount = 0;
int children = rvd->vdev_children;
int c0 = spa_get_random(children);
for (int c = 0; c < children; c++) {
vd = rvd->vdev_child[(c0 + c) % children];
- if (vd->vdev_ms_array == 0 || vd->vdev_islog)
+ if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ !vdev_is_concrete(vd))
continue;
svd[svdcount++] = vd;
if (svdcount == SPA_DVAS_PER_BP)
break;
}
error = vdev_config_sync(svd, svdcount, txg);
} else {
error = vdev_config_sync(rvd->vdev_child,
rvd->vdev_children, txg);
}
if (error == 0)
spa->spa_last_synced_guid = rvd->vdev_guid;
spa_config_exit(spa, SCL_STATE, FTAG);
if (error == 0)
break;
zio_suspend(spa, NULL);
zio_resume_wait(spa);
}
dmu_tx_commit(tx);
#ifdef illumos
VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else /* !illumos */
#ifdef _KERNEL
callout_drain(&spa->spa_deadman_cycid);
#endif
#endif /* illumos */
/*
* Clear the dirty config list.
*/
while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
vdev_config_clean(vd);
/*
* Now that the new config has synced transactionally,
* let it become visible to the config cache.
*/
if (spa->spa_config_syncing != NULL) {
spa_config_set(spa, spa->spa_config_syncing);
spa->spa_config_txg = txg;
spa->spa_config_syncing = NULL;
}
dsl_pool_sync_done(dp, txg);
mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);
/*
* Update usable space statistics.
*/
while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
vdev_sync_done(vd, txg);
spa_update_dspace(spa);
/*
* It had better be the case that we didn't dirty anything
* since vdev_config_sync().
*/
ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
spa->spa_sync_pass = 0;
/*
* Update the last synced uberblock here. We want to do this at
* the end of spa_sync() so that consumers of spa_last_synced_txg()
* will be guaranteed that all the processing associated with
* that txg has been completed.
*/
spa->spa_ubsync = spa->spa_uberblock;
spa_config_exit(spa, SCL_CONFIG, FTAG);
spa_handle_ignored_writes(spa);
/*
* If any async tasks have been requested, kick them off.
*/
spa_async_dispatch(spa);
spa_async_dispatch_vd(spa);
}
/*
* Sync all pools. We don't want to hold the namespace lock across these
* operations, so we take a reference on the spa_t and drop the lock during the
* sync.
*/
void
spa_sync_allpools(void)
{
spa_t *spa = NULL;
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL) {
if (spa_state(spa) != POOL_STATE_ACTIVE ||
!spa_writeable(spa) || spa_suspended(spa))
continue;
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
txg_wait_synced(spa_get_dsl(spa), 0);
mutex_enter(&spa_namespace_lock);
spa_close(spa, FTAG);
}
mutex_exit(&spa_namespace_lock);
}
/*
* ==========================================================================
* Miscellaneous routines
* ==========================================================================
*/
/*
* Remove all pools in the system.
*/
void
spa_evict_all(void)
{
spa_t *spa;
/*
* Remove all cached state. All pools should be closed now,
* so every spa in the AVL tree should be unreferenced.
*/
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(NULL)) != NULL) {
/*
* Stop async tasks. The async thread may need to detach
* a device that's been replaced, which requires grabbing
* spa_namespace_lock, so we must drop it here.
*/
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
spa_async_suspend(spa);
mutex_enter(&spa_namespace_lock);
spa_close(spa, FTAG);
if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
spa_unload(spa);
spa_deactivate(spa);
}
spa_remove(spa);
}
mutex_exit(&spa_namespace_lock);
}
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
vdev_t *vd;
int i;
if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
return (vd);
if (aux) {
for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
vd = spa->spa_l2cache.sav_vdevs[i];
if (vd->vdev_guid == guid)
return (vd);
}
for (i = 0; i < spa->spa_spares.sav_count; i++) {
vd = spa->spa_spares.sav_vdevs[i];
if (vd->vdev_guid == guid)
return (vd);
}
}
return (NULL);
}
void
spa_upgrade(spa_t *spa, uint64_t version)
{
ASSERT(spa_writeable(spa));
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
* This should only be called for a non-faulted pool, and since a
* future version would result in an unopenable pool, this shouldn't be
* possible.
*/
ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
spa->spa_uberblock.ub_version = version;
vdev_config_dirty(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG);
txg_wait_synced(spa_get_dsl(spa), 0);
}
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
int i;
uint64_t spareguid;
spa_aux_vdev_t *sav = &spa->spa_spares;
for (i = 0; i < sav->sav_count; i++)
if (sav->sav_vdevs[i]->vdev_guid == guid)
return (B_TRUE);
for (i = 0; i < sav->sav_npending; i++) {
if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
&spareguid) == 0 && spareguid == guid)
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Check if a pool has an active shared spare device.
* Note: reference count of an active spare is 2, as a spare and as a replace
*/
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
int i, refcnt;
uint64_t pool;
spa_aux_vdev_t *sav = &spa->spa_spares;
for (i = 0; i < sav->sav_count; i++) {
if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
&refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
refcnt > 2)
return (B_TRUE);
}
return (B_FALSE);
}
-static sysevent_t *
+sysevent_t *
spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
sysevent_t *ev = NULL;
#ifdef _KERNEL
sysevent_attr_list_t *attr = NULL;
sysevent_value_t value;
ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
SE_SLEEP);
ASSERT(ev != NULL);
value.value_type = SE_DATA_TYPE_STRING;
value.value.sv_string = spa_name(spa);
if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
goto done;
value.value_type = SE_DATA_TYPE_UINT64;
value.value.sv_uint64 = spa_guid(spa);
if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
goto done;
if (vd) {
value.value_type = SE_DATA_TYPE_UINT64;
value.value.sv_uint64 = vd->vdev_guid;
if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
SE_SLEEP) != 0)
goto done;
if (vd->vdev_path) {
value.value_type = SE_DATA_TYPE_STRING;
value.value.sv_string = vd->vdev_path;
if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
&value, SE_SLEEP) != 0)
goto done;
}
}
if (hist_nvl != NULL) {
fnvlist_merge((nvlist_t *)attr, hist_nvl);
}
if (sysevent_attach_attributes(ev, attr) != 0)
goto done;
attr = NULL;
done:
if (attr)
sysevent_free_attr(attr);
#endif
return (ev);
}
-static void
+void
spa_event_post(sysevent_t *ev)
{
#ifdef _KERNEL
sysevent_id_t eid;
(void) log_sysevent(ev, SE_SLEEP, &eid);
+ sysevent_free(ev);
+#endif
+}
+
+void
+spa_event_discard(sysevent_t *ev)
+{
+#ifdef _KERNEL
sysevent_free(ev);
#endif
}
/*
* Post a sysevent corresponding to the given event. The 'name' must be one of
* the event definitions in sys/sysevent/eventdefs.h. The payload will be
* filled in from the spa and (optionally) the vdev and history nvl. This
* doesn't do anything in the userland libzpool, as we don't want consumers to
* misinterpret ztest or zdb as real changes.
*/
void
spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c (revision 332525)
@@ -1,563 +1,563 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/nvpair.h>
#include <sys/uio.h>
#include <sys/fs/zfs.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/utsname.h>
#include <sys/sunddi.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
#include <sys/kobj.h>
#include <sys/zone.h>
#endif
/*
* Pool configuration repository.
*
* Pool configuration is stored as a packed nvlist on the filesystem. By
* default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
* (when the ZFS module is loaded). Pools can also have the 'cachefile'
* property set that allows them to be stored in an alternate location until
* the control of external software.
*
* For each cache file, we have a single nvlist which holds all the
* configuration information. When the module loads, we read this information
* from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
* maintained independently in spa.c. Whenever the namespace is modified, or
- * the configuration of a pool is changed, we call spa_config_sync(), which
+ * the configuration of a pool is changed, we call spa_write_cachefile(), which
* walks through all the active pools and writes the configuration to disk.
*/
static uint64_t spa_config_generation = 1;
/*
* This can be overridden in userland to preserve an alternate namespace for
* userland pools when doing testing.
*/
const char *spa_config_path = ZPOOL_CACHE;
/*
* Called when the module is first loaded, this routine loads the configuration
* file into the SPA namespace. It does not actually open or load the pools; it
* only populates the namespace.
*/
void
spa_config_load(void)
{
void *buf = NULL;
nvlist_t *nvlist, *child;
nvpair_t *nvpair;
char *pathname;
struct _buf *file;
uint64_t fsize;
/*
* Open the configuration file.
*/
pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
file = kobj_open_file(pathname);
kmem_free(pathname, MAXPATHLEN);
if (file == (struct _buf *)-1)
return;
if (kobj_get_filesize(file, &fsize) != 0)
goto out;
buf = kmem_alloc(fsize, KM_SLEEP);
/*
* Read the nvlist from the file.
*/
if (kobj_read_file(file, buf, fsize, 0) < 0)
goto out;
/*
* Unpack the nvlist.
*/
if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
goto out;
/*
* Iterate over all elements in the nvlist, creating a new spa_t for
* each one with the specified configuration.
*/
mutex_enter(&spa_namespace_lock);
nvpair = NULL;
while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
continue;
child = fnvpair_value_nvlist(nvpair);
if (spa_lookup(nvpair_name(nvpair)) != NULL)
continue;
(void) spa_add(nvpair_name(nvpair), child, NULL);
}
mutex_exit(&spa_namespace_lock);
nvlist_free(nvlist);
out:
if (buf != NULL)
kmem_free(buf, fsize);
kobj_close_file(file);
}
static void
spa_config_clean(nvlist_t *nvl)
{
nvlist_t **child;
nvlist_t *nvroot = NULL;
uint_t c, children;
if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
&children) == 0) {
for (c = 0; c < children; c++)
spa_config_clean(child[c]);
}
if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0)
spa_config_clean(nvroot);
nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY);
nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY);
}
static int
spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
{
size_t buflen;
char *buf;
vnode_t *vp;
int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
char *temp;
int err;
/*
* If the nvlist is empty (NULL), then remove the old cachefile.
*/
if (nvl == NULL) {
err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
return (err);
}
/*
* Pack the configuration into a buffer.
*/
buf = fnvlist_pack(nvl, &buflen);
temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
/*
* Write the configuration to disk. We need to do the traditional
* 'write to temporary file, sync, move over original' to make sure we
* always have a consistent view of the data.
*/
(void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path);
err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0);
if (err == 0) {
err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
0, RLIM64_INFINITY, kcred, NULL);
if (err == 0)
err = VOP_FSYNC(vp, FSYNC, kcred, NULL);
if (err == 0)
err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE);
(void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL);
}
(void) vn_remove(temp, UIO_SYSSPACE, RMFILE);
fnvlist_pack_free(buf, buflen);
kmem_free(temp, MAXPATHLEN);
return (err);
}
/*
* Synchronize pool configuration to disk. This must be called with the
* namespace lock held. Synchronizing the pool cache is typically done after
* the configuration has been synced to the MOS. This exposes a window where
* the MOS config will have been updated but the cache file has not. If
* the system were to crash at that instant then the cached config may not
- * contain the correct information to open the pool and an explicity import
+ * contain the correct information to open the pool and an explicit import
* would be required.
*/
void
-spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
+spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
{
spa_config_dirent_t *dp, *tdp;
nvlist_t *nvl;
boolean_t ccw_failure;
int error;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (rootdir == NULL || !(spa_mode_global & FWRITE))
return;
/*
* Iterate over all cachefiles for the pool, past or present. When the
* cachefile is changed, the new one is pushed onto this list, allowing
* us to update previous cachefiles that no longer contain this pool.
*/
ccw_failure = B_FALSE;
for (dp = list_head(&target->spa_config_list); dp != NULL;
dp = list_next(&target->spa_config_list, dp)) {
spa_t *spa = NULL;
if (dp->scd_path == NULL)
continue;
/*
* Iterate over all pools, adding any matching pools to 'nvl'.
*/
nvl = NULL;
while ((spa = spa_next(spa)) != NULL) {
nvlist_t *nvroot = NULL;
/*
* Skip over our own pool if we're about to remove
* ourselves from the spa namespace or any pool that
* is readonly. Since we cannot guarantee that a
* readonly pool would successfully import upon reboot,
* we don't allow them to be written to the cache file.
*/
if ((spa == target && removing) ||
(spa_state(spa) == POOL_STATE_ACTIVE &&
!spa_writeable(spa)))
continue;
mutex_enter(&spa->spa_props_lock);
tdp = list_head(&spa->spa_config_list);
if (spa->spa_config == NULL ||
tdp->scd_path == NULL ||
strcmp(tdp->scd_path, dp->scd_path) != 0) {
mutex_exit(&spa->spa_props_lock);
continue;
}
if (nvl == NULL)
nvl = fnvlist_alloc();
fnvlist_add_nvlist(nvl, spa->spa_name,
spa->spa_config);
mutex_exit(&spa->spa_props_lock);
if (nvlist_lookup_nvlist(nvl, spa->spa_name, &nvroot) == 0)
spa_config_clean(nvroot);
}
error = spa_config_write(dp, nvl);
if (error != 0)
ccw_failure = B_TRUE;
nvlist_free(nvl);
}
if (ccw_failure) {
/*
* Keep trying so that configuration data is
* written if/when any temporary filesystem
* resource issues are resolved.
*/
if (target->spa_ccw_fail_time == 0) {
zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
target, NULL, NULL, 0, 0);
}
target->spa_ccw_fail_time = gethrtime();
spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
} else {
/*
* Do not rate limit future attempts to update
* the config cache.
*/
target->spa_ccw_fail_time = 0;
}
/*
* Remove any config entries older than the current one.
*/
dp = list_head(&target->spa_config_list);
while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) {
list_remove(&target->spa_config_list, tdp);
if (tdp->scd_path != NULL)
spa_strfree(tdp->scd_path);
kmem_free(tdp, sizeof (spa_config_dirent_t));
}
spa_config_generation++;
if (postsysevent)
spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
}
/*
* Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
* and we don't want to allow the local zone to see all the pools anyway.
* So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
* information for all pool visible within the zone.
*/
nvlist_t *
spa_all_configs(uint64_t *generation)
{
nvlist_t *pools;
spa_t *spa = NULL;
if (*generation == spa_config_generation)
return (NULL);
pools = fnvlist_alloc();
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL) {
if (INGLOBALZONE(curthread) ||
zone_dataset_visible(spa_name(spa), NULL)) {
mutex_enter(&spa->spa_props_lock);
fnvlist_add_nvlist(pools, spa_name(spa),
spa->spa_config);
mutex_exit(&spa->spa_props_lock);
}
}
*generation = spa_config_generation;
mutex_exit(&spa_namespace_lock);
return (pools);
}
void
spa_config_set(spa_t *spa, nvlist_t *config)
{
mutex_enter(&spa->spa_props_lock);
nvlist_free(spa->spa_config);
spa->spa_config = config;
mutex_exit(&spa->spa_props_lock);
}
/*
* Generate the pool's configuration based on the current in-core state.
*
* We infer whether to generate a complete config or just one top-level config
* based on whether vd is the root vdev.
*/
nvlist_t *
spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
{
nvlist_t *config, *nvroot;
vdev_t *rvd = spa->spa_root_vdev;
unsigned long hostid = 0;
boolean_t locked = B_FALSE;
uint64_t split_guid;
if (vd == NULL) {
vd = rvd;
locked = B_TRUE;
spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
}
ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
(SCL_CONFIG | SCL_STATE));
/*
* If txg is -1, report the current value of spa->spa_config_txg.
*/
if (txg == -1ULL)
txg = spa->spa_config_txg;
config = fnvlist_alloc();
fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
if (spa->spa_comment != NULL) {
fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
spa->spa_comment);
}
#ifdef _KERNEL
hostid = zone_get_hostid(NULL);
#else /* _KERNEL */
/*
* We're emulating the system's hostid in userland, so we can't use
* zone_get_hostid().
*/
(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
#endif /* _KERNEL */
if (hostid != 0) {
fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
}
fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename);
int config_gen_flags = 0;
if (vd != rvd) {
fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
vd->vdev_top->vdev_guid);
fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
vd->vdev_guid);
if (vd->vdev_isspare) {
fnvlist_add_uint64(config,
ZPOOL_CONFIG_IS_SPARE, 1ULL);
}
if (vd->vdev_islog) {
fnvlist_add_uint64(config,
ZPOOL_CONFIG_IS_LOG, 1ULL);
}
vd = vd->vdev_top; /* label contains top config */
} else {
/*
* Only add the (potentially large) split information
* in the mos config, and not in the vdev labels
*/
if (spa->spa_config_splitting != NULL)
fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
spa->spa_config_splitting);
fnvlist_add_boolean(config,
ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS);
config_gen_flags |= VDEV_CONFIG_MOS;
}
/*
* Add the top-level config. We even add this on pools which
* don't support holes in the namespace.
*/
vdev_top_config_generate(spa, config);
/*
* If we're splitting, record the original pool's guid.
*/
if (spa->spa_config_splitting != NULL &&
nvlist_lookup_uint64(spa->spa_config_splitting,
ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
split_guid);
}
nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags);
fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot);
nvlist_free(nvroot);
/*
* Store what's necessary for reading the MOS in the label.
*/
fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
spa->spa_label_features);
if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
ddt_histogram_t *ddh;
ddt_stat_t *dds;
ddt_object_t *ddo;
ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
ddt_get_dedup_histogram(spa, ddh);
fnvlist_add_uint64_array(config,
ZPOOL_CONFIG_DDT_HISTOGRAM,
(uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t));
kmem_free(ddh, sizeof (ddt_histogram_t));
ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
ddt_get_dedup_object_stats(spa, ddo);
fnvlist_add_uint64_array(config,
ZPOOL_CONFIG_DDT_OBJ_STATS,
(uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t));
kmem_free(ddo, sizeof (ddt_object_t));
dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
ddt_get_dedup_stats(spa, dds);
fnvlist_add_uint64_array(config,
ZPOOL_CONFIG_DDT_STATS,
(uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t));
kmem_free(dds, sizeof (ddt_stat_t));
}
if (locked)
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
return (config);
}
/*
* Update all disk labels, generate a fresh config based on the current
* in-core state, and sync the global config cache (do not sync the config
* cache if this is a booting rootpool).
*/
void
spa_config_update(spa_t *spa, int what)
{
vdev_t *rvd = spa->spa_root_vdev;
uint64_t txg;
int c;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
txg = spa_last_synced_txg(spa) + 1;
if (what == SPA_CONFIG_UPDATE_POOL) {
vdev_config_dirty(rvd);
} else {
/*
* If we have top-level vdevs that were added but have
* not yet been prepared for allocation, do that now.
* (It's safe now because the config cache is up to date,
* so it will be able to translate the new DVAs.)
* See comments in spa_vdev_add() for full details.
*/
for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
if (tvd->vdev_ms_array == 0) {
vdev_ashift_optimize(tvd);
vdev_metaslab_set_size(tvd);
}
vdev_expand(tvd, txg);
}
}
spa_config_exit(spa, SCL_ALL, FTAG);
/*
* Wait for the mosconfig to be regenerated and synced.
*/
txg_wait_synced(spa->spa_dsl_pool, txg);
/*
* Update the global config cache to reflect the new mosconfig.
*/
- spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
+ spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
if (what == SPA_CONFIG_UPDATE_POOL)
spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c (revision 332525)
@@ -1,2207 +1,2276 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2017 Datto Inc.
*/
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/spa_boot.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include "zfs_prop.h"
#include <sys/zfeature.h>
#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
/*
* SPA locking
*
* There are four basic locks for managing spa_t structures:
*
* spa_namespace_lock (global mutex)
*
* This lock must be acquired to do any of the following:
*
* - Lookup a spa_t by name
* - Add or remove a spa_t from the namespace
* - Increase spa_refcount from non-zero
* - Check if spa_refcount is zero
* - Rename a spa_t
* - add/remove/attach/detach devices
* - Held for the duration of create/destroy/import/export
*
* It does not need to handle recursion. A create or destroy may
* reference objects (files or zvols) in other pools, but by
* definition they must have an existing reference, and will never need
* to lookup a spa_t by name.
*
* spa_refcount (per-spa refcount_t protected by mutex)
*
* This reference count keep track of any active users of the spa_t. The
* spa_t cannot be destroyed or freed while this is non-zero. Internally,
* the refcount is never really 'zero' - opening a pool implicitly keeps
* some references in the DMU. Internally we check against spa_minref, but
* present the image of a zero/non-zero value to consumers.
*
* spa_config_lock[] (per-spa array of rwlocks)
*
* This protects the spa_t from config changes, and must be held in
* the following circumstances:
*
* - RW_READER to perform I/O to the spa
* - RW_WRITER to change the vdev config
*
* The locking order is fairly straightforward:
*
* spa_namespace_lock -> spa_refcount
*
* The namespace lock must be acquired to increase the refcount from 0
* or to check if it is zero.
*
* spa_refcount -> spa_config_lock[]
*
* There must be at least one valid reference on the spa_t to acquire
* the config lock.
*
* spa_namespace_lock -> spa_config_lock[]
*
* The namespace lock must always be taken before the config lock.
*
*
* The spa_namespace_lock can be acquired directly and is globally visible.
*
* The namespace is manipulated using the following functions, all of which
* require the spa_namespace_lock to be held.
*
* spa_lookup() Lookup a spa_t by name.
*
* spa_add() Create a new spa_t in the namespace.
*
* spa_remove() Remove a spa_t from the namespace. This also
* frees up any memory associated with the spa_t.
*
* spa_next() Returns the next spa_t in the system, or the
* first if NULL is passed.
*
* spa_evict_all() Shutdown and remove all spa_t structures in
* the system.
*
* spa_guid_exists() Determine whether a pool/device guid exists.
*
* The spa_refcount is manipulated using the following functions:
*
* spa_open_ref() Adds a reference to the given spa_t. Must be
* called with spa_namespace_lock held if the
* refcount is currently zero.
*
* spa_close() Remove a reference from the spa_t. This will
* not free the spa_t or remove it from the
* namespace. No locking is required.
*
* spa_refcount_zero() Returns true if the refcount is currently
* zero. Must be called with spa_namespace_lock
* held.
*
* The spa_config_lock[] is an array of rwlocks, ordered as follows:
* SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
* spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
*
* To read the configuration, it suffices to hold one of these locks as reader.
* To modify the configuration, you must hold all locks as writer. To modify
* vdev state without altering the vdev tree's topology (e.g. online/offline),
* you must hold SCL_STATE and SCL_ZIO as writer.
*
* We use these distinct config locks to avoid recursive lock entry.
* For example, spa_sync() (which holds SCL_CONFIG as reader) induces
* block allocations (SCL_ALLOC), which may require reading space maps
* from disk (dmu_read() -> zio_read() -> SCL_ZIO).
*
* The spa config locks cannot be normal rwlocks because we need the
* ability to hand off ownership. For example, SCL_ZIO is acquired
* by the issuing thread and later released by an interrupt thread.
* They do, however, obey the usual write-wanted semantics to prevent
* writer (i.e. system administrator) starvation.
*
* The lock acquisition rules are as follows:
*
* SCL_CONFIG
* Protects changes to the vdev tree topology, such as vdev
* add/remove/attach/detach. Protects the dirty config list
* (spa_config_dirty_list) and the set of spares and l2arc devices.
*
* SCL_STATE
* Protects changes to pool state and vdev state, such as vdev
* online/offline/fault/degrade/clear. Protects the dirty state list
* (spa_state_dirty_list) and global pool state (spa_state).
*
* SCL_ALLOC
* Protects changes to metaslab groups and classes.
* Held as reader by metaslab_alloc() and metaslab_claim().
*
* SCL_ZIO
* Held by bp-level zios (those which have no io_vd upon entry)
* to prevent changes to the vdev tree. The bp-level zio implicitly
* protects all of its vdev child zios, which do not hold SCL_ZIO.
*
* SCL_FREE
* Protects changes to metaslab groups and classes.
* Held as reader by metaslab_free(). SCL_FREE is distinct from
* SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
* blocks in zio_done() while another i/o that holds either
* SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
*
* SCL_VDEV
* Held as reader to prevent changes to the vdev tree during trivial
* inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
* other locks, and lower than all of them, to ensure that it's safe
* to acquire regardless of caller context.
*
* In addition, the following rules apply:
*
* (a) spa_props_lock protects pool properties, spa_config and spa_config_list.
* The lock ordering is SCL_CONFIG > spa_props_lock.
*
* (b) I/O operations on leaf vdevs. For any zio operation that takes
* an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
* or zio_write_phys() -- the caller must ensure that the config cannot
* cannot change in the interim, and that the vdev cannot be reopened.
* SCL_STATE as reader suffices for both.
*
* The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
*
* spa_vdev_enter() Acquire the namespace lock and the config lock
* for writing.
*
* spa_vdev_exit() Release the config lock, wait for all I/O
* to complete, sync the updated configs to the
* cache, and release the namespace lock.
*
* vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
* Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
* locking is, always, based on spa_namespace_lock and spa_config_lock[].
*
* spa_rename() is also implemented within this file since it requires
* manipulation of the namespace.
*/
static avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
static int spa_active_count;
int spa_max_replication_override = SPA_DVAS_PER_BP;
static kmutex_t spa_spare_lock;
static avl_tree_t spa_spare_avl;
static kmutex_t spa_l2cache_lock;
static avl_tree_t spa_l2cache_avl;
kmem_cache_t *spa_buffer_pool;
int spa_mode_global;
#ifdef ZFS_DEBUG
-/* Everything except dprintf and spa is on by default in debug builds */
-int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
+/*
+ * Everything except dprintf, spa, and indirect_remap is on by default
+ * in debug builds.
+ */
+int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA | ZFS_DEBUG_INDIRECT_REMAP);
#else
int zfs_flags = 0;
#endif
/*
* zfs_recover can be set to nonzero to attempt to recover from
* otherwise-fatal errors, typically caused by on-disk corruption. When
* set, calls to zfs_panic_recover() will turn into warning messages.
* This should only be used as a last resort, as it typically results
* in leaked space, or worse.
*/
boolean_t zfs_recover = B_FALSE;
/*
* If destroy encounters an EIO while reading metadata (e.g. indirect
* blocks), space referenced by the missing metadata can not be freed.
* Normally this causes the background destroy to become "stalled", as
* it is unable to make forward progress. While in this stalled state,
* all remaining space to free from the error-encountering filesystem is
* "temporarily leaked". Set this flag to cause it to ignore the EIO,
* permanently leak the space from indirect blocks that can not be read,
* and continue to free everything else that it can.
*
* The default, "stalling" behavior is useful if the storage partially
* fails (i.e. some but not all i/os fail), and then later recovers. In
* this case, we will be able to continue pool operations while it is
* partially failed, and when it recovers, we can continue to free the
* space, with no leaks. However, note that this case is actually
* fairly rare.
*
* Typically pools either (a) fail completely (but perhaps temporarily,
* e.g. a top-level vdev going offline), or (b) have localized,
* permanent errors (e.g. disk returns the wrong data due to bit flip or
* firmware bug). In case (a), this setting does not matter because the
* pool will be suspended and the sync thread will not be able to make
* forward progress regardless. In case (b), because the error is
* permanent, the best we can do is leak the minimum amount of space,
* which is what setting this flag will do. Therefore, it is reasonable
* for this flag to normally be set, but we chose the more conservative
* approach of not setting it, so that there is no possibility of
* leaking space in the "partial temporary" failure case.
*/
boolean_t zfs_free_leak_on_eio = B_FALSE;
/*
* Expiration time in milliseconds. This value has two meanings. First it is
* used to determine when the spa_deadman() logic should fire. By default the
* spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
* Secondly, the value determines if an I/O is considered "hung". Any I/O that
* has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
* in a system panic.
*/
uint64_t zfs_deadman_synctime_ms = 1000000ULL;
/*
* Check time in milliseconds. This defines the frequency at which we check
* for hung I/O.
*/
uint64_t zfs_deadman_checktime_ms = 5000ULL;
/*
* Default value of -1 for zfs_deadman_enabled is resolved in
* zfs_deadman_init()
*/
int zfs_deadman_enabled = -1;
/*
* The worst case is single-sector max-parity RAID-Z blocks, in which
* case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
* times the size; so just assume that. Add to this the fact that
* we can have up to 3 DVAs per bp, and one more factor of 2 because
* the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
* the worst case is:
* (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
*/
int spa_asize_inflation = 24;
#if defined(__FreeBSD__) && defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
"Try to recover from otherwise-fatal errors.");
static int
sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
{
int err, val;
val = zfs_flags;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
/*
* ZFS_DEBUG_MODIFY must be enabled prior to boot so all
* arc buffers in the system have the necessary additional
* checksum data. However, it is safe to disable at any
* time.
*/
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
val &= ~ZFS_DEBUG_MODIFY;
zfs_flags = val;
return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags,
CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
sysctl_vfs_zfs_debug_flags, "IU",
"Debug flags for ZFS testing (deprecated, see vfs.zfs.debugflags).");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
&zfs_deadman_synctime_ms, 0,
"Stalled ZFS I/O expiration time in milliseconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
&zfs_deadman_checktime_ms, 0,
"Period of checks for stalled ZFS I/O in milliseconds");
SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
&zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
&spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
#endif
#ifndef illumos
#ifdef _KERNEL
static void
zfs_deadman_init()
{
/*
* If we are not i386 or amd64 or in a virtual machine,
* disable ZFS deadman thread by default
*/
if (zfs_deadman_enabled == -1) {
#if defined(__amd64__) || defined(__i386__)
zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
#else
zfs_deadman_enabled = 0;
#endif
}
}
#endif /* _KERNEL */
#endif /* !illumos */
/*
* Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
* the pool to be consumed. This ensures that we don't run the pool
* completely out of space, due to unaccounted changes (e.g. to the MOS).
* It also limits the worst-case time to allocate space. If we have
* less than this amount of free space, most ZPL operations (e.g. write,
* create) will return ENOSPC.
*
* Certain operations (e.g. file removal, most administrative actions) can
* use half the slop space. They will only return ENOSPC if less than half
* the slop space is free. Typically, once the pool has less than the slop
* space free, the user will use these operations to free up space in the pool.
* These are the operations that call dsl_pool_adjustedsize() with the netfree
* argument set to TRUE.
*
* A very restricted set of operations are always permitted, regardless of
* the amount of free space. These are the operations that call
* dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these
* operations result in a net increase in the amount of space used,
* it is possible to run the pool completely out of space, causing it to
* be permanently read-only.
*
* Note that on very small pools, the slop space will be larger than
* 3.2%, in an effort to have it be at least spa_min_slop (128MB),
* but we never allow it to be more than half the pool size.
*
* See also the comments in zfs_space_check_t.
*/
int spa_slop_shift = 5;
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
&spa_slop_shift, 0,
"Shift value of reserved space (1/(2^spa_slop_shift)).");
uint64_t spa_min_slop = 128 * 1024 * 1024;
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
&spa_min_slop, 0,
"Minimal value of reserved space");
/*
* ==========================================================================
* SPA config locking
* ==========================================================================
*/
static void
spa_config_lock_init(spa_t *spa)
{
for (int i = 0; i < SCL_LOCKS; i++) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
refcount_create_untracked(&scl->scl_count);
scl->scl_writer = NULL;
scl->scl_write_wanted = 0;
}
}
static void
spa_config_lock_destroy(spa_t *spa)
{
for (int i = 0; i < SCL_LOCKS; i++) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
mutex_destroy(&scl->scl_lock);
cv_destroy(&scl->scl_cv);
refcount_destroy(&scl->scl_count);
ASSERT(scl->scl_writer == NULL);
ASSERT(scl->scl_write_wanted == 0);
}
}
int
spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
{
for (int i = 0; i < SCL_LOCKS; i++) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
if (!(locks & (1 << i)))
continue;
mutex_enter(&scl->scl_lock);
if (rw == RW_READER) {
if (scl->scl_writer || scl->scl_write_wanted) {
mutex_exit(&scl->scl_lock);
spa_config_exit(spa, locks & ((1 << i) - 1),
tag);
return (0);
}
} else {
ASSERT(scl->scl_writer != curthread);
if (!refcount_is_zero(&scl->scl_count)) {
mutex_exit(&scl->scl_lock);
spa_config_exit(spa, locks & ((1 << i) - 1),
tag);
return (0);
}
scl->scl_writer = curthread;
}
(void) refcount_add(&scl->scl_count, tag);
mutex_exit(&scl->scl_lock);
}
return (1);
}
void
spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
{
int wlocks_held = 0;
ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
for (int i = 0; i < SCL_LOCKS; i++) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
if (scl->scl_writer == curthread)
wlocks_held |= (1 << i);
if (!(locks & (1 << i)))
continue;
mutex_enter(&scl->scl_lock);
if (rw == RW_READER) {
while (scl->scl_writer || scl->scl_write_wanted) {
cv_wait(&scl->scl_cv, &scl->scl_lock);
}
} else {
ASSERT(scl->scl_writer != curthread);
while (!refcount_is_zero(&scl->scl_count)) {
scl->scl_write_wanted++;
cv_wait(&scl->scl_cv, &scl->scl_lock);
scl->scl_write_wanted--;
}
scl->scl_writer = curthread;
}
(void) refcount_add(&scl->scl_count, tag);
mutex_exit(&scl->scl_lock);
}
- ASSERT(wlocks_held <= locks);
+ ASSERT3U(wlocks_held, <=, locks);
}
void
spa_config_exit(spa_t *spa, int locks, void *tag)
{
for (int i = SCL_LOCKS - 1; i >= 0; i--) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
if (!(locks & (1 << i)))
continue;
mutex_enter(&scl->scl_lock);
ASSERT(!refcount_is_zero(&scl->scl_count));
if (refcount_remove(&scl->scl_count, tag) == 0) {
ASSERT(scl->scl_writer == NULL ||
scl->scl_writer == curthread);
scl->scl_writer = NULL; /* OK in either case */
cv_broadcast(&scl->scl_cv);
}
mutex_exit(&scl->scl_lock);
}
}
int
spa_config_held(spa_t *spa, int locks, krw_t rw)
{
int locks_held = 0;
for (int i = 0; i < SCL_LOCKS; i++) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
if (!(locks & (1 << i)))
continue;
if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
(rw == RW_WRITER && scl->scl_writer == curthread))
locks_held |= 1 << i;
}
return (locks_held);
}
/*
* ==========================================================================
* SPA namespace functions
* ==========================================================================
*/
/*
* Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
* Returns NULL if no matching spa_t is found.
*/
spa_t *
spa_lookup(const char *name)
{
static spa_t search; /* spa_t is large; don't allocate on stack */
spa_t *spa;
avl_index_t where;
char *cp;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
/*
* If it's a full dataset name, figure out the pool name and
* just use that.
*/
cp = strpbrk(search.spa_name, "/@#");
if (cp != NULL)
*cp = '\0';
spa = avl_find(&spa_namespace_avl, &search, &where);
return (spa);
}
/*
* Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
* If the zfs_deadman_enabled flag is set then it inspects all vdev queues
* looking for potentially hung I/Os.
*/
static void
spa_deadman(void *arg, int pending)
{
spa_t *spa = arg;
/*
* Disable the deadman timer if the pool is suspended.
*/
if (spa_suspended(spa)) {
#ifdef illumos
VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else
/* Nothing. just don't schedule any future callouts. */
#endif
return;
}
zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
(gethrtime() - spa->spa_sync_starttime) / NANOSEC,
++spa->spa_deadman_calls);
if (zfs_deadman_enabled)
vdev_deadman(spa->spa_root_vdev);
#ifdef __FreeBSD__
#ifdef _KERNEL
callout_schedule(&spa->spa_deadman_cycid,
hz * zfs_deadman_checktime_ms / MILLISEC);
#endif
#endif
}
#if defined(__FreeBSD__) && defined(_KERNEL)
static void
spa_deadman_timeout(void *arg)
{
spa_t *spa = arg;
taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task);
}
#endif
/*
* Create an uninitialized spa_t with the given name. Requires
* spa_namespace_lock. The caller must ensure that the spa_t doesn't already
* exist by calling spa_lookup() first.
*/
spa_t *
spa_add(const char *name, nvlist_t *config, const char *altroot)
{
spa_t *spa;
spa_config_dirent_t *dp;
#ifdef illumos
cyc_handler_t hdlr;
cyc_time_t when;
#endif
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < TXG_SIZE; t++)
bplist_create(&spa->spa_free_bplist[t]);
(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
spa->spa_state = POOL_STATE_UNINITIALIZED;
spa->spa_freeze_txg = UINT64_MAX;
spa->spa_final_txg = UINT64_MAX;
spa->spa_load_max_txg = UINT64_MAX;
spa->spa_proc = &p0;
spa->spa_proc_state = SPA_PROC_NONE;
#ifdef illumos
hdlr.cyh_func = spa_deadman;
hdlr.cyh_arg = spa;
hdlr.cyh_level = CY_LOW_LEVEL;
#endif
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
#ifdef illumos
/*
* This determines how often we need to check for hung I/Os after
* the cyclic has already fired. Since checking for hung I/Os is
* an expensive operation we don't want to check too frequently.
* Instead wait for 5 seconds before checking again.
*/
when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
when.cyt_when = CY_INFINITY;
mutex_enter(&cpu_lock);
spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
mutex_exit(&cpu_lock);
#else /* !illumos */
#ifdef _KERNEL
/*
* callout(9) does not provide a way to initialize a callout with
* a function and an argument, so we use callout_reset() to schedule
* the callout in the very distant future. Even if that event ever
* fires, it should be okayas we won't have any active zio-s.
* But normally spa_sync() will reschedule the callout with a proper
* timeout.
* callout(9) does not allow the callback function to sleep but
* vdev_deadman() needs to acquire vq_lock and illumos mutexes are
* emulated using sx(9). For this reason spa_deadman_timeout()
* will schedule spa_deadman() as task on a taskqueue that allows
* sleeping.
*/
TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
callout_init(&spa->spa_deadman_cycid, 1);
callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
spa_deadman_timeout, spa, 0);
#endif
#endif
refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
avl_add(&spa_namespace_avl, spa);
/*
* Set the alternate root, if there is one.
*/
if (altroot) {
spa->spa_root = spa_strdup(altroot);
spa_active_count++;
}
avl_create(&spa->spa_alloc_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
/*
* Every pool starts with the default cachefile
*/
list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
offsetof(spa_config_dirent_t, scd_link));
dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
list_insert_head(&spa->spa_config_list, dp);
VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
KM_SLEEP) == 0);
if (config != NULL) {
nvlist_t *features;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
&features) == 0) {
VERIFY(nvlist_dup(features, &spa->spa_label_features,
0) == 0);
}
VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
}
if (spa->spa_label_features == NULL) {
VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
KM_SLEEP) == 0);
}
spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
/*
* As a pool is being created, treat all features as disabled by
* setting SPA_FEATURE_DISABLED for all entries in the feature
* refcount cache.
*/
for (int i = 0; i < SPA_FEATURES; i++) {
spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
}
return (spa);
}
/*
* Removes a spa_t from the namespace, freeing up any memory used. Requires
* spa_namespace_lock. This is called only after the spa_t has been closed and
* deactivated.
*/
void
spa_remove(spa_t *spa)
{
spa_config_dirent_t *dp;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
nvlist_free(spa->spa_config_splitting);
avl_remove(&spa_namespace_avl, spa);
cv_broadcast(&spa_namespace_cv);
if (spa->spa_root) {
spa_strfree(spa->spa_root);
spa_active_count--;
}
while ((dp = list_head(&spa->spa_config_list)) != NULL) {
list_remove(&spa->spa_config_list, dp);
if (dp->scd_path != NULL)
spa_strfree(dp->scd_path);
kmem_free(dp, sizeof (spa_config_dirent_t));
}
avl_destroy(&spa->spa_alloc_tree);
list_destroy(&spa->spa_config_list);
nvlist_free(spa->spa_label_features);
nvlist_free(spa->spa_load_info);
spa_config_set(spa, NULL);
#ifdef illumos
mutex_enter(&cpu_lock);
if (spa->spa_deadman_cycid != CYCLIC_NONE)
cyclic_remove(spa->spa_deadman_cycid);
mutex_exit(&cpu_lock);
spa->spa_deadman_cycid = CYCLIC_NONE;
#else /* !illumos */
#ifdef _KERNEL
callout_drain(&spa->spa_deadman_cycid);
taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task);
#endif
#endif
refcount_destroy(&spa->spa_refcount);
spa_config_lock_destroy(spa);
for (int t = 0; t < TXG_SIZE; t++)
bplist_destroy(&spa->spa_free_bplist[t]);
zio_checksum_templates_free(spa);
cv_destroy(&spa->spa_async_cv);
cv_destroy(&spa->spa_evicting_os_cv);
cv_destroy(&spa->spa_proc_cv);
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);
mutex_destroy(&spa->spa_evicting_os_lock);
mutex_destroy(&spa->spa_history_lock);
mutex_destroy(&spa->spa_proc_lock);
mutex_destroy(&spa->spa_props_lock);
mutex_destroy(&spa->spa_cksum_tmpls_lock);
mutex_destroy(&spa->spa_scrub_lock);
mutex_destroy(&spa->spa_suspend_lock);
mutex_destroy(&spa->spa_vdev_top_lock);
kmem_free(spa, sizeof (spa_t));
}
/*
* Given a pool, return the next pool in the namespace, or NULL if there is
* none. If 'prev' is NULL, return the first pool.
*/
spa_t *
spa_next(spa_t *prev)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (prev)
return (AVL_NEXT(&spa_namespace_avl, prev));
else
return (avl_first(&spa_namespace_avl));
}
/*
* ==========================================================================
* SPA refcount functions
* ==========================================================================
*/
/*
* Add a reference to the given spa_t. Must have at least one reference, or
* have the namespace lock held.
*/
void
spa_open_ref(spa_t *spa, void *tag)
{
ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
MUTEX_HELD(&spa_namespace_lock));
(void) refcount_add(&spa->spa_refcount, tag);
}
/*
* Remove a reference to the given spa_t. Must have at least one reference, or
* have the namespace lock held.
*/
void
spa_close(spa_t *spa, void *tag)
{
ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
MUTEX_HELD(&spa_namespace_lock));
(void) refcount_remove(&spa->spa_refcount, tag);
}
/*
* Remove a reference to the given spa_t held by a dsl dir that is
* being asynchronously released. Async releases occur from a taskq
* performing eviction of dsl datasets and dirs. The namespace lock
* isn't held and the hold by the object being evicted may contribute to
* spa_minref (e.g. dataset or directory released during pool export),
* so the asserts in spa_close() do not apply.
*/
void
spa_async_close(spa_t *spa, void *tag)
{
(void) refcount_remove(&spa->spa_refcount, tag);
}
/*
* Check to see if the spa refcount is zero. Must be called with
* spa_namespace_lock held. We really compare against spa_minref, which is the
* number of references acquired when opening a pool
*/
boolean_t
spa_refcount_zero(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
}
/*
* ==========================================================================
* SPA spare and l2cache tracking
* ==========================================================================
*/
/*
* Hot spares and cache devices are tracked using the same code below,
* for 'auxiliary' devices.
*/
typedef struct spa_aux {
uint64_t aux_guid;
uint64_t aux_pool;
avl_node_t aux_avl;
int aux_count;
} spa_aux_t;
static int
spa_aux_compare(const void *a, const void *b)
{
const spa_aux_t *sa = a;
const spa_aux_t *sb = b;
if (sa->aux_guid < sb->aux_guid)
return (-1);
else if (sa->aux_guid > sb->aux_guid)
return (1);
else
return (0);
}
void
spa_aux_add(vdev_t *vd, avl_tree_t *avl)
{
avl_index_t where;
spa_aux_t search;
spa_aux_t *aux;
search.aux_guid = vd->vdev_guid;
if ((aux = avl_find(avl, &search, &where)) != NULL) {
aux->aux_count++;
} else {
aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
aux->aux_guid = vd->vdev_guid;
aux->aux_count = 1;
avl_insert(avl, aux, where);
}
}
void
spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
{
spa_aux_t search;
spa_aux_t *aux;
avl_index_t where;
search.aux_guid = vd->vdev_guid;
aux = avl_find(avl, &search, &where);
ASSERT(aux != NULL);
if (--aux->aux_count == 0) {
avl_remove(avl, aux);
kmem_free(aux, sizeof (spa_aux_t));
} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
aux->aux_pool = 0ULL;
}
}
boolean_t
spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
{
spa_aux_t search, *found;
search.aux_guid = guid;
found = avl_find(avl, &search, NULL);
if (pool) {
if (found)
*pool = found->aux_pool;
else
*pool = 0ULL;
}
if (refcnt) {
if (found)
*refcnt = found->aux_count;
else
*refcnt = 0;
}
return (found != NULL);
}
void
spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
{
spa_aux_t search, *found;
avl_index_t where;
search.aux_guid = vd->vdev_guid;
found = avl_find(avl, &search, &where);
ASSERT(found != NULL);
ASSERT(found->aux_pool == 0ULL);
found->aux_pool = spa_guid(vd->vdev_spa);
}
/*
* Spares are tracked globally due to the following constraints:
*
* - A spare may be part of multiple pools.
* - A spare may be added to a pool even if it's actively in use within
* another pool.
* - A spare in use in any pool can only be the source of a replacement if
* the target is a spare in the same pool.
*
* We keep track of all spares on the system through the use of a reference
* counted AVL tree. When a vdev is added as a spare, or used as a replacement
* spare, then we bump the reference count in the AVL tree. In addition, we set
* the 'vdev_isspare' member to indicate that the device is a spare (active or
* inactive). When a spare is made active (used to replace a device in the
* pool), we also keep track of which pool its been made a part of.
*
* The 'spa_spare_lock' protects the AVL tree. These functions are normally
* called under the spa_namespace lock as part of vdev reconfiguration. The
* separate spare lock exists for the status query path, which does not need to
* be completely consistent with respect to other vdev configuration changes.
*/
static int
spa_spare_compare(const void *a, const void *b)
{
return (spa_aux_compare(a, b));
}
void
spa_spare_add(vdev_t *vd)
{
mutex_enter(&spa_spare_lock);
ASSERT(!vd->vdev_isspare);
spa_aux_add(vd, &spa_spare_avl);
vd->vdev_isspare = B_TRUE;
mutex_exit(&spa_spare_lock);
}
void
spa_spare_remove(vdev_t *vd)
{
mutex_enter(&spa_spare_lock);
ASSERT(vd->vdev_isspare);
spa_aux_remove(vd, &spa_spare_avl);
vd->vdev_isspare = B_FALSE;
mutex_exit(&spa_spare_lock);
}
boolean_t
spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
{
boolean_t found;
mutex_enter(&spa_spare_lock);
found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
mutex_exit(&spa_spare_lock);
return (found);
}
void
spa_spare_activate(vdev_t *vd)
{
mutex_enter(&spa_spare_lock);
ASSERT(vd->vdev_isspare);
spa_aux_activate(vd, &spa_spare_avl);
mutex_exit(&spa_spare_lock);
}
/*
* Level 2 ARC devices are tracked globally for the same reasons as spares.
* Cache devices currently only support one pool per cache device, and so
* for these devices the aux reference count is currently unused beyond 1.
*/
static int
spa_l2cache_compare(const void *a, const void *b)
{
return (spa_aux_compare(a, b));
}
void
spa_l2cache_add(vdev_t *vd)
{
mutex_enter(&spa_l2cache_lock);
ASSERT(!vd->vdev_isl2cache);
spa_aux_add(vd, &spa_l2cache_avl);
vd->vdev_isl2cache = B_TRUE;
mutex_exit(&spa_l2cache_lock);
}
void
spa_l2cache_remove(vdev_t *vd)
{
mutex_enter(&spa_l2cache_lock);
ASSERT(vd->vdev_isl2cache);
spa_aux_remove(vd, &spa_l2cache_avl);
vd->vdev_isl2cache = B_FALSE;
mutex_exit(&spa_l2cache_lock);
}
boolean_t
spa_l2cache_exists(uint64_t guid, uint64_t *pool)
{
boolean_t found;
mutex_enter(&spa_l2cache_lock);
found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
mutex_exit(&spa_l2cache_lock);
return (found);
}
void
spa_l2cache_activate(vdev_t *vd)
{
mutex_enter(&spa_l2cache_lock);
ASSERT(vd->vdev_isl2cache);
spa_aux_activate(vd, &spa_l2cache_avl);
mutex_exit(&spa_l2cache_lock);
}
/*
* ==========================================================================
* SPA vdev locking
* ==========================================================================
*/
/*
* Lock the given spa_t for the purpose of adding or removing a vdev.
* Grabs the global spa_namespace_lock plus the spa config lock for writing.
* It returns the next transaction group for the spa_t.
*/
uint64_t
spa_vdev_enter(spa_t *spa)
{
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
return (spa_vdev_config_enter(spa));
}
/*
* Internal implementation for spa_vdev_enter(). Used when a vdev
* operation requires multiple syncs (i.e. removing a device) while
* keeping the spa_namespace_lock held.
*/
uint64_t
spa_vdev_config_enter(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
return (spa_last_synced_txg(spa) + 1);
}
/*
* Used in combination with spa_vdev_config_enter() to allow the syncing
* of multiple transactions without releasing the spa_namespace_lock.
*/
void
spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
int config_changed = B_FALSE;
ASSERT(txg > spa_last_synced_txg(spa));
spa->spa_pending_vdev = NULL;
/*
* Reassess the DTLs.
*/
vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
config_changed = B_TRUE;
spa->spa_config_generation++;
}
/*
* Verify the metaslab classes.
*/
ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
spa_config_exit(spa, SCL_ALL, spa);
/*
* Panic the system if the specified tag requires it. This
* is useful for ensuring that configurations are updated
* transactionally.
*/
if (zio_injection_enabled)
zio_handle_panic_injection(spa, tag, 0);
/*
* Note: this txg_wait_synced() is important because it ensures
* that there won't be more than one config change per txg.
* This allows us to use the txg as the generation number.
*/
if (error == 0)
txg_wait_synced(spa->spa_dsl_pool, txg);
if (vd != NULL) {
ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
vdev_free(vd);
spa_config_exit(spa, SCL_ALL, spa);
}
/*
* If the config changed, update the config cache.
*/
if (config_changed)
- spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
}
/*
* Unlock the spa_t after adding or removing a vdev. Besides undoing the
* locking of spa_vdev_enter(), we also want make sure the transactions have
* synced to disk, and then update the global configuration cache with the new
* information.
*/
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
mutex_exit(&spa->spa_vdev_top_lock);
return (error);
}
/*
* Lock the given spa_t for the purpose of changing vdev state.
*/
void
spa_vdev_state_enter(spa_t *spa, int oplocks)
{
int locks = SCL_STATE_ALL | oplocks;
/*
* Root pools may need to read of the underlying devfs filesystem
* when opening up a vdev. Unfortunately if we're holding the
* SCL_ZIO lock it will result in a deadlock when we try to issue
* the read from the root filesystem. Instead we "prefetch"
* the associated vnodes that we need prior to opening the
* underlying devices and cache them so that we can prevent
* any I/O when we are doing the actual open.
*/
if (spa_is_root(spa)) {
int low = locks & ~(SCL_ZIO - 1);
int high = locks & ~low;
spa_config_enter(spa, high, spa, RW_WRITER);
vdev_hold(spa->spa_root_vdev);
spa_config_enter(spa, low, spa, RW_WRITER);
} else {
spa_config_enter(spa, locks, spa, RW_WRITER);
}
spa->spa_vdev_locks = locks;
}
int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
boolean_t config_changed = B_FALSE;
if (vd != NULL || error == 0)
vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
0, 0, B_FALSE);
if (vd != NULL) {
vdev_state_dirty(vd->vdev_top);
config_changed = B_TRUE;
spa->spa_config_generation++;
}
if (spa_is_root(spa))
vdev_rele(spa->spa_root_vdev);
ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
spa_config_exit(spa, spa->spa_vdev_locks, spa);
/*
* If anything changed, wait for it to sync. This ensures that,
* from the system administrator's perspective, zpool(1M) commands
* are synchronous. This is important for things like zpool offline:
* when the command completes, you expect no further I/O from ZFS.
*/
if (vd != NULL)
txg_wait_synced(spa->spa_dsl_pool, 0);
/*
* If the config changed, update the config cache.
*/
if (config_changed) {
mutex_enter(&spa_namespace_lock);
- spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
mutex_exit(&spa_namespace_lock);
}
return (error);
}
/*
* ==========================================================================
* Miscellaneous functions
* ==========================================================================
*/
void
spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
{
if (!nvlist_exists(spa->spa_label_features, feature)) {
fnvlist_add_boolean(spa->spa_label_features, feature);
/*
* When we are creating the pool (tx_txg==TXG_INITIAL), we can't
* dirty the vdev config because lock SCL_CONFIG is not held.
* Thankfully, in this case we don't need to dirty the config
* because it will be written out anyway when we finish
* creating the pool.
*/
if (tx->tx_txg != TXG_INITIAL)
vdev_config_dirty(spa->spa_root_vdev);
}
}
void
spa_deactivate_mos_feature(spa_t *spa, const char *feature)
{
if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
vdev_config_dirty(spa->spa_root_vdev);
}
/*
* Rename a spa_t.
*/
int
spa_rename(const char *name, const char *newname)
{
spa_t *spa;
int err;
/*
* Lookup the spa_t and grab the config lock for writing. We need to
* actually open the pool so that we can sync out the necessary labels.
* It's OK to call spa_open() with the namespace lock held because we
* allow recursive calls for other reasons.
*/
mutex_enter(&spa_namespace_lock);
if ((err = spa_open(name, &spa, FTAG)) != 0) {
mutex_exit(&spa_namespace_lock);
return (err);
}
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
avl_remove(&spa_namespace_avl, spa);
(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
avl_add(&spa_namespace_avl, spa);
/*
* Sync all labels to disk with the new names by marking the root vdev
* dirty and waiting for it to sync. It will pick up the new pool name
* during the sync.
*/
vdev_config_dirty(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG);
txg_wait_synced(spa->spa_dsl_pool, 0);
/*
* Sync the updated config cache.
*/
- spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
spa_close(spa, FTAG);
mutex_exit(&spa_namespace_lock);
return (0);
}
/*
* Return the spa_t associated with given pool_guid, if it exists. If
* device_guid is non-zero, determine whether the pool exists *and* contains
* a device with the specified device_guid.
*/
spa_t *
spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
{
spa_t *spa;
avl_tree_t *t = &spa_namespace_avl;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
if (spa->spa_state == POOL_STATE_UNINITIALIZED)
continue;
if (spa->spa_root_vdev == NULL)
continue;
if (spa_guid(spa) == pool_guid) {
if (device_guid == 0)
break;
if (vdev_lookup_by_guid(spa->spa_root_vdev,
device_guid) != NULL)
break;
/*
* Check any devices we may be in the process of adding.
*/
if (spa->spa_pending_vdev) {
if (vdev_lookup_by_guid(spa->spa_pending_vdev,
device_guid) != NULL)
break;
}
}
}
return (spa);
}
/*
* Determine whether a pool with the given pool_guid exists.
*/
boolean_t
spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
{
return (spa_by_guid(pool_guid, device_guid) != NULL);
}
char *
spa_strdup(const char *s)
{
size_t len;
char *new;
len = strlen(s);
new = kmem_alloc(len + 1, KM_SLEEP);
bcopy(s, new, len);
new[len] = '\0';
return (new);
}
void
spa_strfree(char *s)
{
kmem_free(s, strlen(s) + 1);
}
uint64_t
spa_get_random(uint64_t range)
{
uint64_t r;
ASSERT(range != 0);
(void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
return (r % range);
}
uint64_t
spa_generate_guid(spa_t *spa)
{
uint64_t guid = spa_get_random(-1ULL);
if (spa != NULL) {
while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
guid = spa_get_random(-1ULL);
} else {
while (guid == 0 || spa_guid_exists(guid, 0))
guid = spa_get_random(-1ULL);
}
return (guid);
}
void
snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
{
char type[256];
char *checksum = NULL;
char *compress = NULL;
if (bp != NULL) {
if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
dmu_object_byteswap_t bswap =
DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
(void) snprintf(type, sizeof (type), "bswap %s %s",
DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
"metadata" : "data",
dmu_ot_byteswap[bswap].ob_name);
} else {
(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
sizeof (type));
}
if (!BP_IS_EMBEDDED(bp)) {
checksum =
zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
}
compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
}
SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
compress);
}
void
spa_freeze(spa_t *spa)
{
uint64_t freeze_txg = 0;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
if (spa->spa_freeze_txg == UINT64_MAX) {
freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
spa->spa_freeze_txg = freeze_txg;
}
spa_config_exit(spa, SCL_ALL, FTAG);
if (freeze_txg != 0)
txg_wait_synced(spa_get_dsl(spa), freeze_txg);
}
void
zfs_panic_recover(const char *fmt, ...)
{
va_list adx;
va_start(adx, fmt);
vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
va_end(adx);
}
/*
* This is a stripped-down version of strtoull, suitable only for converting
* lowercase hexadecimal numbers that don't overflow.
*/
uint64_t
zfs_strtonum(const char *str, char **nptr)
{
uint64_t val = 0;
char c;
int digit;
while ((c = *str) != '\0') {
if (c >= '0' && c <= '9')
digit = c - '0';
else if (c >= 'a' && c <= 'f')
digit = 10 + c - 'a';
else
break;
val *= 16;
val += digit;
str++;
}
if (nptr)
*nptr = (char *)str;
return (val);
}
/*
* ==========================================================================
* Accessor functions
* ==========================================================================
*/
boolean_t
spa_shutting_down(spa_t *spa)
{
return (spa->spa_async_suspended);
}
dsl_pool_t *
spa_get_dsl(spa_t *spa)
{
return (spa->spa_dsl_pool);
}
boolean_t
spa_is_initializing(spa_t *spa)
{
return (spa->spa_is_initializing);
}
+boolean_t
+spa_indirect_vdevs_loaded(spa_t *spa)
+{
+ return (spa->spa_indirect_vdevs_loaded);
+}
+
blkptr_t *
spa_get_rootblkptr(spa_t *spa)
{
return (&spa->spa_ubsync.ub_rootbp);
}
void
spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
{
spa->spa_uberblock.ub_rootbp = *bp;
}
void
spa_altroot(spa_t *spa, char *buf, size_t buflen)
{
if (spa->spa_root == NULL)
buf[0] = '\0';
else
(void) strncpy(buf, spa->spa_root, buflen);
}
int
spa_sync_pass(spa_t *spa)
{
return (spa->spa_sync_pass);
}
char *
spa_name(spa_t *spa)
{
return (spa->spa_name);
}
uint64_t
spa_guid(spa_t *spa)
{
dsl_pool_t *dp = spa_get_dsl(spa);
uint64_t guid;
/*
* If we fail to parse the config during spa_load(), we can go through
* the error path (which posts an ereport) and end up here with no root
* vdev. We stash the original pool guid in 'spa_config_guid' to handle
* this case.
*/
if (spa->spa_root_vdev == NULL)
return (spa->spa_config_guid);
guid = spa->spa_last_synced_guid != 0 ?
spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
/*
* Return the most recently synced out guid unless we're
* in syncing context.
*/
if (dp && dsl_pool_sync_context(dp))
return (spa->spa_root_vdev->vdev_guid);
else
return (guid);
}
uint64_t
spa_load_guid(spa_t *spa)
{
/*
* This is a GUID that exists solely as a reference for the
* purposes of the arc. It is generated at load time, and
* is never written to persistent storage.
*/
return (spa->spa_load_guid);
}
uint64_t
spa_last_synced_txg(spa_t *spa)
{
return (spa->spa_ubsync.ub_txg);
}
uint64_t
spa_first_txg(spa_t *spa)
{
return (spa->spa_first_txg);
}
uint64_t
spa_syncing_txg(spa_t *spa)
{
return (spa->spa_syncing_txg);
}
/*
* Return the last txg where data can be dirtied. The final txgs
* will be used to just clear out any deferred frees that remain.
*/
uint64_t
spa_final_dirty_txg(spa_t *spa)
{
return (spa->spa_final_txg - TXG_DEFER_SIZE);
}
pool_state_t
spa_state(spa_t *spa)
{
return (spa->spa_state);
}
spa_load_state_t
spa_load_state(spa_t *spa)
{
return (spa->spa_load_state);
}
uint64_t
spa_freeze_txg(spa_t *spa)
{
return (spa->spa_freeze_txg);
}
/* ARGSUSED */
uint64_t
spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
{
return (lsize * spa_asize_inflation);
}
/*
* Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%),
* or at least 128MB, unless that would cause it to be more than half the
* pool size.
*
* See the comment above spa_slop_shift for details.
*/
uint64_t
spa_get_slop_space(spa_t *spa)
{
uint64_t space = spa_get_dspace(spa);
return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
}
uint64_t
spa_get_dspace(spa_t *spa)
{
return (spa->spa_dspace);
}
void
spa_update_dspace(spa_t *spa)
{
spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
ddt_get_dedup_dspace(spa);
+ if (spa->spa_vdev_removal != NULL) {
+ /*
+ * We can't allocate from the removing device, so
+ * subtract its size. This prevents the DMU/DSL from
+ * filling up the (now smaller) pool while we are in the
+ * middle of removing the device.
+ *
+ * Note that the DMU/DSL doesn't actually know or care
+ * how much space is allocated (it does its own tracking
+ * of how much space has been logically used). So it
+ * doesn't matter that the data we are moving may be
+ * allocated twice (on the old device and the new
+ * device).
+ */
+ vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
+ spa->spa_dspace -= spa_deflate(spa) ?
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ }
}
/*
* Return the failure mode that has been set to this pool. The default
* behavior will be to block all I/Os when a complete failure occurs.
*/
uint8_t
spa_get_failmode(spa_t *spa)
{
return (spa->spa_failmode);
}
boolean_t
spa_suspended(spa_t *spa)
{
return (spa->spa_suspended);
}
uint64_t
spa_version(spa_t *spa)
{
return (spa->spa_ubsync.ub_version);
}
boolean_t
spa_deflate(spa_t *spa)
{
return (spa->spa_deflate);
}
metaslab_class_t *
spa_normal_class(spa_t *spa)
{
return (spa->spa_normal_class);
}
metaslab_class_t *
spa_log_class(spa_t *spa)
{
return (spa->spa_log_class);
}
void
spa_evicting_os_register(spa_t *spa, objset_t *os)
{
mutex_enter(&spa->spa_evicting_os_lock);
list_insert_head(&spa->spa_evicting_os_list, os);
mutex_exit(&spa->spa_evicting_os_lock);
}
void
spa_evicting_os_deregister(spa_t *spa, objset_t *os)
{
mutex_enter(&spa->spa_evicting_os_lock);
list_remove(&spa->spa_evicting_os_list, os);
cv_broadcast(&spa->spa_evicting_os_cv);
mutex_exit(&spa->spa_evicting_os_lock);
}
void
spa_evicting_os_wait(spa_t *spa)
{
mutex_enter(&spa->spa_evicting_os_lock);
while (!list_is_empty(&spa->spa_evicting_os_list))
cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
mutex_exit(&spa->spa_evicting_os_lock);
dmu_buf_user_evict_wait();
}
int
spa_max_replication(spa_t *spa)
{
/*
* As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
* handle BPs with more than one DVA allocated. Set our max
* replication level accordingly.
*/
if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
return (1);
return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}
int
spa_prev_software_version(spa_t *spa)
{
return (spa->spa_prev_software_version);
}
uint64_t
spa_deadman_synctime(spa_t *spa)
{
return (spa->spa_deadman_synctime);
}
uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
uint64_t asize = DVA_GET_ASIZE(dva);
uint64_t dsize = asize;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
if (asize != 0 && spa->spa_deflate) {
uint64_t vdev = DVA_GET_VDEV(dva);
vdev_t *vd = vdev_lookup_top(spa, vdev);
if (vd == NULL) {
panic(
"dva_get_dsize_sync(): bad DVA %llu:%llu",
(u_longlong_t)vdev, (u_longlong_t)asize);
}
dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
}
return (dsize);
}
uint64_t
bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
{
uint64_t dsize = 0;
for (int d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
return (dsize);
}
uint64_t
bp_get_dsize(spa_t *spa, const blkptr_t *bp)
{
uint64_t dsize = 0;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (int d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
spa_config_exit(spa, SCL_VDEV, FTAG);
return (dsize);
}
/*
* ==========================================================================
* Initialization and Termination
* ==========================================================================
*/
static int
spa_name_compare(const void *a1, const void *a2)
{
const spa_t *s1 = a1;
const spa_t *s2 = a2;
int s;
s = strcmp(s1->spa_name, s2->spa_name);
if (s > 0)
return (1);
if (s < 0)
return (-1);
return (0);
}
int
spa_busy(void)
{
return (spa_active_count);
}
void
spa_boot_init()
{
spa_config_load();
}
#ifdef _KERNEL
EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
#endif
void
spa_init(int mode)
{
mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
offsetof(spa_t, spa_avl));
avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
offsetof(spa_aux_t, aux_avl));
avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
offsetof(spa_aux_t, aux_avl));
spa_mode_global = mode;
#ifdef illumos
#ifdef _KERNEL
spa_arch_init();
#else
if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
arc_procfd = open("/proc/self/ctl", O_WRONLY);
if (arc_procfd == -1) {
perror("could not enable watchpoints: "
"opening /proc/self/ctl failed: ");
} else {
arc_watch = B_TRUE;
}
}
#endif
#endif /* illumos */
refcount_sysinit();
unique_init();
range_tree_init();
metaslab_alloc_trace_init();
zio_init();
lz4_init();
dmu_init();
zil_init();
vdev_cache_stat_init();
vdev_file_init();
zfs_prop_init();
zpool_prop_init();
zpool_feature_init();
spa_config_load();
l2arc_start();
#ifndef illumos
#ifdef _KERNEL
zfs_deadman_init();
#endif
#endif /* !illumos */
}
void
spa_fini(void)
{
l2arc_stop();
spa_evict_all();
vdev_file_fini();
vdev_cache_stat_fini();
zil_fini();
dmu_fini();
lz4_fini();
zio_fini();
metaslab_alloc_trace_fini();
range_tree_fini();
unique_fini();
refcount_fini();
avl_destroy(&spa_namespace_avl);
avl_destroy(&spa_spare_avl);
avl_destroy(&spa_l2cache_avl);
cv_destroy(&spa_namespace_cv);
mutex_destroy(&spa_namespace_lock);
mutex_destroy(&spa_spare_lock);
mutex_destroy(&spa_l2cache_lock);
}
/*
* Return whether this pool has slogs. No locking needed.
* It's not a problem if the wrong answer is returned as it's only for
* performance and not correctness
*/
boolean_t
spa_has_slogs(spa_t *spa)
{
return (spa->spa_log_class->mc_rotor != NULL);
}
spa_log_state_t
spa_get_log_state(spa_t *spa)
{
return (spa->spa_log_state);
}
void
spa_set_log_state(spa_t *spa, spa_log_state_t state)
{
spa->spa_log_state = state;
}
boolean_t
spa_is_root(spa_t *spa)
{
return (spa->spa_is_root);
}
boolean_t
spa_writeable(spa_t *spa)
{
return (!!(spa->spa_mode & FWRITE));
}
/*
* Returns true if there is a pending sync task in any of the current
* syncing txg, the current quiescing txg, or the current open txg.
*/
boolean_t
spa_has_pending_synctask(spa_t *spa)
{
return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
}
int
spa_mode(spa_t *spa)
{
return (spa->spa_mode);
}
uint64_t
spa_bootfs(spa_t *spa)
{
return (spa->spa_bootfs);
}
uint64_t
spa_delegation(spa_t *spa)
{
return (spa->spa_delegation);
}
objset_t *
spa_meta_objset(spa_t *spa)
{
return (spa->spa_meta_objset);
}
enum zio_checksum
spa_dedup_checksum(spa_t *spa)
{
return (spa->spa_dedup_checksum);
}
/*
* Reset pool scan stat per scan pass (or reboot).
*/
void
spa_scan_stat_init(spa_t *spa)
{
/* data not stored on disk */
spa->spa_scan_pass_start = gethrestime_sec();
if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
else
spa->spa_scan_pass_scrub_pause = 0;
spa->spa_scan_pass_scrub_spent_paused = 0;
spa->spa_scan_pass_exam = 0;
vdev_scan_stat_init(spa->spa_root_vdev);
}
/*
* Get scan stats for zpool status reports
*/
int
spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
{
dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
return (SET_ERROR(ENOENT));
bzero(ps, sizeof (pool_scan_stat_t));
/* data stored on disk */
ps->pss_func = scn->scn_phys.scn_func;
ps->pss_start_time = scn->scn_phys.scn_start_time;
ps->pss_end_time = scn->scn_phys.scn_end_time;
ps->pss_to_examine = scn->scn_phys.scn_to_examine;
ps->pss_examined = scn->scn_phys.scn_examined;
ps->pss_to_process = scn->scn_phys.scn_to_process;
ps->pss_processed = scn->scn_phys.scn_processed;
ps->pss_errors = scn->scn_phys.scn_errors;
ps->pss_state = scn->scn_phys.scn_state;
/* data not stored on disk */
ps->pss_pass_start = spa->spa_scan_pass_start;
ps->pss_pass_exam = spa->spa_scan_pass_exam;
ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
return (0);
}
boolean_t
spa_debug_enabled(spa_t *spa)
{
return (spa->spa_debug);
}
int
spa_maxblocksize(spa_t *spa)
{
if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
return (SPA_MAXBLOCKSIZE);
else
return (SPA_OLD_MAXBLOCKSIZE);
+}
+
+/*
+ * Returns the txg that the last device removal completed. No indirect mappings
+ * have been added since this txg.
+ */
+uint64_t
+spa_get_last_removal_txg(spa_t *spa)
+{
+ uint64_t vdevid;
+ uint64_t ret = -1ULL;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ /*
+ * sr_prev_indirect_vdev is only modified while holding all the
+ * config locks, so it is sufficient to hold SCL_VDEV as reader when
+ * examining it.
+ */
+ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
+
+ while (vdevid != -1ULL) {
+ vdev_t *vd = vdev_lookup_top(spa, vdevid);
+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ /*
+ * If the removal did not remap any data, we don't care.
+ */
+ if (vdev_indirect_births_count(vib) != 0) {
+ ret = vdev_indirect_births_last_entry_txg(vib);
+ break;
+ }
+
+ vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ IMPLY(ret != -1ULL,
+ spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ return (ret);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c (revision 332525)
@@ -1,548 +1,566 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/dsl_pool.h>
#include <sys/zio.h>
#include <sys/space_map.h>
#include <sys/refcount.h>
#include <sys/zfeature.h>
SYSCTL_DECL(_vfs_zfs);
/*
* The data for a given space map can be kept on blocks of any size.
* Larger blocks entail fewer i/o operations, but they also cause the
* DMU to keep more data in-core, and also to waste more i/o bandwidth
* when only a few blocks have changed since the last transaction group.
*/
int space_map_blksz = (1 << 12);
SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_blksz, CTLFLAG_RDTUN, &space_map_blksz, 0,
"Maximum block size for space map. Must be power of 2 and greater than 4096.");
/*
- * Load the space map disk into the specified range tree. Segments of maptype
- * are added to the range tree, other segment types are removed.
- *
- * Note: space_map_load() will drop sm_lock across dmu_read() calls.
- * The caller must be OK with this.
+ * Iterate through the space map, invoking the callback on each (non-debug)
+ * space map entry.
*/
int
-space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
{
uint64_t *entry, *entry_map, *entry_map_end;
- uint64_t bufsize, size, offset, end, space;
+ uint64_t bufsize, size, offset, end;
int error = 0;
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
end = space_map_length(sm);
- space = space_map_allocated(sm);
- VERIFY0(range_tree_space(rt));
-
- if (maptype == SM_FREE) {
- range_tree_add(rt, sm->sm_start, sm->sm_size);
- space = sm->sm_size - space;
- }
-
bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
entry_map = zio_buf_alloc(bufsize);
- mutex_exit(sm->sm_lock);
if (end > bufsize) {
dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
end - bufsize, ZIO_PRIORITY_SYNC_READ);
}
- mutex_enter(sm->sm_lock);
- for (offset = 0; offset < end; offset += bufsize) {
+ for (offset = 0; offset < end && error == 0; offset += bufsize) {
size = MIN(end - offset, bufsize);
VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
VERIFY(size != 0);
ASSERT3U(sm->sm_blksz, !=, 0);
dprintf("object=%llu offset=%llx size=%llx\n",
space_map_object(sm), offset, size);
- mutex_exit(sm->sm_lock);
error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
entry_map, DMU_READ_PREFETCH);
- mutex_enter(sm->sm_lock);
if (error != 0)
break;
entry_map_end = entry_map + (size / sizeof (uint64_t));
- for (entry = entry_map; entry < entry_map_end; entry++) {
+ for (entry = entry_map; entry < entry_map_end && error == 0;
+ entry++) {
uint64_t e = *entry;
uint64_t offset, size;
- if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
+ if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
continue;
offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
sm->sm_start;
size = SM_RUN_DECODE(e) << sm->sm_shift;
VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
VERIFY3U(offset, >=, sm->sm_start);
VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
- if (SM_TYPE_DECODE(e) == maptype) {
- VERIFY3U(range_tree_space(rt) + size, <=,
- sm->sm_size);
- range_tree_add(rt, offset, size);
- } else {
- range_tree_remove(rt, offset, size);
- }
+ error = callback(SM_TYPE_DECODE(e), offset, size, arg);
}
}
- if (error == 0)
+ zio_buf_free(entry_map, bufsize);
+ return (error);
+}
+
+typedef struct space_map_load_arg {
+ space_map_t *smla_sm;
+ range_tree_t *smla_rt;
+ maptype_t smla_type;
+} space_map_load_arg_t;
+
+static int
+space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ space_map_load_arg_t *smla = arg;
+ if (type == smla->smla_type) {
+ VERIFY3U(range_tree_space(smla->smla_rt) + size, <=,
+ smla->smla_sm->sm_size);
+ range_tree_add(smla->smla_rt, offset, size);
+ } else {
+ range_tree_remove(smla->smla_rt, offset, size);
+ }
+
+ return (0);
+}
+
+/*
+ * Load the space map disk into the specified range tree. Segments of maptype
+ * are added to the range tree, other segment types are removed.
+ */
+int
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+{
+ uint64_t space;
+ int err;
+ space_map_load_arg_t smla;
+
+ VERIFY0(range_tree_space(rt));
+ space = space_map_allocated(sm);
+
+ if (maptype == SM_FREE) {
+ range_tree_add(rt, sm->sm_start, sm->sm_size);
+ space = sm->sm_size - space;
+ }
+
+ smla.smla_rt = rt;
+ smla.smla_sm = sm;
+ smla.smla_type = maptype;
+ err = space_map_iterate(sm, space_map_load_callback, &smla);
+
+ if (err == 0) {
VERIFY3U(range_tree_space(rt), ==, space);
- else
+ } else {
range_tree_vacate(rt, NULL, NULL);
+ }
- zio_buf_free(entry_map, bufsize);
- return (error);
+ return (err);
}
void
space_map_histogram_clear(space_map_t *sm)
{
if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
return;
bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
}
boolean_t
space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
{
/*
* Verify that the in-core range tree does not have any
* ranges smaller than our sm_shift size.
*/
for (int i = 0; i < sm->sm_shift; i++) {
if (rt->rt_histogram[i] != 0)
return (B_FALSE);
}
return (B_TRUE);
}
void
space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
{
int idx = 0;
- ASSERT(MUTEX_HELD(rt->rt_lock));
ASSERT(dmu_tx_is_syncing(tx));
VERIFY3U(space_map_object(sm), !=, 0);
if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
return;
dmu_buf_will_dirty(sm->sm_dbuf, tx);
ASSERT(space_map_histogram_verify(sm, rt));
/*
* Transfer the content of the range tree histogram to the space
* map histogram. The space map histogram contains 32 buckets ranging
* between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
* however, can represent ranges from 2^0 to 2^63. Since the space
* map only cares about allocatable blocks (minimum of sm_shift) we
* can safely ignore all ranges in the range tree smaller than sm_shift.
*/
for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
/*
* Since the largest histogram bucket in the space map is
* 2^(32+sm_shift-1), we need to normalize the values in
* the range tree for any bucket larger than that size. For
* example given an sm_shift of 9, ranges larger than 2^40
* would get normalized as if they were 1TB ranges. Assume
* the range tree had a count of 5 in the 2^44 (16TB) bucket,
* the calculation below would normalize this to 5 * 2^4 (16).
*/
ASSERT3U(i, >=, idx + sm->sm_shift);
sm->sm_phys->smp_histogram[idx] +=
rt->rt_histogram[i] << (i - idx - sm->sm_shift);
/*
* Increment the space map's index as long as we haven't
* reached the maximum bucket size. Accumulate all ranges
* larger than the max bucket size into the last bucket.
*/
if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
ASSERT3U(idx + sm->sm_shift, ==, i);
idx++;
ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
}
}
}
uint64_t
space_map_entries(space_map_t *sm, range_tree_t *rt)
{
avl_tree_t *t = &rt->rt_root;
range_seg_t *rs;
uint64_t size, entries;
/*
* All space_maps always have a debug entry so account for it here.
*/
entries = 1;
/*
* Traverse the range tree and calculate the number of space map
* entries that would be required to write out the range tree.
*/
for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
entries += howmany(size, SM_RUN_MAX);
}
return (entries);
}
-/*
- * Note: space_map_write() will drop sm_lock across dmu_write() calls.
- */
void
space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
dmu_tx_t *tx)
{
objset_t *os = sm->sm_os;
spa_t *spa = dmu_objset_spa(os);
avl_tree_t *t = &rt->rt_root;
range_seg_t *rs;
uint64_t size, total, rt_space, nodes;
uint64_t *entry, *entry_map, *entry_map_end;
uint64_t expected_entries, actual_entries = 1;
- ASSERT(MUTEX_HELD(rt->rt_lock));
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
VERIFY3U(space_map_object(sm), !=, 0);
dmu_buf_will_dirty(sm->sm_dbuf, tx);
/*
* This field is no longer necessary since the in-core space map
* now contains the object number but is maintained for backwards
* compatibility.
*/
sm->sm_phys->smp_object = sm->sm_object;
if (range_tree_space(rt) == 0) {
VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
return;
}
if (maptype == SM_ALLOC)
sm->sm_phys->smp_alloc += range_tree_space(rt);
else
sm->sm_phys->smp_alloc -= range_tree_space(rt);
expected_entries = space_map_entries(sm, rt);
entry_map = zio_buf_alloc(sm->sm_blksz);
entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
entry = entry_map;
*entry++ = SM_DEBUG_ENCODE(1) |
SM_DEBUG_ACTION_ENCODE(maptype) |
SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
total = 0;
nodes = avl_numnodes(&rt->rt_root);
rt_space = range_tree_space(rt);
for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
uint64_t start;
size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
total += size << sm->sm_shift;
while (size != 0) {
uint64_t run_len;
run_len = MIN(size, SM_RUN_MAX);
if (entry == entry_map_end) {
- mutex_exit(rt->rt_lock);
dmu_write(os, space_map_object(sm),
sm->sm_phys->smp_objsize, sm->sm_blksz,
entry_map, tx);
- mutex_enter(rt->rt_lock);
sm->sm_phys->smp_objsize += sm->sm_blksz;
entry = entry_map;
}
*entry++ = SM_OFFSET_ENCODE(start) |
SM_TYPE_ENCODE(maptype) |
SM_RUN_ENCODE(run_len);
start += run_len;
size -= run_len;
actual_entries++;
}
}
if (entry != entry_map) {
size = (entry - entry_map) * sizeof (uint64_t);
- mutex_exit(rt->rt_lock);
dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize,
size, entry_map, tx);
- mutex_enter(rt->rt_lock);
sm->sm_phys->smp_objsize += size;
}
ASSERT3U(expected_entries, ==, actual_entries);
/*
* Ensure that the space_map's accounting wasn't changed
* while we were in the middle of writing it out.
*/
VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
VERIFY3U(range_tree_space(rt), ==, rt_space);
VERIFY3U(range_tree_space(rt), ==, total);
zio_buf_free(entry_map, sm->sm_blksz);
}
static int
space_map_open_impl(space_map_t *sm)
{
int error;
u_longlong_t blocks;
error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
if (error)
return (error);
dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
sm->sm_phys = sm->sm_dbuf->db_data;
return (0);
}
int
space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
- uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp)
+ uint64_t start, uint64_t size, uint8_t shift)
{
space_map_t *sm;
int error;
ASSERT(*smp == NULL);
ASSERT(os != NULL);
ASSERT(object != 0);
sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
sm->sm_start = start;
sm->sm_size = size;
sm->sm_shift = shift;
- sm->sm_lock = lp;
sm->sm_os = os;
sm->sm_object = object;
error = space_map_open_impl(sm);
if (error != 0) {
space_map_close(sm);
return (error);
}
*smp = sm;
return (0);
}
void
space_map_close(space_map_t *sm)
{
if (sm == NULL)
return;
if (sm->sm_dbuf != NULL)
dmu_buf_rele(sm->sm_dbuf, sm);
sm->sm_dbuf = NULL;
sm->sm_phys = NULL;
kmem_free(sm, sizeof (*sm));
}
void
space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
{
objset_t *os = sm->sm_os;
spa_t *spa = dmu_objset_spa(os);
dmu_object_info_t doi;
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
ASSERT(dmu_tx_is_syncing(tx));
VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));
dmu_object_info_from_db(sm->sm_dbuf, &doi);
/*
* If the space map has the wrong bonus size (because
* SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
* the wrong block size (because space_map_blksz has changed),
* free and re-allocate its object with the updated sizes.
*
* Otherwise, just truncate the current object.
*/
if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
doi.doi_data_block_size != space_map_blksz) {
zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
"object[%llu]: old bonus %u, old blocksz %u",
dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
doi.doi_bonus_size, doi.doi_data_block_size);
space_map_free(sm, tx);
dmu_buf_rele(sm->sm_dbuf, sm);
sm->sm_object = space_map_alloc(sm->sm_os, tx);
VERIFY0(space_map_open_impl(sm));
} else {
VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
/*
* If the spacemap is reallocated, its histogram
* will be reset. Do the same in the common case so that
* bugs related to the uncommon case do not go unnoticed.
*/
bzero(sm->sm_phys->smp_histogram,
sizeof (sm->sm_phys->smp_histogram));
}
dmu_buf_will_dirty(sm->sm_dbuf, tx);
sm->sm_phys->smp_objsize = 0;
sm->sm_phys->smp_alloc = 0;
}
/*
* Update the in-core space_map allocation and length values.
*/
void
space_map_update(space_map_t *sm)
{
if (sm == NULL)
return;
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
sm->sm_alloc = sm->sm_phys->smp_alloc;
sm->sm_length = sm->sm_phys->smp_objsize;
}
uint64_t
space_map_alloc(objset_t *os, dmu_tx_t *tx)
{
spa_t *spa = dmu_objset_spa(os);
uint64_t object;
int bonuslen;
if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
bonuslen = sizeof (space_map_phys_t);
ASSERT3U(bonuslen, <=, dmu_bonus_max());
} else {
bonuslen = SPACE_MAP_SIZE_V0;
}
object = dmu_object_alloc(os,
DMU_OT_SPACE_MAP, space_map_blksz,
DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
return (object);
}
void
-space_map_free(space_map_t *sm, dmu_tx_t *tx)
+space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
{
- spa_t *spa;
-
- if (sm == NULL)
- return;
-
- spa = dmu_objset_spa(sm->sm_os);
+ spa_t *spa = dmu_objset_spa(os);
if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
dmu_object_info_t doi;
- dmu_object_info_from_db(sm->sm_dbuf, &doi);
+ VERIFY0(dmu_object_info(os, smobj, &doi));
if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
- VERIFY(spa_feature_is_active(spa,
- SPA_FEATURE_SPACEMAP_HISTOGRAM));
spa_feature_decr(spa,
SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
}
}
- VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0);
+ VERIFY0(dmu_object_free(os, smobj, tx));
+}
+
+void
+space_map_free(space_map_t *sm, dmu_tx_t *tx)
+{
+ if (sm == NULL)
+ return;
+
+ space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
sm->sm_object = 0;
}
uint64_t
space_map_object(space_map_t *sm)
{
return (sm != NULL ? sm->sm_object : 0);
}
/*
* Returns the already synced, on-disk allocated space.
*/
uint64_t
space_map_allocated(space_map_t *sm)
{
return (sm != NULL ? sm->sm_alloc : 0);
}
/*
* Returns the already synced, on-disk length;
*/
uint64_t
space_map_length(space_map_t *sm)
{
return (sm != NULL ? sm->sm_length : 0);
}
/*
* Returns the allocated space that is currently syncing.
*/
int64_t
space_map_alloc_delta(space_map_t *sm)
{
if (sm == NULL)
return (0);
ASSERT(sm->sm_dbuf != NULL);
return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c (revision 332525)
@@ -1,159 +1,155 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/range_tree.h>
#include <sys/space_reftree.h>
/*
* Space reference trees.
*
* A range tree is a collection of integers. Every integer is either
* in the tree, or it's not. A space reference tree generalizes
* the idea: it allows its members to have arbitrary reference counts,
* as opposed to the implicit reference count of 0 or 1 in a range tree.
* This representation comes in handy when computing the union or
* intersection of multiple space maps. For example, the union of
* N range trees is the subset of the reference tree with refcnt >= 1.
* The intersection of N range trees is the subset with refcnt >= N.
*
* [It's very much like a Fourier transform. Unions and intersections
* are hard to perform in the 'range tree domain', so we convert the trees
* into the 'reference count domain', where it's trivial, then invert.]
*
* vdev_dtl_reassess() uses computations of this form to determine
* DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
* has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
* has an outage wherever refcnt >= vdev_children.
*/
static int
space_reftree_compare(const void *x1, const void *x2)
{
const space_ref_t *sr1 = x1;
const space_ref_t *sr2 = x2;
if (sr1->sr_offset < sr2->sr_offset)
return (-1);
if (sr1->sr_offset > sr2->sr_offset)
return (1);
if (sr1 < sr2)
return (-1);
if (sr1 > sr2)
return (1);
return (0);
}
void
space_reftree_create(avl_tree_t *t)
{
avl_create(t, space_reftree_compare,
sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
}
void
space_reftree_destroy(avl_tree_t *t)
{
space_ref_t *sr;
void *cookie = NULL;
while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
kmem_free(sr, sizeof (*sr));
avl_destroy(t);
}
static void
space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
{
space_ref_t *sr;
sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
sr->sr_offset = offset;
sr->sr_refcnt = refcnt;
avl_add(t, sr);
}
void
space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
int64_t refcnt)
{
space_reftree_add_node(t, start, refcnt);
space_reftree_add_node(t, end, -refcnt);
}
/*
* Convert (or add) a range tree into a reference tree.
*/
void
space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
{
range_seg_t *rs;
- ASSERT(MUTEX_HELD(rt->rt_lock));
-
for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt);
}
/*
* Convert a reference tree into a range tree. The range tree will contain
* all members of the reference tree for which refcnt >= minref.
*/
void
space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref)
{
uint64_t start = -1ULL;
int64_t refcnt = 0;
space_ref_t *sr;
-
- ASSERT(MUTEX_HELD(rt->rt_lock));
range_tree_vacate(rt, NULL, NULL);
for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
refcnt += sr->sr_refcnt;
if (refcnt >= minref) {
if (start == -1ULL) {
start = sr->sr_offset;
}
} else {
if (start != -1ULL) {
uint64_t end = sr->sr_offset;
ASSERT(start <= end);
if (end > start)
range_tree_add(rt, start, end - start);
start = -1ULL;
}
}
}
ASSERT(refcnt == 0);
ASSERT(start == -1ULL);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h (revision 332525)
@@ -1,93 +1,95 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPOBJ_H
#define _SYS_BPOBJ_H
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct bpobj_phys {
/*
* This is the bonus buffer for the dead lists. The object's
* contents is an array of bpo_entries blkptr_t's, representing
* a total of bpo_bytes physical space.
*/
uint64_t bpo_num_blkptrs;
uint64_t bpo_bytes;
uint64_t bpo_comp;
uint64_t bpo_uncomp;
uint64_t bpo_subobjs;
uint64_t bpo_num_subobjs;
} bpobj_phys_t;
#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
typedef struct bpobj {
kmutex_t bpo_lock;
objset_t *bpo_os;
uint64_t bpo_object;
int bpo_epb;
uint8_t bpo_havecomp;
uint8_t bpo_havesubobj;
bpobj_phys_t *bpo_phys;
dmu_buf_t *bpo_dbuf;
dmu_buf_t *bpo_cached_dbuf;
} bpobj_t;
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);
int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
void bpobj_close(bpobj_t *bpo);
+boolean_t bpobj_is_open(const bpobj_t *bpo);
int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
int bpobj_space(bpobj_t *bpo,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+boolean_t bpobj_is_empty(bpobj_t *bpo);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_BPOBJ_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h (revision 332525)
@@ -1,403 +1,405 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DBUF_H
#define _SYS_DBUF_H
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/zrlock.h>
#include <sys/multilist.h>
#ifdef __cplusplus
extern "C" {
#endif
#define IN_DMU_SYNC 2
/*
* define flags for dbuf_read
*/
#define DB_RF_MUST_SUCCEED (1 << 0)
#define DB_RF_CANFAIL (1 << 1)
#define DB_RF_HAVESTRUCT (1 << 2)
#define DB_RF_NOPREFETCH (1 << 3)
#define DB_RF_NEVERWAIT (1 << 4)
#define DB_RF_CACHED (1 << 5)
/*
* The simplified state transition diagram for dbufs looks like:
*
* +----> READ ----+
* | |
* | V
* (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
* | ^ ^
* | | |
* +----> FILL ----+ |
* | |
* | |
* +--------> NOFILL -------+
*
* DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
* to find all dbufs in a range of a dnode and must be less than any other
* dbuf_states_t (see comment on dn_dbufs in dnode.h).
*/
typedef enum dbuf_states {
DB_SEARCH = -1,
DB_UNCACHED,
DB_FILL,
DB_NOFILL,
DB_READ,
DB_CACHED,
DB_EVICTING
} dbuf_states_t;
struct dnode;
struct dmu_tx;
/*
* level = 0 means the user data
* level = 1 means the single indirect block
* etc.
*/
struct dmu_buf_impl;
typedef enum override_states {
DR_NOT_OVERRIDDEN,
DR_IN_DMU_SYNC,
DR_OVERRIDDEN
} override_states_t;
typedef struct dbuf_dirty_record {
/* link on our parents dirty list */
list_node_t dr_dirty_node;
/* transaction group this data will sync in */
uint64_t dr_txg;
/* zio of outstanding write IO */
zio_t *dr_zio;
/* pointer back to our dbuf */
struct dmu_buf_impl *dr_dbuf;
/* pointer to next dirty record */
struct dbuf_dirty_record *dr_next;
/* pointer to parent dirty record */
struct dbuf_dirty_record *dr_parent;
/* How much space was changed to dsl_pool_dirty_space() for this? */
unsigned int dr_accounted;
/* A copy of the bp that points to us */
blkptr_t dr_bp_copy;
union dirty_types {
struct dirty_indirect {
/* protect access to list */
kmutex_t dr_mtx;
/* Our list of dirty children */
list_t dr_children;
} di;
struct dirty_leaf {
/*
* dr_data is set when we dirty the buffer
* so that we can retain the pointer even if it
* gets COW'd in a subsequent transaction group.
*/
arc_buf_t *dr_data;
blkptr_t dr_overridden_by;
override_states_t dr_override_state;
uint8_t dr_copies;
boolean_t dr_nopwrite;
} dl;
} dt;
} dbuf_dirty_record_t;
typedef struct dmu_buf_impl {
/*
* The following members are immutable, with the exception of
* db.db_data, which is protected by db_mtx.
*/
/* the publicly visible structure */
dmu_buf_t db;
/* the objset we belong to */
struct objset *db_objset;
/*
* handle to safely access the dnode we belong to (NULL when evicted)
*/
struct dnode_handle *db_dnode_handle;
/*
* our parent buffer; if the dnode points to us directly,
* db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
* only accessed by sync thread ???
* (NULL when evicted)
* May change from NULL to non-NULL under the protection of db_mtx
* (see dbuf_check_blkptr())
*/
struct dmu_buf_impl *db_parent;
/*
* link for hash table of all dmu_buf_impl_t's
*/
struct dmu_buf_impl *db_hash_next;
/* our block number */
uint64_t db_blkid;
/*
* Pointer to the blkptr_t which points to us. May be NULL if we
* don't have one yet. (NULL when evicted)
*/
blkptr_t *db_blkptr;
/*
* Our indirection level. Data buffers have db_level==0.
* Indirect buffers which point to data buffers have
* db_level==1. etc. Buffers which contain dnodes have
* db_level==0, since the dnodes are stored in a file.
*/
uint8_t db_level;
/* db_mtx protects the members below */
kmutex_t db_mtx;
/*
* Current state of the buffer
*/
dbuf_states_t db_state;
/*
* Refcount accessed by dmu_buf_{hold,rele}.
* If nonzero, the buffer can't be destroyed.
* Protected by db_mtx.
*/
refcount_t db_holds;
/* buffer holding our data */
arc_buf_t *db_buf;
kcondvar_t db_changed;
dbuf_dirty_record_t *db_data_pending;
/* pointer to most recent dirty record for this buffer */
dbuf_dirty_record_t *db_last_dirty;
/*
* Our link on the owner dnodes's dn_dbufs list.
* Protected by its dn_dbufs_mtx.
*/
avl_node_t db_link;
/*
* Link in dbuf_cache.
*/
multilist_node_t db_cache_link;
/* Data which is unique to data (leaf) blocks: */
/* User callback information. */
dmu_buf_user_t *db_user;
/*
* Evict user data as soon as the dirty and reference
* counts are equal.
*/
uint8_t db_user_immediate_evict;
/*
* This block was freed while a read or write was
* active.
*/
uint8_t db_freed_in_flight;
/*
* dnode_evict_dbufs() or dnode_evict_bonus() tried to
* evict this dbuf, but couldn't due to outstanding
* references. Evict once the refcount drops to 0.
*/
uint8_t db_pending_evict;
uint8_t db_dirtycnt;
} dmu_buf_impl_t;
/* Note: the dbuf hash table is exposed only for the mdb module */
#define DBUF_MUTEXES 256
#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
typedef struct dbuf_hash_table {
uint64_t hash_table_mask;
dmu_buf_impl_t **hash_table;
kmutex_t hash_mutexes[DBUF_MUTEXES];
} dbuf_hash_table_t;
uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
void dbuf_create_bonus(struct dnode *dn);
int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
void *tag);
int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp);
void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
zio_priority_t prio, arc_flags_t aflags);
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
uint64_t blkid, void *tag);
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
void dbuf_rele(dmu_buf_impl_t *db, void *tag);
void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
uint64_t blkid);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
void dbuf_destroy(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
void dbuf_release_bp(dmu_buf_impl_t *db);
+boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
+
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
struct dmu_tx *);
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
void dbuf_init(void);
void dbuf_fini(void);
boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
#define DBUF_GET_BUFC_TYPE(_db) \
(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
#define DBUF_IS_CACHEABLE(_db) \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
#define DBUF_IS_L2CACHEABLE(_db) \
((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \
((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \
(((_level) > 0 || \
DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \
((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#ifdef ZFS_DEBUG
/*
* There should be a ## between the string literal and fmt, to make it
* clear that we're joining two strings together, but gcc does not
* support that preprocessor token.
*/
#define dprintf_dbuf(dbuf, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char __db_buf[32]; \
uint64_t __db_obj = (dbuf)->db.db_object; \
if (__db_obj == DMU_META_DNODE_OBJECT) \
(void) strcpy(__db_buf, "mdn"); \
else \
(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
(u_longlong_t)__db_obj); \
dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
"obj=%s lvl=%u blkid=%lld " fmt, \
__db_buf, (dbuf)->db_level, \
(u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
} \
_NOTE(CONSTCOND) } while (0)
#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \
_NOTE(CONSTCOND) } while (0)
#define DBUF_VERIFY(db) dbuf_verify(db)
#else
#define dprintf_dbuf(db, fmt, ...)
#define dprintf_dbuf_bp(db, bp, fmt, ...)
#define DBUF_VERIFY(db)
#endif
#ifdef __cplusplus
}
#endif
#endif /* _SYS_DBUF_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 332525)
@@ -1,977 +1,984 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2013 DEY Storage Systems, Inc.
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
#ifndef _SYS_DMU_H
#define _SYS_DMU_H
/*
* This file describes the interface that the DMU provides for its
* consumers.
*
* The DMU also interacts with the SPA. That interface is described in
* dmu_spa.h.
*/
#include <sys/zfs_context.h>
#include <sys/cred.h>
#include <sys/fs/zfs.h>
#include <sys/zio_compress.h>
#include <sys/zio_priority.h>
#ifdef __cplusplus
extern "C" {
#endif
struct uio;
struct xuio;
struct page;
struct vnode;
struct spa;
struct zilog;
struct zio;
struct blkptr;
struct zap_cursor;
struct dsl_dataset;
struct dsl_pool;
struct dnode;
struct drr_begin;
struct drr_end;
struct zbookmark_phys;
struct spa;
struct nvlist;
struct arc_buf;
struct zio_prop;
struct sa_handle;
struct file;
typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
typedef struct dsl_dir dsl_dir_t;
typedef struct dnode dnode_t;
typedef enum dmu_object_byteswap {
DMU_BSWAP_UINT8,
DMU_BSWAP_UINT16,
DMU_BSWAP_UINT32,
DMU_BSWAP_UINT64,
DMU_BSWAP_ZAP,
DMU_BSWAP_DNODE,
DMU_BSWAP_OBJSET,
DMU_BSWAP_ZNODE,
DMU_BSWAP_OLDACL,
DMU_BSWAP_ACL,
/*
* Allocating a new byteswap type number makes the on-disk format
* incompatible with any other format that uses the same number.
*
* Data can usually be structured to work with one of the
* DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
*/
DMU_BSWAP_NUMFUNCS
} dmu_object_byteswap_t;
#define DMU_OT_NEWTYPE 0x80
#define DMU_OT_METADATA 0x40
#define DMU_OT_BYTESWAP_MASK 0x3f
/*
* Defines a uint8_t object type. Object types specify if the data
* in the object is metadata (boolean) and how to byteswap the data
* (dmu_object_byteswap_t).
*/
#define DMU_OT(byteswap, metadata) \
(DMU_OT_NEWTYPE | \
((metadata) ? DMU_OT_METADATA : 0) | \
((byteswap) & DMU_OT_BYTESWAP_MASK))
#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
(ot) < DMU_OT_NUMTYPES)
#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
((ot) & DMU_OT_METADATA) : \
dmu_ot[(ot)].ot_metadata)
/*
* These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
* have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
* is repurposed for embedded BPs.
*/
#define DMU_OT_HAS_FILL(ot) \
((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
((ot) & DMU_OT_BYTESWAP_MASK) : \
dmu_ot[(ot)].ot_byteswap)
typedef enum dmu_object_type {
DMU_OT_NONE,
/* general: */
DMU_OT_OBJECT_DIRECTORY, /* ZAP */
DMU_OT_OBJECT_ARRAY, /* UINT64 */
DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
DMU_OT_BPOBJ, /* UINT64 */
DMU_OT_BPOBJ_HDR, /* UINT64 */
/* spa: */
DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
DMU_OT_SPACE_MAP, /* UINT64 */
/* zil: */
DMU_OT_INTENT_LOG, /* UINT64 */
/* dmu: */
DMU_OT_DNODE, /* DNODE */
DMU_OT_OBJSET, /* OBJSET */
/* dsl: */
DMU_OT_DSL_DIR, /* UINT64 */
DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
DMU_OT_DSL_PROPS, /* ZAP */
DMU_OT_DSL_DATASET, /* UINT64 */
/* zpl: */
DMU_OT_ZNODE, /* ZNODE */
DMU_OT_OLDACL, /* Old ACL */
DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
DMU_OT_MASTER_NODE, /* ZAP */
DMU_OT_UNLINKED_SET, /* ZAP */
/* zvol: */
DMU_OT_ZVOL, /* UINT8 */
DMU_OT_ZVOL_PROP, /* ZAP */
/* other; for testing only! */
DMU_OT_PLAIN_OTHER, /* UINT8 */
DMU_OT_UINT64_OTHER, /* UINT64 */
DMU_OT_ZAP_OTHER, /* ZAP */
/* new object types: */
DMU_OT_ERROR_LOG, /* ZAP */
DMU_OT_SPA_HISTORY, /* UINT8 */
DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
DMU_OT_POOL_PROPS, /* ZAP */
DMU_OT_DSL_PERMS, /* ZAP */
DMU_OT_ACL, /* ACL */
DMU_OT_SYSACL, /* SYSACL */
DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
DMU_OT_NEXT_CLONES, /* ZAP */
DMU_OT_SCAN_QUEUE, /* ZAP */
DMU_OT_USERGROUP_USED, /* ZAP */
DMU_OT_USERGROUP_QUOTA, /* ZAP */
DMU_OT_USERREFS, /* ZAP */
DMU_OT_DDT_ZAP, /* ZAP */
DMU_OT_DDT_STATS, /* ZAP */
DMU_OT_SA, /* System attr */
DMU_OT_SA_MASTER_NODE, /* ZAP */
DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
DMU_OT_SCAN_XLATE, /* ZAP */
DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
DMU_OT_DEADLIST, /* ZAP */
DMU_OT_DEADLIST_HDR, /* UINT64 */
DMU_OT_DSL_CLONES, /* ZAP */
DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
/*
* Do not allocate new object types here. Doing so makes the on-disk
* format incompatible with any other format that uses the same object
* type number.
*
* When creating an object which does not have one of the above types
* use the DMU_OTN_* type with the correct byteswap and metadata
* values.
*
* The DMU_OTN_* types do not have entries in the dmu_ot table,
* use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead
* of indexing into dmu_ot directly (this works for both DMU_OT_* types
* and DMU_OTN_* types).
*/
DMU_OT_NUMTYPES,
/*
* Names for valid types declared with DMU_OT().
*/
DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
} dmu_object_type_t;
/*
* These flags are intended to be used to specify the "txg_how"
* parameter when calling the dmu_tx_assign() function. See the comment
* above dmu_tx_assign() for more details on the meaning of these flags.
*/
#define TXG_NOWAIT (0ULL)
#define TXG_WAIT (1ULL<<0)
#define TXG_NOTHROTTLE (1ULL<<1)
void byteswap_uint64_array(void *buf, size_t size);
void byteswap_uint32_array(void *buf, size_t size);
void byteswap_uint16_array(void *buf, size_t size);
void byteswap_uint8_array(void *buf, size_t size);
void zap_byteswap(void *buf, size_t size);
void zfs_oldacl_byteswap(void *buf, size_t size);
void zfs_acl_byteswap(void *buf, size_t size);
void zfs_znode_byteswap(void *buf, size_t size);
#define DS_FIND_SNAPSHOTS (1<<0)
#define DS_FIND_CHILDREN (1<<1)
#define DS_FIND_SERIALIZE (1<<2)
/*
* The maximum number of bytes that can be accessed as part of one
* operation, including metadata.
*/
#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
#define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL)
/*
* artificial blkids for bonus buffer and spill blocks
*/
#define DMU_BONUS_BLKID (-1ULL)
#define DMU_SPILL_BLKID (-2ULL)
/*
* Public routines to create, destroy, open, and close objsets.
*/
int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
int dmu_objset_own(const char *name, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp);
void dmu_objset_rele(objset_t *os, void *tag);
void dmu_objset_disown(objset_t *os, void *tag);
int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
void dmu_objset_evict_dbufs(objset_t *os);
int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
struct nvlist *snaps);
int dmu_objset_clone(const char *name, const char *origin);
int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
struct nvlist *errlist);
int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
int dmu_objset_snapshot_tmp(const char *, const char *, int);
int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
int flags);
void dmu_objset_byteswap(void *buf, size_t size);
int dsl_dataset_rename_snapshot(const char *fsname,
const char *oldsnapname, const char *newsnapname, boolean_t recursive);
+int dmu_objset_remap_indirects(const char *fsname);
typedef struct dmu_buf {
uint64_t db_object; /* object that this buffer is part of */
uint64_t db_offset; /* byte offset in this object */
uint64_t db_size; /* size of buffer in bytes */
void *db_data; /* data in buffer */
} dmu_buf_t;
/*
* The names of zap entries in the DIRECTORY_OBJECT of the MOS.
*/
#define DMU_POOL_DIRECTORY_OBJECT 1
#define DMU_POOL_CONFIG "config"
#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg"
#define DMU_POOL_ROOT_DATASET "root_dataset"
#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
#define DMU_POOL_ERRLOG_LAST "errlog_last"
#define DMU_POOL_SPARES "spares"
#define DMU_POOL_DEFLATE "deflate"
#define DMU_POOL_HISTORY "history"
#define DMU_POOL_PROPS "pool_props"
#define DMU_POOL_L2CACHE "l2cache"
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
#define DMU_POOL_DDT "DDT-%s-%s-%s"
#define DMU_POOL_DDT_STATS "DDT-statistics"
#define DMU_POOL_CREATION_VERSION "creation_version"
#define DMU_POOL_SCAN "scan"
#define DMU_POOL_FREE_BPOBJ "free_bpobj"
#define DMU_POOL_BPTREE_OBJ "bptree_obj"
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map"
+#define DMU_POOL_REMOVING "com.delphix:removing"
+#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
+#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
/*
* Allocate an object from this objset. The range of object numbers
* available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
*
* The transaction must be assigned to a txg. The newly allocated
* object will be "held" in the transaction (ie. you can modify the
* newly allocated object in this transaction).
*
* dmu_object_alloc() chooses an object and returns it in *objectp.
*
* dmu_object_claim() allocates a specific object number. If that
* number is already allocated, it fails and returns EEXIST.
*
* Return 0 on success, or ENOSPC or EEXIST as specified above.
*/
uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
/*
* Free an object from this objset.
*
* The object's data will be freed as well (ie. you don't need to call
* dmu_free(object, 0, -1, tx)).
*
* The object need not be held in the transaction.
*
* If there are any holds on this object's buffers (via dmu_buf_hold()),
* or tx holds on the object (via dmu_tx_hold_object()), you can not
* free it; it fails and returns EBUSY.
*
* If the object is not allocated, it fails and returns ENOENT.
*
* Return 0 on success, or EBUSY or ENOENT as specified above.
*/
int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
/*
* Find the next allocated or free object.
*
* The objectp parameter is in-out. It will be updated to be the next
* object which is allocated. Ignore objects which have not been
* modified since txg.
*
* XXX Can only be called on a objset with no dirty data.
*
* Returns 0 on success, or ENOENT if there are no more objects.
*/
int dmu_object_next(objset_t *os, uint64_t *objectp,
boolean_t hole, uint64_t txg);
/*
* Set the data blocksize for an object.
*
* The object cannot have any blocks allcated beyond the first. If
* the first block is allocated already, the new size must be greater
* than the current block size. If these conditions are not met,
* ENOTSUP will be returned.
*
* Returns 0 on success, or EBUSY if there are any holds on the object
* contents, or ENOTSUP as described above.
*/
int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
int ibs, dmu_tx_t *tx);
/*
* Set the checksum property on a dnode. The new checksum algorithm will
* apply to all newly written blocks; existing blocks will not be affected.
*/
void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
dmu_tx_t *tx);
/*
* Set the compress property on a dnode. The new compression algorithm will
* apply to all newly written blocks; existing blocks will not be affected.
*/
void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx);
+int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg);
+
void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
int compressed_size, int byteorder, dmu_tx_t *tx);
/*
* Decide how to write a block: checksum, compression, number of copies, etc.
*/
#define WP_NOFILL 0x1
#define WP_DMU_SYNC 0x2
#define WP_SPILL 0x4
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
struct zio_prop *zp);
/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
* dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
- * data. As with any normal buffer, you must call dmu_buf_read() to
- * read db_data, dmu_buf_will_dirty() before modifying it, and the
+ * data. As with any normal buffer, you must call dmu_buf_will_dirty()
+ * before modifying it, and the
* object must be held in an assigned transaction before calling
* dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
* buffer as well. You must release your hold with dmu_buf_rele().
*
* Returns ENOENT, EIO, or 0.
*/
int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
/*
* Special spill buffer support used by "SA" framework
*/
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
/*
* Obtain the DMU buffer from the specified object which contains the
* specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
* that it will remain in memory. You must release the hold with
* dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your
* hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
*
* You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
* on the returned buffer before reading or writing the buffer's
* db_data. The comments for those routines describe what particular
* operations are valid after calling them.
*
* The object number must be a valid, allocated object number.
*/
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **, int flags);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags);
/*
* Add a reference to a dmu buffer that has already been held via
* dmu_buf_hold() in the current context.
*/
void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
/*
* Attempt to add a reference to a dmu buffer that is in an unknown state,
* using a pointer that may have been invalidated by eviction processing.
* The request will succeed if the passed in dbuf still represents the
* same os/object/blkid, is ineligible for eviction, and has at least
* one hold by a user other than the syncer.
*/
boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
uint64_t blkid, void *tag);
void dmu_buf_rele(dmu_buf_t *db, void *tag);
uint64_t dmu_buf_refcount(dmu_buf_t *db);
/*
* dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
* range of an object. A pointer to an array of dmu_buf_t*'s is
* returned (in *dbpp).
*
* dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
* frees the array. The hold on the array of buffers MUST be released
* with dmu_buf_rele_array. You can NOT release the hold on each buffer
* individually with dmu_buf_rele.
*/
int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
uint64_t length, boolean_t read, void *tag,
int *numbufsp, dmu_buf_t ***dbpp);
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
typedef void dmu_buf_evict_func_t(void *user_ptr);
/*
* A DMU buffer user object may be associated with a dbuf for the
* duration of its lifetime. This allows the user of a dbuf (client)
* to attach private data to a dbuf (e.g. in-core only data such as a
* dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
* when that dbuf has been evicted. Clients typically respond to the
* eviction notification by freeing their private data, thus ensuring
* the same lifetime for both dbuf and private data.
*
* The mapping from a dmu_buf_user_t to any client private data is the
* client's responsibility. All current consumers of the API with private
* data embed a dmu_buf_user_t as the first member of the structure for
* their private data. This allows conversions between the two types
* with a simple cast. Since the DMU buf user API never needs access
* to the private data, other strategies can be employed if necessary
* or convenient for the client (e.g. using container_of() to do the
* conversion for private data that cannot have the dmu_buf_user_t as
* its first member).
*
* Eviction callbacks are executed without the dbuf mutex held or any
* other type of mechanism to guarantee that the dbuf is still available.
* For this reason, users must assume the dbuf has already been freed
* and not reference the dbuf from the callback context.
*
* Users requesting "immediate eviction" are notified as soon as the dbuf
* is only referenced by dirty records (dirties == holds). Otherwise the
* notification occurs after eviction processing for the dbuf begins.
*/
typedef struct dmu_buf_user {
/*
* Asynchronous user eviction callback state.
*/
taskq_ent_t dbu_tqent;
/*
* This instance's eviction function pointers.
*
* dbu_evict_func_sync is called synchronously and then
* dbu_evict_func_async is executed asynchronously on a taskq.
*/
dmu_buf_evict_func_t *dbu_evict_func_sync;
dmu_buf_evict_func_t *dbu_evict_func_async;
#ifdef ZFS_DEBUG
/*
* Pointer to user's dbuf pointer. NULL for clients that do
* not associate a dbuf with their user data.
*
* The dbuf pointer is cleared upon eviction so as to catch
* use-after-evict bugs in clients.
*/
dmu_buf_t **dbu_clear_on_evict_dbufp;
#endif
} dmu_buf_user_t;
/*
* Initialize the given dmu_buf_user_t instance with the eviction function
* evict_func, to be called when the user is evicted.
*
* NOTE: This function should only be called once on a given dmu_buf_user_t.
* To allow enforcement of this, dbu must already be zeroed on entry.
*/
/*ARGSUSED*/
inline void
dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp)
{
ASSERT(dbu->dbu_evict_func_sync == NULL);
ASSERT(dbu->dbu_evict_func_async == NULL);
/* must have at least one evict func */
IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
dbu->dbu_evict_func_sync = evict_func_sync;
dbu->dbu_evict_func_async = evict_func_async;
#ifdef ZFS_DEBUG
dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
#endif
}
/*
* Attach user data to a dbuf and mark it for normal (when the dbuf's
* data is cleared or its reference count goes to zero) eviction processing.
*
* Returns NULL on success, or the existing user if another user currently
* owns the buffer.
*/
void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
/*
* Attach user data to a dbuf and mark it for immediate (its dirty and
* reference counts are equal) eviction processing.
*
* Returns NULL on success, or the existing user if another user currently
* owns the buffer.
*/
void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
/*
* Replace the current user of a dbuf.
*
* If given the current user of a dbuf, replaces the dbuf's user with
* "new_user" and returns the user data pointer that was replaced.
* Otherwise returns the current, and unmodified, dbuf user pointer.
*/
void *dmu_buf_replace_user(dmu_buf_t *db,
dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
/*
* Remove the specified user data for a DMU buffer.
*
* Returns the user that was removed on success, or the current user if
* another user currently owns the buffer.
*/
void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
/*
* Returns the user data (dmu_buf_user_t *) associated with this dbuf.
*/
void *dmu_buf_get_user(dmu_buf_t *db);
objset_t *dmu_buf_get_objset(dmu_buf_t *db);
dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
void dmu_buf_dnode_exit(dmu_buf_t *db);
/* Block until any in-progress dmu buf user evictions complete. */
void dmu_buf_user_evict_wait(void);
/*
* Returns the blkptr associated with this dbuf, or NULL if not set.
*/
struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
/*
* Indicate that you are going to modify the buffer's data (db_data).
*
* The transaction (tx) must be assigned to a txg (ie. you've called
* dmu_tx_assign()). The buffer's object must be held in the tx
* (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
*/
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
/*
* You must create a transaction, then hold the objects which you will
* (or might) modify as part of this transaction. Then you must assign
* the transaction to a transaction group. Once the transaction has
* been assigned, you can modify buffers which belong to held objects as
* part of this transaction. You can't modify buffers before the
* transaction has been assigned; you can't modify buffers which don't
* belong to objects which this transaction holds; you can't hold
* objects once the transaction has been assigned. You may hold an
* object which you are going to free (with dmu_object_free()), but you
* don't have to.
*
* You can abort the transaction before it has been assigned.
*
* Note that you may hold buffers (with dmu_buf_hold) at any time,
* regardless of transaction state.
*/
#define DMU_NEW_OBJECT (-1ULL)
#define DMU_OBJECT_END (-1ULL)
dmu_tx_t *dmu_tx_create(objset_t *os);
void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
int len);
void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
uint64_t len);
void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
uint64_t len);
+void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object);
void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
const char *name);
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_wait(dmu_tx_t *tx);
void dmu_tx_commit(dmu_tx_t *tx);
void dmu_tx_mark_netfree(dmu_tx_t *tx);
/*
* To register a commit callback, dmu_tx_callback_register() must be called.
*
* dcb_data is a pointer to caller private data that is passed on as a
* callback parameter. The caller is responsible for properly allocating and
* freeing it.
*
* When registering a callback, the transaction must be already created, but
* it cannot be committed or aborted. It can be assigned to a txg or not.
*
* The callback will be called after the transaction has been safely written
* to stable storage and will also be called if the dmu_tx is aborted.
* If there is any error which prevents the transaction from being committed to
* disk, the callback will be called with a value of error != 0.
*/
typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
void *dcb_data);
/*
* Free up the data blocks for a defined range of a file. If size is
* -1, the range from offset to end-of-file is freed.
*/
int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx);
int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size);
int dmu_free_long_object(objset_t *os, uint64_t object);
/*
* Convenience functions.
*
* Canfail routines will return 0 on success, or an errno if there is a
* nonrecoverable I/O error.
*/
#define DMU_READ_PREFETCH 0 /* prefetch */
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags);
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
uint32_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
#ifdef _KERNEL
#ifdef illumos
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, struct page *pp, dmu_tx_t *tx);
#else
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
int *rbehind, int *rahead, int last_size);
#endif
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
dmu_tx_t *tx);
int dmu_xuio_init(struct xuio *uio, int niov);
void dmu_xuio_fini(struct xuio *uio);
int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
size_t n);
int dmu_xuio_cnt(struct xuio *uio);
struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
void dmu_xuio_clear(struct xuio *uio, int i);
void xuio_stat_wbuf_copied(void);
void xuio_stat_wbuf_nocopy(void);
extern boolean_t zfs_prefetch_disable;
extern int zfs_max_recordsize;
/*
* Asynchronously try to read in the data.
*/
void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */
uint32_t doi_data_block_size;
uint32_t doi_metadata_block_size;
dmu_object_type_t doi_type;
dmu_object_type_t doi_bonus_type;
uint64_t doi_bonus_size;
uint8_t doi_indirection; /* 2 = dnode->indirect->data */
uint8_t doi_checksum;
uint8_t doi_compress;
uint8_t doi_nblkptr;
uint8_t doi_pad[4];
uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
uint64_t doi_max_offset;
uint64_t doi_fill_count; /* number of non-empty blocks */
} dmu_object_info_t;
typedef void arc_byteswap_func_t(void *buf, size_t size);
typedef struct dmu_object_type_info {
dmu_object_byteswap_t ot_byteswap;
boolean_t ot_metadata;
char *ot_name;
} dmu_object_type_info_t;
typedef struct dmu_object_byteswap_info {
arc_byteswap_func_t *ob_func;
char *ob_name;
} dmu_object_byteswap_info_t;
extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
/*
* Get information on a DMU object.
*
* Return 0 on success or ENOENT if object is not allocated.
*
* If doi is NULL, just indicates whether the object exists.
*/
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
/* Like dmu_object_info, but faster if you have a held dnode in hand. */
void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
/*
* Like dmu_object_info_from_db, but faster still when you only care about
* the size. This is specifically optimized for zfs_getattr().
*/
void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
u_longlong_t *nblk512);
typedef struct dmu_objset_stats {
uint64_t dds_num_clones; /* number of clones of this */
uint64_t dds_creation_txg;
uint64_t dds_guid;
dmu_objset_type_t dds_type;
uint8_t dds_is_snapshot;
uint8_t dds_inconsistent;
char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
} dmu_objset_stats_t;
/*
* Get stats on a dataset.
*/
void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
/*
* Add entries to the nvlist for all the objset's properties. See
* zfs_prop_table[] and zfs(1m) for details on the properties.
*/
void dmu_objset_stats(objset_t *os, struct nvlist *nv);
/*
* Get the space usage statistics for statvfs().
*
* refdbytes is the amount of space "referenced" by this objset.
* availbytes is the amount of space available to this objset, taking
* into account quotas & reservations, assuming that no other objsets
* use the space first. These values correspond to the 'referenced' and
* 'available' properties, described in the zfs(1m) manpage.
*
* usedobjs and availobjs are the number of objects currently allocated,
* and available.
*/
void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp);
/*
* The fsid_guid is a 56-bit ID that can change to avoid collisions.
* (Contrast with the ds_guid which is a 64-bit ID that will never
* change, so there is a small probability that it will collide.)
*/
uint64_t dmu_objset_fsid_guid(objset_t *os);
/*
* Get the [cm]time for an objset's snapshot dir
*/
timestruc_t dmu_objset_snap_cmtime(objset_t *os);
int dmu_objset_is_snapshot(objset_t *os);
extern struct spa *dmu_objset_spa(objset_t *os);
extern struct zilog *dmu_objset_zil(objset_t *os);
extern struct dsl_pool *dmu_objset_pool(objset_t *os);
extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
extern void dmu_objset_name(objset_t *os, char *buf);
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
extern uint64_t dmu_objset_id(objset_t *os);
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
int maxlen, boolean_t *conflict);
extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp);
typedef int objset_used_cb_t(dmu_object_type_t bonustype,
void *bonus, uint64_t *userp, uint64_t *groupp);
extern void dmu_objset_register_type(dmu_objset_type_t ost,
objset_used_cb_t *cb);
extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
extern void *dmu_objset_get_user(objset_t *os);
/*
* Return the txg number for the given assigned transaction.
*/
uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
/*
* Synchronous write.
* If a parent zio is provided this function initiates a write on the
* provided buffer as a child of the parent zio.
* In the absence of a parent zio, the write is completed synchronously.
* At write completion, blk is filled with the bp of the written block.
* Note that while the data covered by this function will be on stable
* storage when the write completes this new data does not become a
* permanent part of the file until the associated transaction commits.
*/
/*
* {zfs,zvol,ztest}_get_done() args
*/
typedef struct zgd {
struct lwb *zgd_lwb;
struct blkptr *zgd_bp;
dmu_buf_t *zgd_db;
struct rl *zgd_rl;
void *zgd_private;
} zgd_t;
typedef void dmu_sync_cb_t(zgd_t *arg, int error);
int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
/*
* Find the next hole or data block in file starting at *off
* Return found offset in *off. Return ESRCH for end of file.
*/
int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
uint64_t *off);
/*
* Check if a DMU object has any dirty blocks. If so, sync out
* all pending transaction groups. Otherwise, this function
* does not alter DMU state. This could be improved to only sync
* out the necessary transaction groups for this particular
* object.
*/
int dmu_object_wait_synced(objset_t *os, uint64_t object);
/*
* Initial setup and final teardown.
*/
extern void dmu_init(void);
extern void dmu_fini(void);
typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
uint64_t object, uint64_t offset, int len);
void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
dmu_traverse_cb_t cb, void *arg);
int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
struct file *fp, offset_t *offp);
/* CRC64 table */
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
extern uint64_t zfs_crc64_table[256];
extern int zfs_mdcomp_disable;
#ifdef __cplusplus
}
#endif
#endif /* _SYS_DMU_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h (revision 332525)
@@ -1,362 +1,363 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DNODE_H
#define _SYS_DNODE_H
#include <sys/zfs_context.h>
#include <sys/avl.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/refcount.h>
#include <sys/dmu_zfetch.h>
#include <sys/zrlock.h>
#include <sys/multilist.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* dnode_hold() flags.
*/
#define DNODE_MUST_BE_ALLOCATED 1
#define DNODE_MUST_BE_FREE 2
/*
* dnode_next_offset() flags.
*/
#define DNODE_FIND_HOLE 1
#define DNODE_FIND_BACKWARDS 2
#define DNODE_FIND_HAVELOCK 4
/*
* Fixed constants.
*/
#define DNODE_SHIFT 9 /* 512 bytes */
#define DN_MIN_INDBLKSHIFT 12 /* 4k */
/*
* If we ever increase this value beyond 20, we need to revisit all logic that
* does x << level * ebps to handle overflow. With a 1M indirect block size,
* 4 levels of indirect blocks would not be able to guarantee addressing an
* entire object, so 5 levels will be used, but 5 * (20 - 7) = 65.
*/
#define DN_MAX_INDBLKSHIFT 17 /* 128k */
#define DNODE_BLOCK_SHIFT 14 /* 16k */
#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
/*
* dnode id flags
*
* Note: a file will never ever have its
* ids moved from bonus->spill
* and only in a crypto environment would it be on spill
*/
#define DN_ID_CHKED_BONUS 0x1
#define DN_ID_CHKED_SPILL 0x2
#define DN_ID_OLD_EXIST 0x4
#define DN_ID_NEW_EXIST 0x8
/*
* Derived constants.
*/
#define DNODE_SIZE (1 << DNODE_SHIFT)
#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
#define DN_KILL_SPILLBLK (1)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
/*
* This is inaccurate if the indblkshift of the particular object is not the
* max. But it's only used by userland to calculate the zvol reservation.
*/
#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT)
/* The +2 here is a cheesy way to round up */
#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
(DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
(dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
struct dmu_buf_impl;
struct objset;
struct zio;
enum dnode_dirtycontext {
DN_UNDIRTIED,
DN_DIRTY_OPEN,
DN_DIRTY_SYNC
};
/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
#define DNODE_FLAG_USED_BYTES (1<<0)
#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
/* Does dnode have a SA spill blkptr in bonus? */
#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
typedef struct dnode_phys {
uint8_t dn_type; /* dmu_object_type_t */
uint8_t dn_indblkshift; /* ln2(indirect block size) */
uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
uint8_t dn_nblkptr; /* length of dn_blkptr */
uint8_t dn_bonustype; /* type of data in bonus buffer */
uint8_t dn_checksum; /* ZIO_CHECKSUM type */
uint8_t dn_compress; /* ZIO_COMPRESS type */
uint8_t dn_flags; /* DNODE_FLAG_* */
uint16_t dn_datablkszsec; /* data block size in 512b sectors */
uint16_t dn_bonuslen; /* length of dn_bonus */
uint8_t dn_pad2[4];
/* accounting is protected by dn_dirty_mtx */
uint64_t dn_maxblkid; /* largest allocated block ID */
uint64_t dn_used; /* bytes (or sectors) of disk space */
uint64_t dn_pad3[4];
blkptr_t dn_blkptr[1];
uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
blkptr_t dn_spill;
} dnode_phys_t;
struct dnode {
/*
* Protects the structure of the dnode, including the number of levels
* of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
*/
krwlock_t dn_struct_rwlock;
/* Our link on dn_objset->os_dnodes list; protected by os_lock. */
list_node_t dn_link;
/* immutable: */
struct objset *dn_objset;
uint64_t dn_object;
struct dmu_buf_impl *dn_dbuf;
struct dnode_handle *dn_handle;
dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
/*
* Copies of stuff in dn_phys. They're valid in the open
* context (eg. even before the dnode is first synced).
* Where necessary, these are protected by dn_struct_rwlock.
*/
dmu_object_type_t dn_type; /* object type */
uint16_t dn_bonuslen; /* bonus length */
uint8_t dn_bonustype; /* bonus type */
uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
uint8_t dn_checksum; /* ZIO_CHECKSUM type */
uint8_t dn_compress; /* ZIO_COMPRESS type */
uint8_t dn_nlevels;
uint8_t dn_indblkshift;
uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
uint8_t dn_moved; /* Has this dnode been moved? */
uint16_t dn_datablkszsec; /* in 512b sectors */
uint32_t dn_datablksz; /* in bytes */
uint64_t dn_maxblkid;
uint8_t dn_next_type[TXG_SIZE];
uint8_t dn_next_nblkptr[TXG_SIZE];
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
uint8_t dn_next_bonustype[TXG_SIZE];
uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */
uint16_t dn_next_bonuslen[TXG_SIZE];
uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
uint32_t dn_dbufs_count; /* count of dn_dbufs */
/* protected by os_lock: */
multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
/* protected by dn_mtx: */
kmutex_t dn_mtx;
list_t dn_dirty_records[TXG_SIZE];
struct range_tree *dn_free_ranges[TXG_SIZE];
uint64_t dn_allocated_txg;
uint64_t dn_free_txg;
uint64_t dn_assigned_txg;
kcondvar_t dn_notxholds;
enum dnode_dirtycontext dn_dirtyctx;
uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
/* protected by own devices */
refcount_t dn_tx_holds;
refcount_t dn_holds;
kmutex_t dn_dbufs_mtx;
/*
* Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
* can contain multiple dbufs of the same (level, blkid) when a
* dbuf is marked DB_EVICTING without being removed from
* dn_dbufs. To maintain the avl invariant that there cannot be
* duplicate entries, we order the dbufs by an arbitrary value -
* their address in memory. This means that dn_dbufs cannot be used to
* directly look up a dbuf. Instead, callers must use avl_walk, have
* a reference to the dbuf, or look up a non-existant node with
* db_state = DB_SEARCH (see dbuf_free_range for an example).
*/
avl_tree_t dn_dbufs;
/* protected by dn_struct_rwlock */
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
boolean_t dn_have_spill; /* have spill or are spilling */
/* parent IO for current sync write */
zio_t *dn_zio;
/* used in syncing context */
uint64_t dn_oldused; /* old phys used bytes */
uint64_t dn_oldflags; /* old phys dn_flags */
uint64_t dn_olduid, dn_oldgid;
uint64_t dn_newuid, dn_newgid;
int dn_id_flags;
/* holds prefetch structure */
struct zfetch dn_zfetch;
};
/*
* Adds a level of indirection between the dbuf and the dnode to avoid
* iterating descendent dbufs in dnode_move(). Handles are not allocated
* individually, but as an array of child dnodes in dnode_hold_impl().
*/
typedef struct dnode_handle {
/* Protects dnh_dnode from modification by dnode_move(). */
zrlock_t dnh_zrlock;
dnode_t *dnh_dnode;
} dnode_handle_t;
typedef struct dnode_children {
dmu_buf_user_t dnc_dbu; /* User evict data */
size_t dnc_count; /* number of children */
dnode_handle_t dnc_children[]; /* sized dynamically */
} dnode_children_t;
typedef struct free_range {
avl_node_t fr_node;
uint64_t fr_blkid;
uint64_t fr_nblks;
} free_range_t;
void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
uint64_t object, dnode_handle_t *dnh);
void dnode_special_close(dnode_handle_t *dnh);
void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
int dnode_hold(struct objset *dd, uint64_t object,
void *ref, dnode_t **dnp);
int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
void *ref, dnode_t **dnp);
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
void dnode_rele_and_unlock(dnode_t *dn, void *tag);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
void dnode_free(dnode_t *dn, dmu_tx_t *tx);
void dnode_byteswap(dnode_phys_t *dnp);
void dnode_buf_byteswap(void *buf, size_t size);
void dnode_verify(dnode_t *dn);
int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
void dnode_diduse_space(dnode_t *dn, int64_t space);
void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
void dnode_init(void);
void dnode_fini(void);
int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
int minlvl, uint64_t blkfill, uint64_t txg);
void dnode_evict_dbufs(dnode_t *dn);
void dnode_evict_bonus(dnode_t *dn);
+boolean_t dnode_needs_remap(const dnode_t *dn);
#define DNODE_IS_CACHEABLE(_dn) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(DMU_OT_IS_METADATA((_dn)->dn_type) && \
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
#define DNODE_META_IS_CACHEABLE(_dn) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
#ifdef ZFS_DEBUG
/*
* There should be a ## between the string literal and fmt, to make it
* clear that we're joining two strings together, but that piece of shit
* gcc doesn't support that preprocessor token.
*/
#define dprintf_dnode(dn, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char __db_buf[32]; \
uint64_t __db_obj = (dn)->dn_object; \
if (__db_obj == DMU_META_DNODE_OBJECT) \
(void) strcpy(__db_buf, "mdn"); \
else \
(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
(u_longlong_t)__db_obj);\
dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
__db_buf, __VA_ARGS__); \
} \
_NOTE(CONSTCOND) } while (0)
#define DNODE_VERIFY(dn) dnode_verify(dn)
#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)
#else
#define dprintf_dnode(db, fmt, ...)
#define DNODE_VERIFY(dn)
#define FREE_VERIFY(db, start, end, tx)
#endif
#ifdef __cplusplus
}
#endif
#endif /* _SYS_DNODE_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h (revision 332525)
@@ -1,420 +1,450 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#ifndef _SYS_DSL_DATASET_H
#define _SYS_DSL_DATASET_H
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/bplist.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_context.h>
#include <sys/dsl_deadlist.h>
#include <sys/refcount.h>
#include <sys/rrwlock.h>
#include <zfeature_common.h>
#ifdef __cplusplus
extern "C" {
#endif
struct dsl_dataset;
struct dsl_dir;
struct dsl_pool;
#define DS_FLAG_INCONSISTENT (1ULL<<0)
#define DS_IS_INCONSISTENT(ds) \
(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
/*
* Do not allow this dataset to be promoted.
*/
#define DS_FLAG_NOPROMOTE (1ULL<<1)
/*
* DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly
* calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE,
* refquota/refreservations).
*/
#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2)
/*
* DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
* on a dataset. This allows the dataset to be destroyed using 'zfs release'.
*/
#define DS_FLAG_DEFER_DESTROY (1ULL<<3)
#define DS_IS_DEFER_DESTROY(ds) \
(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
/*
* DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
* They should be of the format <reverse-dns>:<field>.
*/
/*
* This field's value is the object ID of a zap object which contains the
* bookmarks of this dataset. If it is present, then this dataset is counted
* in the refcount of the SPA_FEATURES_BOOKMARKS feature.
*/
#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
/*
* These fields are set on datasets that are in the middle of a resumable
* receive, and allow the sender to resume the send if it is interrupted.
*/
#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok"
#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok"
/*
+ * This field is set to the object number of the remap deadlist if one exists.
+ */
+#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist"
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
#define DS_FLAG_CI_DATASET (1ULL<<16)
#define DS_CREATE_FLAG_NODIRTY (1ULL<<24)
typedef struct dsl_dataset_phys {
uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */
uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */
uint64_t ds_prev_snap_txg;
uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */
uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */
uint64_t ds_num_children; /* clone/snap children; ==0 for head */
uint64_t ds_creation_time; /* seconds since 1970 */
uint64_t ds_creation_txg;
uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */
/*
* ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes
* include all blocks referenced by this dataset, including those
* shared with any other datasets.
*/
uint64_t ds_referenced_bytes;
uint64_t ds_compressed_bytes;
uint64_t ds_uncompressed_bytes;
uint64_t ds_unique_bytes; /* only relevant to snapshots */
/*
* The ds_fsid_guid is a 56-bit ID that can change to avoid
* collisions. The ds_guid is a 64-bit ID that will never
* change, so there is a small probability that it will collide.
*/
uint64_t ds_fsid_guid;
uint64_t ds_guid;
uint64_t ds_flags; /* DS_FLAG_* */
blkptr_t ds_bp;
uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */
uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */
uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */
uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
} dsl_dataset_phys_t;
typedef struct dsl_dataset {
dmu_buf_user_t ds_dbu;
rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */
/* Immutable: */
struct dsl_dir *ds_dir;
dmu_buf_t *ds_dbuf;
uint64_t ds_object;
uint64_t ds_fsid_guid;
boolean_t ds_is_snapshot;
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
/* has internal locking: */
dsl_deadlist_t ds_deadlist;
bplist_t ds_pending_deadlist;
+ /*
+ * The remap deadlist contains blocks (DVA's, really) that are
+ * referenced by the previous snapshot and point to indirect vdevs,
+ * but in this dataset they have been remapped to point to concrete
+ * (or at least, less-indirect) vdevs. In other words, the
+ * physical DVA is referenced by the previous snapshot but not by
+ * this dataset. Logically, the DVA continues to be referenced,
+ * but we are using a different (less indirect) physical DVA.
+ * This deadlist is used to determine when physical DVAs that
+ * point to indirect vdevs are no longer referenced anywhere,
+ * and thus should be marked obsolete.
+ *
+ * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled.
+ */
+ dsl_deadlist_t ds_remap_deadlist;
+ /* protects creation of the ds_remap_deadlist */
+ kmutex_t ds_remap_deadlist_lock;
+
/* protected by lock on pool's dp_dirty_datasets list */
txg_node_t ds_dirty_link;
list_node_t ds_synced_link;
/*
* ds_phys->ds_<accounting> is also protected by ds_lock.
* Protected by ds_lock:
*/
kmutex_t ds_lock;
objset_t *ds_objset;
uint64_t ds_userrefs;
void *ds_owner;
/*
* Long holds prevent the ds from being destroyed; they allow the
* ds to remain held even after dropping the dp_config_rwlock.
* Owning counts as a long hold. See the comments above
* dsl_pool_hold() for details.
*/
refcount_t ds_longholds;
/* no locking; only for making guesses */
uint64_t ds_trysnap_txg;
/* for objset_open() */
kmutex_t ds_opening_lock;
uint64_t ds_reserved; /* cached refreservation */
uint64_t ds_quota; /* cached refquota */
kmutex_t ds_sendstream_lock;
list_t ds_sendstreams;
/*
* When in the middle of a resumable receive, tracks how much
* progress we have made.
*/
uint64_t ds_resume_object[TXG_SIZE];
uint64_t ds_resume_offset[TXG_SIZE];
uint64_t ds_resume_bytes[TXG_SIZE];
/* Protected by our dsl_dir's dd_lock */
list_t ds_prop_cbs;
/*
* For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
* uses this feature.
*/
uint8_t ds_feature_inuse[SPA_FEATURES];
/*
* Set if we need to activate the feature on this dataset this txg
* (used only in syncing context).
*/
uint8_t ds_feature_activation_needed[SPA_FEATURES];
/* Protected by ds_lock; keep at end of struct for better locality */
char ds_snapname[ZFS_MAX_DATASET_NAME_LEN];
} dsl_dataset_t;
inline dsl_dataset_phys_t *
dsl_dataset_phys(dsl_dataset_t *ds)
{
return (ds->ds_dbuf->db_data);
}
typedef struct dsl_dataset_promote_arg {
const char *ddpa_clonename;
dsl_dataset_t *ddpa_clone;
list_t shared_snaps, origin_snaps, clone_snaps;
dsl_dataset_t *origin_origin; /* origin of the origin */
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
nvlist_t *err_ds;
cred_t *cr;
} dsl_dataset_promote_arg_t;
typedef struct dsl_dataset_rollback_arg {
const char *ddra_fsname;
const char *ddra_tosnap;
void *ddra_owner;
nvlist_t *ddra_result;
} dsl_dataset_rollback_arg_t;
typedef struct dsl_dataset_snapshot_arg {
nvlist_t *ddsa_snaps;
nvlist_t *ddsa_props;
nvlist_t *ddsa_errors;
cred_t *ddsa_cr;
} dsl_dataset_snapshot_arg_t;
/*
* The max length of a temporary tag prefix is the number of hex digits
* required to express UINT64_MAX plus one for the hyphen.
*/
#define MAX_TAG_PREFIX_LEN 17
#define dsl_dataset_is_snapshot(ds) \
(dsl_dataset_phys(ds)->ds_num_children != 0)
#define DS_UNIQUE_IS_ACCURATE(ds) \
((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
dsl_dataset_t **dsp);
boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds,
void *tag);
int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
dsl_dataset_t **);
void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
int dsl_dataset_own(struct dsl_pool *dp, const char *name,
void *tag, dsl_dataset_t **dsp);
int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
void *tag, dsl_dataset_t **dsp);
void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
void dsl_dataset_name(dsl_dataset_t *ds, char *name);
boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
int dsl_dataset_namelen(dsl_dataset_t *ds);
boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx);
void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx);
int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx);
int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors);
void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx);
int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx);
int dsl_dataset_promote(const char *name, char *conflsnap);
int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
boolean_t force);
int dsl_dataset_rename_snapshot(const char *fsname,
const char *oldsnapname, const char *newsnapname, boolean_t recursive);
int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
minor_t cleanup_minor, const char *htag);
blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds,
dsl_dataset_t *snap);
void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx);
void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx);
int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx, boolean_t async);
+void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev,
+ uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx);
void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
int get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val);
char *get_receive_resume_stats_impl(dsl_dataset_t *ds);
char *get_child_receive_stats(dsl_dataset_t *ds);
uint64_t dsl_get_refratio(dsl_dataset_t *ds);
uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds);
uint64_t dsl_get_compressratio(dsl_dataset_t *ds);
uint64_t dsl_get_used(dsl_dataset_t *ds);
uint64_t dsl_get_creation(dsl_dataset_t *ds);
uint64_t dsl_get_creationtxg(dsl_dataset_t *ds);
uint64_t dsl_get_refquota(dsl_dataset_t *ds);
uint64_t dsl_get_refreservation(dsl_dataset_t *ds);
uint64_t dsl_get_guid(dsl_dataset_t *ds);
uint64_t dsl_get_unique(dsl_dataset_t *ds);
uint64_t dsl_get_objsetid(dsl_dataset_t *ds);
uint64_t dsl_get_userrefs(dsl_dataset_t *ds);
uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds);
uint64_t dsl_get_referenced(dsl_dataset_t *ds);
uint64_t dsl_get_numclones(dsl_dataset_t *ds);
uint64_t dsl_get_inconsistent(dsl_dataset_t *ds);
uint64_t dsl_get_available(dsl_dataset_t *ds);
int dsl_get_written(dsl_dataset_t *ds, uint64_t *written);
int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap);
int dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
char *source);
void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv);
void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
void dsl_dataset_space(dsl_dataset_t *ds,
uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp);
uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
uint64_t asize, uint64_t inflight, uint64_t *used,
uint64_t *ref_rsrv);
int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
uint64_t quota);
int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
uint64_t reservation);
boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
uint64_t earlier_txg);
void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
dsl_dataset_t *origin_head, dmu_tx_t *tx);
int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dmu_tx_t *tx);
void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
dmu_tx_t *tx);
void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds);
int dsl_dataset_get_snapname(dsl_dataset_t *ds);
int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
uint64_t *value);
int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
boolean_t adj_cnt);
void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
zprop_source_t source, uint64_t value, dmu_tx_t *tx);
void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx);
void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx);
int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
nvlist_t *result);
+
+uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds);
+void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);
+boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds);
+void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);
void dsl_dataset_deactivate_feature(uint64_t dsobj,
spa_feature_t f, dmu_tx_t *tx);
#ifdef ZFS_DEBUG
#define dprintf_ds(ds, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
dsl_dataset_name(ds, __ds_name); \
dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
} \
_NOTE(CONSTCOND) } while (0)
#else
#define dprintf_ds(dd, fmt, ...)
#endif
#ifdef __cplusplus
}
#endif
#endif /* _SYS_DSL_DATASET_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h (revision 332525)
@@ -1,87 +1,89 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_DEADLIST_H
#define _SYS_DSL_DEADLIST_H
#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#ifdef __cplusplus
extern "C" {
#endif
struct dmu_buf;
struct dsl_dataset;
typedef struct dsl_deadlist_phys {
uint64_t dl_used;
uint64_t dl_comp;
uint64_t dl_uncomp;
uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
} dsl_deadlist_phys_t;
typedef struct dsl_deadlist {
objset_t *dl_os;
uint64_t dl_object;
avl_tree_t dl_tree;
boolean_t dl_havetree;
struct dmu_buf *dl_dbuf;
dsl_deadlist_phys_t *dl_phys;
kmutex_t dl_lock;
/* if it's the old on-disk format: */
bpobj_t dl_bpobj;
boolean_t dl_oldfmt;
} dsl_deadlist_t;
typedef struct dsl_deadlist_entry {
avl_node_t dle_node;
uint64_t dle_mintxg;
bpobj_t dle_bpobj;
} dsl_deadlist_entry_t;
void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
void dsl_deadlist_close(dsl_deadlist_t *dl);
uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
uint64_t mrs_obj, dmu_tx_t *tx);
void dsl_deadlist_space(dsl_deadlist_t *dl,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
void dsl_deadlist_space_range(dsl_deadlist_t *dl,
uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx);
+boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_DSL_DEADLIST_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h (revision 332525)
@@ -1,80 +1,81 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_DELEG_H
#define _SYS_DSL_DELEG_H
#include <sys/dmu.h>
#include <sys/dsl_pool.h>
#include <sys/zfs_context.h>
#ifdef __cplusplus
extern "C" {
#endif
#define ZFS_DELEG_PERM_NONE ""
#define ZFS_DELEG_PERM_CREATE "create"
#define ZFS_DELEG_PERM_DESTROY "destroy"
#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
#define ZFS_DELEG_PERM_ROLLBACK "rollback"
#define ZFS_DELEG_PERM_CLONE "clone"
#define ZFS_DELEG_PERM_PROMOTE "promote"
#define ZFS_DELEG_PERM_RENAME "rename"
#define ZFS_DELEG_PERM_MOUNT "mount"
#define ZFS_DELEG_PERM_SHARE "share"
#define ZFS_DELEG_PERM_SEND "send"
#define ZFS_DELEG_PERM_RECEIVE "receive"
#define ZFS_DELEG_PERM_ALLOW "allow"
#define ZFS_DELEG_PERM_USERPROP "userprop"
#define ZFS_DELEG_PERM_VSCAN "vscan"
#define ZFS_DELEG_PERM_USERQUOTA "userquota"
#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
#define ZFS_DELEG_PERM_USERUSED "userused"
#define ZFS_DELEG_PERM_GROUPUSED "groupused"
#define ZFS_DELEG_PERM_HOLD "hold"
#define ZFS_DELEG_PERM_RELEASE "release"
#define ZFS_DELEG_PERM_DIFF "diff"
#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
+#define ZFS_DELEG_PERM_REMAP "remap"
/*
* Note: the names of properties that are marked delegatable are also
* valid delegated permissions
*/
int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx);
boolean_t dsl_delegation_on(objset_t *os);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_DSL_DELEG_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h (revision 332525)
@@ -1,206 +1,208 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DSL_DIR_H
#define _SYS_DSL_DIR_H
#include <sys/dmu.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/refcount.h>
#include <sys/zfs_context.h>
#ifdef __cplusplus
extern "C" {
#endif
struct dsl_dataset;
/*
* DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
* They should be of the format <reverse-dns>:<field>.
*/
#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
+#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg"
typedef enum dd_used {
DD_USED_HEAD,
DD_USED_SNAP,
DD_USED_CHILD,
DD_USED_CHILD_RSRV,
DD_USED_REFRSRV,
DD_USED_NUM
} dd_used_t;
#define DD_FLAG_USED_BREAKDOWN (1<<0)
typedef struct dsl_dir_phys {
uint64_t dd_creation_time; /* not actually used */
uint64_t dd_head_dataset_obj;
uint64_t dd_parent_obj;
uint64_t dd_origin_obj;
uint64_t dd_child_dir_zapobj;
/*
* how much space our children are accounting for; for leaf
* datasets, == physical space used by fs + snaps
*/
uint64_t dd_used_bytes;
uint64_t dd_compressed_bytes;
uint64_t dd_uncompressed_bytes;
/* Administrative quota setting */
uint64_t dd_quota;
/* Administrative reservation setting */
uint64_t dd_reserved;
uint64_t dd_props_zapobj;
uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
uint64_t dd_flags;
uint64_t dd_used_breakdown[DD_USED_NUM];
uint64_t dd_clones; /* dsl_dir objects */
uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
} dsl_dir_phys_t;
struct dsl_dir {
dmu_buf_user_t dd_dbu;
/* These are immutable; no lock needed: */
uint64_t dd_object;
dsl_pool_t *dd_pool;
/* Stable until user eviction; no lock needed: */
dmu_buf_t *dd_dbuf;
/* protected by lock on pool's dp_dirty_dirs list */
txg_node_t dd_dirty_link;
/* protected by dp_config_rwlock */
dsl_dir_t *dd_parent;
/* Protected by dd_lock */
kmutex_t dd_lock;
list_t dd_props; /* list of dsl_prop_record_t's */
timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
uint64_t dd_origin_txg;
/* gross estimate of space used by in-flight tx's */
uint64_t dd_tempreserved[TXG_SIZE];
/* amount of space we expect to write; == amount of dirty data */
int64_t dd_space_towrite[TXG_SIZE];
/* protected by dd_lock; keep at end of struct for better locality */
char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
};
inline dsl_dir_phys_t *
dsl_dir_phys(dsl_dir_t *dd)
{
return (dd->dd_dbuf->db_data);
}
void dsl_dir_rele(dsl_dir_t *dd, void *tag);
void dsl_dir_async_rele(dsl_dir_t *dd, void *tag);
int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
dsl_dir_t **, const char **tail);
int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
const char *tail, void *tag, dsl_dir_t **);
void dsl_dir_name(dsl_dir_t *dd, char *buf);
int dsl_dir_namelen(dsl_dir_t *dd);
uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
const char *name, dmu_tx_t *tx);
uint64_t dsl_dir_get_used(dsl_dir_t *dd);
uint64_t dsl_dir_get_quota(dsl_dir_t *dd);
uint64_t dsl_dir_get_reservation(dsl_dir_t *dd);
uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd);
uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd);
uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd);
uint64_t dsl_dir_get_usedds(dsl_dir_t *dd);
uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd);
uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd);
void dsl_dir_get_origin(dsl_dir_t *dd, char *buf);
int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count);
int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count);
+int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count);
void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
uint64_t dsl_dir_space_available(dsl_dir_t *dd,
dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx);
void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
uint64_t quota);
int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
uint64_t reservation);
int dsl_dir_activate_fs_ss_limit(const char *);
int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
cred_t *);
void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *);
+int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t);
int dsl_dir_rename(const char *oldname, const char *newname);
int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
dmu_tx_t *tx);
void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
#define ORIGIN_DIR_NAME "$ORIGIN"
-#define XLATION_DIR_NAME "$XLATION"
#define FREE_DIR_NAME "$FREE"
#define LEAK_DIR_NAME "$LEAK"
#ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
dsl_dir_name(dd, __ds_name); \
dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
} \
_NOTE(CONSTCOND) } while (0)
#else
#define dprintf_dd(dd, fmt, ...)
#endif
#ifdef __cplusplus
}
#endif
#endif /* _SYS_DSL_DIR_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h (revision 332525)
@@ -1,180 +1,183 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
#ifndef _SYS_DSL_POOL_H
#define _SYS_DSL_POOL_H
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/txg_impl.h>
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/dnode.h>
#include <sys/ddt.h>
#include <sys/arc.h>
#include <sys/bpobj.h>
#include <sys/bptree.h>
#include <sys/rrwlock.h>
#ifdef __cplusplus
extern "C" {
#endif
struct objset;
struct dsl_dir;
struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;
struct dsl_scan;
extern uint64_t zfs_dirty_data_max;
extern uint64_t zfs_dirty_data_max_max;
extern uint64_t zfs_dirty_data_sync;
extern int zfs_dirty_data_max_percent;
extern int zfs_delay_min_dirty_percent;
extern uint64_t zfs_delay_scale;
/* These macros are for indexing into the zfs_all_blkstats_t. */
#define DMU_OT_DEFERRED DMU_OT_NONE
#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */
#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1)
typedef struct zfs_blkstat {
uint64_t zb_count;
uint64_t zb_asize;
uint64_t zb_lsize;
uint64_t zb_psize;
uint64_t zb_gangs;
uint64_t zb_ditto_2_of_2_samevdev;
uint64_t zb_ditto_2_of_3_samevdev;
uint64_t zb_ditto_3_of_3_samevdev;
} zfs_blkstat_t;
typedef struct zfs_all_blkstats {
zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
} zfs_all_blkstats_t;
typedef struct dsl_pool {
/* Immutable */
spa_t *dp_spa;
struct objset *dp_meta_objset;
struct dsl_dir *dp_root_dir;
struct dsl_dir *dp_mos_dir;
struct dsl_dir *dp_free_dir;
struct dsl_dir *dp_leak_dir;
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_vnrele_taskq;
/* No lock needed - sync context only */
blkptr_t dp_meta_rootbp;
uint64_t dp_tmp_userrefs_obj;
bpobj_t dp_free_bpobj;
uint64_t dp_bptree_obj;
uint64_t dp_empty_bpobj;
+ bpobj_t dp_obsolete_bpobj;
struct dsl_scan *dp_scan;
/* Uses dp_lock */
kmutex_t dp_lock;
kcondvar_t dp_spaceavail_cv;
uint64_t dp_dirty_pertxg[TXG_SIZE];
uint64_t dp_dirty_total;
uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
uint64_t dp_mos_used_delta;
uint64_t dp_mos_compressed_delta;
uint64_t dp_mos_uncompressed_delta;
/*
* Time of most recently scheduled (furthest in the future)
* wakeup for delayed transactions.
*/
hrtime_t dp_last_wakeup;
/* Has its own locking */
tx_state_t dp_tx;
txg_list_t dp_dirty_datasets;
txg_list_t dp_dirty_zilogs;
txg_list_t dp_dirty_dirs;
txg_list_t dp_sync_tasks;
taskq_t *dp_sync_taskq;
taskq_t *dp_zil_clean_taskq;
/*
* Protects administrative changes (properties, namespace)
*
* It is only held for write in syncing context. Therefore
* syncing context does not need to ever have it for read, since
* nobody else could possibly have it for write.
*/
rrwlock_t dp_config_rwlock;
zfs_all_blkstats_t *dp_blkstats;
} dsl_pool_t;
int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
int dsl_pool_open(dsl_pool_t *dp);
void dsl_pool_close(dsl_pool_t *dp);
dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
int dsl_pool_sync_context(dsl_pool_t *dp);
uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
-uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
const blkptr_t *bpp);
void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
int64_t used, int64_t comp, int64_t uncomp);
void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
boolean_t dsl_pool_config_held(dsl_pool_t *dp);
boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
const char *tag, uint64_t now, dmu_tx_t *tx);
int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
const char *tag, dmu_tx_t *tx);
void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
void dsl_pool_rele(dsl_pool_t *dp, void *tag);
+
+void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_DSL_POOL_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h (revision 332525)
@@ -1,148 +1,151 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2017 Datto Inc.
*/
#ifndef _SYS_DSL_SCAN_H
#define _SYS_DSL_SCAN_H
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/ddt.h>
#include <sys/bplist.h>
#ifdef __cplusplus
extern "C" {
#endif
struct objset;
struct dsl_dir;
struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;
/*
* All members of this structure must be uint64_t, for byteswap
* purposes.
*/
typedef struct dsl_scan_phys {
uint64_t scn_func; /* pool_scan_func_t */
uint64_t scn_state; /* dsl_scan_state_t */
uint64_t scn_queue_obj;
uint64_t scn_min_txg;
uint64_t scn_max_txg;
uint64_t scn_cur_min_txg;
uint64_t scn_cur_max_txg;
uint64_t scn_start_time;
uint64_t scn_end_time;
uint64_t scn_to_examine; /* total bytes to be scanned */
uint64_t scn_examined; /* bytes scanned so far */
uint64_t scn_to_process;
uint64_t scn_processed;
uint64_t scn_errors; /* scan I/O error count */
uint64_t scn_ddt_class_max;
ddt_bookmark_t scn_ddt_bookmark;
zbookmark_phys_t scn_bookmark;
uint64_t scn_flags; /* dsl_scan_flags_t */
} dsl_scan_phys_t;
#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
typedef enum dsl_scan_flags {
DSF_VISIT_DS_AGAIN = 1<<0,
DSF_SCRUB_PAUSED = 1<<1,
} dsl_scan_flags_t;
/*
* Every pool will have one dsl_scan_t and this structure will contain
* in-memory information about the scan and a pointer to the on-disk
* representation (i.e. dsl_scan_phys_t). Most of the state of the scan
* is contained on-disk to allow the scan to resume in the event of a reboot
* or panic. This structure maintains information about the behavior of a
* running scan, some caching information, and how it should traverse the pool.
*
* The following members of this structure direct the behavior of the scan:
*
* scn_suspending - a scan that cannot be completed in a single txg or
* has exceeded its allotted time will need to suspend.
* When this flag is set the scanner will stop traversing
* the pool and write out the current state to disk.
*
* scn_restart_txg - directs the scanner to either restart or start a
* a scan at the specified txg value.
*
* scn_done_txg - when a scan completes its traversal it will set
* the completion txg to the next txg. This is necessary
* to ensure that any blocks that were freed during
* the scan but have not yet been processed (i.e deferred
* frees) are accounted for.
*
* This structure also maintains information about deferred frees which are
* a special kind of traversal. Deferred free can exist in either a bptree or
* a bpobj structure. The scn_is_bptree flag will indicate the type of
* deferred free that is in progress. If the deferred free is part of an
* asynchronous destroy then the scn_async_destroying flag will be set.
*/
typedef struct dsl_scan {
struct dsl_pool *scn_dp;
boolean_t scn_suspending;
uint64_t scn_restart_txg;
uint64_t scn_done_txg;
uint64_t scn_sync_start_time;
zio_t *scn_zio_root;
/* for freeing blocks */
boolean_t scn_is_bptree;
boolean_t scn_async_destroying;
boolean_t scn_async_stalled;
+ uint64_t scn_async_block_min_time_ms;
+
+ /* for debugging / information */
uint64_t scn_visited_this_txg;
dsl_scan_phys_t scn_phys;
} dsl_scan_t;
int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
void dsl_scan_fini(struct dsl_pool *dp);
void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
int dsl_scan_cancel(struct dsl_pool *);
int dsl_scan(struct dsl_pool *, pool_scan_func_t);
boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
struct dmu_tx *tx);
boolean_t dsl_scan_active(dsl_scan_t *scn);
boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_DSL_SCAN_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h (revision 332525)
@@ -1,112 +1,119 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
#define _SYS_METASLAB_H
#include <sys/spa.h>
#include <sys/space_map.h>
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/avl.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct metaslab_ops {
uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
} metaslab_ops_t;
extern metaslab_ops_t *zfs_metaslab_ops;
int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
metaslab_t **);
void metaslab_fini(metaslab_t *);
void metaslab_load_wait(metaslab_t *);
int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);
void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *);
+int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
+ dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
+void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t);
+void metaslab_free_dva(spa_t *, const dva_t *, uint64_t);
+void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
+void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
+int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_alloc_trace_init(void);
void metaslab_alloc_trace_fini(void);
void metaslab_trace_init(zio_alloc_list_t *);
void metaslab_trace_fini(zio_alloc_list_t *);
metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
void metaslab_class_destroy(metaslab_class_t *);
int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
uint64_t metaslab_class_get_deferred(metaslab_class_t *);
uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc);
metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
boolean_t metaslab_group_initialized(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_METASLAB_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h (revision 332525)
@@ -1,376 +1,376 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
#define _SYS_METASLAB_IMPL_H
#include <sys/metaslab.h>
#include <sys/space_map.h>
#include <sys/range_tree.h>
#include <sys/vdev.h>
#include <sys/txg.h>
#include <sys/avl.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Metaslab allocation tracing record.
*/
typedef struct metaslab_alloc_trace {
list_node_t mat_list_node;
metaslab_group_t *mat_mg;
metaslab_t *mat_msp;
uint64_t mat_size;
uint64_t mat_weight;
uint32_t mat_dva_id;
uint64_t mat_offset;
} metaslab_alloc_trace_t;
/*
* Used by the metaslab allocation tracing facility to indicate
* error conditions. These errors are stored to the offset member
* of the metaslab_alloc_trace_t record and displayed by mdb.
*/
typedef enum trace_alloc_type {
TRACE_ALLOC_FAILURE = -1ULL,
TRACE_TOO_SMALL = -2ULL,
TRACE_FORCE_GANG = -3ULL,
TRACE_NOT_ALLOCATABLE = -4ULL,
TRACE_GROUP_FAILURE = -5ULL,
TRACE_ENOSPC = -6ULL,
TRACE_CONDENSING = -7ULL,
TRACE_VDEV_ERROR = -8ULL
} trace_alloc_type_t;
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
#define METASLAB_WEIGHT_TYPE (1ULL << 61)
#define METASLAB_ACTIVE_MASK \
(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
/*
* The metaslab weight is used to encode the amount of free space in a
* metaslab, such that the "best" metaslab appears first when sorting the
* metaslabs by weight. The weight (and therefore the "best" metaslab) can
* be determined in two different ways: by computing a weighted sum of all
* the free space in the metaslab (a space based weight) or by counting only
* the free segments of the largest size (a segment based weight). We prefer
* the segment based weight because it reflects how the free space is
* comprised, but we cannot always use it -- legacy pools do not have the
* space map histogram information necessary to determine the largest
* contiguous regions. Pools that have the space map histogram determine
* the segment weight by looking at each bucket in the histogram and
* determining the free space whose size in bytes is in the range:
* [2^i, 2^(i+1))
* We then encode the largest index, i, that contains regions into the
* segment-weighted value.
*
* Space-based weight:
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* |PS1| weighted-free space |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* PS - indicates primary and secondary activation
* space - the fragmentation-weighted space
*
* Segment-based weight:
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* |PS0| idx| count of segments in region |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* PS - indicates primary and secondary activation
* idx - index for the highest bucket in the histogram
* count - number of segments in the specified bucket
*/
#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2)
#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x)
#define WEIGHT_IS_SPACEBASED(weight) \
((weight) == 0 || BF64_GET((weight), 61, 1))
#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1)
/*
* These macros are only applicable to segment-based weighting.
*/
#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6)
#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x)
#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55)
#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x)
/*
* A metaslab class encompasses a category of allocatable top-level vdevs.
* Each top-level vdev is associated with a metaslab group which defines
* the allocatable region for that vdev. Examples of these categories include
* "normal" for data block allocations (i.e. main pool allocations) or "log"
* for allocations designated for intent log devices (i.e. slog devices).
* When a block allocation is requested from the SPA it is associated with a
* metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
* to the class can be used to satisfy that request. Allocations are done
* by traversing the metaslab groups that are linked off of the mc_rotor field.
* This rotor points to the next metaslab group where allocations will be
* attempted. Allocating a block is a 3 step process -- select the metaslab
* group, select the metaslab, and then allocate the block. The metaslab
* class defines the low-level block allocator that will be used as the
* final step in allocation. These allocators are pluggable allowing each class
* to use a block allocator that best suits that class.
*/
struct metaslab_class {
kmutex_t mc_lock;
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
metaslab_ops_t *mc_ops;
uint64_t mc_aliquot;
/*
* Track the number of metaslab groups that have been initialized
* and can accept allocations. An initialized metaslab group is
* one has been completely added to the config (i.e. we have
* updated the MOS config and the space has been added to the pool).
*/
uint64_t mc_groups;
/*
* Toggle to enable/disable the allocation throttle.
*/
boolean_t mc_alloc_throttle_enabled;
/*
* The allocation throttle works on a reservation system. Whenever
* an asynchronous zio wants to perform an allocation it must
* first reserve the number of blocks that it wants to allocate.
* If there aren't sufficient slots available for the pending zio
* then that I/O is throttled until more slots free up. The current
* number of reserved allocations is maintained by the mc_alloc_slots
* refcount. The mc_alloc_max_slots value determines the maximum
* number of allocations that the system allows. Gang blocks are
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
*/
uint64_t mc_alloc_max_slots;
refcount_t mc_alloc_slots;
uint64_t mc_alloc_groups; /* # of allocatable groups */
uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
uint64_t mc_dspace; /* total deflated space */
uint64_t mc_minblocksize;
uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
/*
* Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
* of a top-level vdev. They are linked togther to form a circular linked
* list and can belong to only one metaslab class. Metaslab groups may become
* ineligible for allocations for a number of reasons such as limited free
* space, fragmentation, or going offline. When this happens the allocator will
* simply find the next metaslab group in the linked list and attempt
* to allocate from that group instead.
*/
struct metaslab_group {
kmutex_t mg_lock;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
/*
* A metaslab group is considered to be initialized only after
* we have updated the MOS config and added the space to the pool.
* We only allow allocation attempts to a metaslab group if it
* has been initialized.
*/
boolean_t mg_initialized;
uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
metaslab_class_t *mg_class;
vdev_t *mg_vd;
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;
/*
* Each metaslab group can handle mg_max_alloc_queue_depth allocations
* which are tracked by mg_alloc_queue_depth. It's possible for a
* metaslab group to handle more allocations than its max. This
* can occur when gang blocks are required or when other groups
* are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
refcount_t mg_alloc_queue_depth;
/*
* A metalab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
* of space then its share of work must be distributed to other
* groups.
*/
boolean_t mg_no_free_space;
uint64_t mg_allocations;
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
/*
* This value defines the number of elements in the ms_lbas array. The value
* of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
* This is the equivalent of highbit(UINT64_MAX).
*/
#define MAX_LBAS 64
/*
* Each metaslab maintains a set of in-core trees to track metaslab
* operations. The in-core free tree (ms_tree) contains the list of
* free segments which are eligible for allocation. As blocks are
- * allocated, the allocated segments are removed from the ms_tree and
- * added to a per txg allocation tree (ms_alloctree). This allows us to
- * process all allocations in syncing context where it is safe to update
- * the on-disk space maps. Frees are also processed in syncing context.
- * Most frees are generated from syncing context, and those that are not
- * are held in the spa_free_bplist for processing in syncing context.
- * An additional set of in-core trees is maintained to track deferred
- * frees (ms_defertree). Once a block is freed it will move from the
+ * allocated, the allocated segment are removed from the ms_tree and
+ * added to a per txg allocation tree (ms_alloctree). As blocks are
+ * freed, they are added to the free tree (ms_freeingtree). These trees
+ * allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps. An additional set
+ * of in-core trees is maintained to track deferred frees
+ * (ms_defertree). Once a block is freed it will move from the
* ms_freedtree to the ms_defertree. A deferred free means that a block
* has been freed but cannot be used by the pool until TXG_DEFER_SIZE
* transactions groups later. For example, a block that is freed in txg
* 50 will not be available for reallocation until txg 52 (50 +
* TXG_DEFER_SIZE). This provides a safety net for uberblock rollback.
* A pool could be safely rolled back TXG_DEFERS_SIZE transactions
* groups and ensure that no block has been reallocated.
*
* The simplified transition diagram looks like this:
*
*
* ALLOCATE
* |
* V
* free segment (ms_tree) -----> ms_alloctree[4] ----> (write to space map)
* ^
* | ms_freeingtree <--- FREE
* | |
* | v
* | ms_freedtree
* | |
* +-------- ms_defertree[2] <-------+---------> (write to space map)
*
*
* Each metaslab's space is tracked in a single space map in the MOS,
* which is only updated in syncing context. Each time we sync a txg,
* we append the allocs and frees from that txg to the space map. The
* pool space is only updated once all metaslabs have finished syncing.
*
* To load the in-core free tree we read the space map from disk. This
* object contains a series of alloc and free records that are combined
* to make up the list of all free segments in this metaslab. These
* segments are represented in-core by the ms_tree and are stored in an
* AVL tree.
*
* As the space map grows (as a result of the appends) it will
* eventually become space-inefficient. When the metaslab's in-core
* free tree is zfs_condense_pct/100 times the size of the minimal
* on-disk representation, we rewrite it in its minimized form. If a
* metaslab needs to condense then we must set the ms_condensing flag to
* ensure that allocations are not performed on the metaslab that is
* being written.
*/
struct metaslab {
kmutex_t ms_lock;
+ kmutex_t ms_sync_lock;
kcondvar_t ms_load_cv;
space_map_t *ms_sm;
uint64_t ms_id;
uint64_t ms_start;
uint64_t ms_size;
uint64_t ms_fragmentation;
range_tree_t *ms_alloctree[TXG_SIZE];
range_tree_t *ms_tree;
/*
* The following range trees are accessed only from syncing context.
* ms_free*tree only have entries while syncing, and are empty
* between syncs.
*/
range_tree_t *ms_freeingtree; /* to free this syncing txg */
range_tree_t *ms_freedtree; /* already freed this syncing txg */
range_tree_t *ms_defertree[TXG_DEFER_SIZE];
boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
/*
* We must hold both ms_lock and ms_group->mg_lock in order to
* modify ms_loaded.
*/
boolean_t ms_loaded;
boolean_t ms_loading;
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_activation_weight; /* activation weight */
/*
* Track of whenever a metaslab is selected for loading or allocation.
* We use this value to determine how long the metaslab should
* stay cached.
*/
uint64_t ms_selected_txg;
uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
uint64_t ms_max_size; /* maximum allocatable size */
/*
* The metaslab block allocators can optionally use a size-ordered
* range tree and/or an array of LBAs. Not all allocators use
* this functionality. The ms_size_tree should always contain the
* same number of segments as the ms_tree. The only difference
* is that the ms_size_tree is ordered by segment sizes.
*/
avl_tree_t ms_size_tree;
uint64_t ms_lbas[MAX_LBAS];
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
};
#ifdef __cplusplus
}
#endif
#endif /* _SYS_METASLAB_IMPL_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h (revision 332525)
@@ -1,97 +1,100 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_RANGE_TREE_H
#define _SYS_RANGE_TREE_H
#include <sys/avl.h>
#include <sys/dmu.h>
#ifdef __cplusplus
extern "C" {
#endif
#define RANGE_TREE_HISTOGRAM_SIZE 64
typedef struct range_tree_ops range_tree_ops_t;
+/*
+ * Note: the range_tree may not be accessed concurrently; consumers
+ * must provide external locking if required.
+ */
typedef struct range_tree {
avl_tree_t rt_root; /* offset-ordered segment AVL tree */
uint64_t rt_space; /* sum of all segments in the map */
range_tree_ops_t *rt_ops;
void *rt_arg;
/*
* The rt_histogram maintains a histogram of ranges. Each bucket,
* rt_histogram[i], contains the number of ranges whose size is:
* 2^i <= size of range in bytes < 2^(i+1)
*/
uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
- kmutex_t *rt_lock; /* pointer to lock that protects map */
} range_tree_t;
typedef struct range_seg {
avl_node_t rs_node; /* AVL node */
avl_node_t rs_pp_node; /* AVL picker-private node */
uint64_t rs_start; /* starting offset of this segment */
uint64_t rs_end; /* ending offset (non-inclusive) */
} range_seg_t;
struct range_tree_ops {
void (*rtop_create)(range_tree_t *rt, void *arg);
void (*rtop_destroy)(range_tree_t *rt, void *arg);
void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg);
void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg);
void (*rtop_vacate)(range_tree_t *rt, void *arg);
};
typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
void range_tree_init(void);
void range_tree_fini(void);
-range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp);
+range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
uint64_t range_tree_space(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
void range_tree_add(void *arg, uint64_t start, uint64_t size);
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_RANGE_TREE_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h (revision 332525)
@@ -1,931 +1,945 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017 Datto Inc.
*/
#ifndef _SYS_SPA_H
#define _SYS_SPA_H
#include <sys/avl.h>
#include <sys/zfs_context.h>
#include <sys/nvpair.h>
+#include <sys/sysevent.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Forward references that lots of things need.
*/
typedef struct spa spa_t;
typedef struct vdev vdev_t;
typedef struct metaslab metaslab_t;
typedef struct metaslab_group metaslab_group_t;
typedef struct metaslab_class metaslab_class_t;
typedef struct zio zio_t;
typedef struct zilog zilog_t;
typedef struct spa_aux_vdev spa_aux_vdev_t;
typedef struct ddt ddt_t;
typedef struct ddt_entry ddt_entry_t;
struct dsl_pool;
struct dsl_dataset;
/*
* General-purpose 32-bit and 64-bit bitfield encodings.
*/
#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
#define BF32_SET(x, low, len, val) do { \
ASSERT3U(val, <, 1U << (len)); \
ASSERT3U(low + len, <=, 32); \
(x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
_NOTE(CONSTCOND) } while (0)
#define BF64_SET(x, low, len, val) do { \
ASSERT3U(val, <, 1ULL << (len)); \
ASSERT3U(low + len, <=, 64); \
((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
_NOTE(CONSTCOND) } while (0)
#define BF32_GET_SB(x, low, len, shift, bias) \
((BF32_GET(x, low, len) + (bias)) << (shift))
#define BF64_GET_SB(x, low, len, shift, bias) \
((BF64_GET(x, low, len) + (bias)) << (shift))
#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
ASSERT(IS_P2ALIGNED(val, 1U << shift)); \
ASSERT3S((val) >> (shift), >=, bias); \
BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
_NOTE(CONSTCOND) } while (0)
#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \
ASSERT3S((val) >> (shift), >=, bias); \
BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
_NOTE(CONSTCOND) } while (0)
/*
* We currently support block sizes from 512 bytes to 16MB.
* The benefits of larger blocks, and thus larger IO, need to be weighed
* against the cost of COWing a giant block to modify one byte, and the
* large latency of reading or writing a large block.
*
* Note that although blocks up to 16MB are supported, the recordsize
* property can not be set larger than zfs_max_recordsize (default 1MB).
* See the comment near zfs_max_recordsize in dsl_dataset.c for details.
*
* Note that although the LSIZE field of the blkptr_t can store sizes up
* to 32MB, the dnode's dn_datablkszsec can only store sizes up to
* 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
*/
#define SPA_MINBLOCKSHIFT 9
#define SPA_OLD_MAXBLOCKSHIFT 17
#define SPA_MAXBLOCKSHIFT 24
#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
/*
* Default maximum supported logical ashift.
*
* The current 8k allocation block size limit is due to the 8k
* aligned/sized operations performed by vdev_probe() on
* vdev_label->vl_pad2. Using another "safe region" for these tests
* would allow the limit to be raised to 16k, at the expense of
* only having 8 available uberblocks in the label area.
*/
#define SPA_MAXASHIFT 13
/*
* Default minimum supported logical ashift.
*/
#define SPA_MINASHIFT SPA_MINBLOCKSHIFT
/*
* Size of block to hold the configuration data (a packed nvlist)
*/
#define SPA_CONFIG_BLOCKSIZE (1ULL << 14)
/*
* The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
* The ASIZE encoding should be at least 64 times larger (6 more bits)
* to support up to 4-way RAID-Z mirror mode with worst-case gang block
* overhead, three DVAs per bp, plus one more bit in case we do anything
* else that expands the ASIZE.
*/
#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
#define SPA_COMPRESSBITS 7
/*
* All SPA data is represented by 128-bit data virtual addresses (DVAs).
* The members of the dva_t should be considered opaque outside the SPA.
*/
typedef struct dva {
uint64_t dva_word[2];
} dva_t;
/*
* Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
*/
typedef struct zio_cksum {
uint64_t zc_word[4];
} zio_cksum_t;
/*
* Some checksums/hashes need a 256-bit initialization salt. This salt is kept
* secret and is suitable for use in MAC algorithms as the key.
*/
typedef struct zio_cksum_salt {
uint8_t zcs_bytes[32];
} zio_cksum_salt_t;
/*
* Each block is described by its DVAs, time of birth, checksum, etc.
* The word-by-word, bit-by-bit layout of the blkptr is as follows:
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | vdev1 | GRID | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 1 |G| offset1 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 2 | vdev2 | GRID | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 3 |G| offset2 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 4 | vdev3 | GRID | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 9 | physical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* a | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | fill count |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* c | checksum[0] |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* d | checksum[1] |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* e | checksum[2] |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* f | checksum[3] |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* Legend:
*
* vdev virtual device ID
* offset offset into virtual device
* LSIZE logical size
* PSIZE physical size (after compression)
* ASIZE allocated size (including RAID-Z parity and gang block headers)
* GRID RAID-Z layout information (reserved for future use)
* cksum checksum function
* comp compression function
* G gang block indicator
* B byteorder (endianness)
* D dedup
* X encryption (on version 30, which is not supported)
* E blkptr_t contains embedded data (see below)
* lvl level of indirection
* type DMU object type
- * phys birth txg of block allocation; zero if same as logical birth txg
+ * phys birth txg when dva[0] was written; zero if same as logical birth txg
+ * note that typically all the dva's would be written in this
+ * txg, but they could be different if they were moved by
+ * device removal.
* log. birth transaction group in which the block was logically born
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
/*
* "Embedded" blkptr_t's don't actually point to a block, instead they
* have a data payload embedded in the blkptr_t itself. See the comment
* in blkptr.c for more details.
*
* The blkptr_t is laid out as follows:
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | payload |
* 1 | payload |
* 2 | payload |
* 3 | payload |
* 4 | payload |
* 5 | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | payload |
* 8 | payload |
* 9 | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* a | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | payload |
* c | payload |
* d | payload |
* e | payload |
* f | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* Legend:
*
* payload contains the embedded data
* B (byteorder) byteorder (endianness)
* D (dedup) padding (set to zero)
* X encryption (set to zero; see above)
* E (embedded) set to one
* lvl indirection level
* type DMU object type
* etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
* comp compression function of payload
* PSIZE size of payload after compression, in bytes
* LSIZE logical size of payload, in bytes
* note that 25 bits is enough to store the largest
* "normal" BP's LSIZE (2^16 * 2^9) in bytes
* log. birth transaction group in which the block was logically born
*
* Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
* bp's they are stored in units of SPA_MINBLOCKSHIFT.
* Generally, the generic BP_GET_*() macros can be used on embedded BP's.
* The B, D, X, lvl, type, and comp fields are stored the same as with normal
* BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
* be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
* other macros, as they assert that they are only used on BP's of the correct
* "embedded-ness".
*/
#define BPE_GET_ETYPE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET((bp)->blk_prop, 40, 8))
#define BPE_SET_ETYPE(bp, t) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET((bp)->blk_prop, 40, 8, t); \
_NOTE(CONSTCOND) } while (0)
#define BPE_GET_LSIZE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
#define BPE_SET_LSIZE(bp, x) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BPE_GET_PSIZE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
#define BPE_SET_PSIZE(bp, x) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)
typedef enum bp_embedded_type {
BP_EMBEDDED_TYPE_DATA,
BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
} bp_embedded_type_t;
#define BPE_NUM_WORDS 14
#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
#define BPE_IS_PAYLOADWORD(bp, wp) \
((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
/*
* A block is a hole when it has either 1) never been written to, or
* 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
* without physically allocating disk space. Holes are represented in the
* blkptr_t structure by zeroed blk_dva. Correct checking for holes is
* done through the BP_IS_HOLE macro. For holes, the logical size, level,
* DMU object type, and birth times are all also stored for holes that
* were written to at some point (i.e. were punched after having been filled).
*/
typedef struct blkptr {
dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
uint64_t blk_prop; /* size, compression, type, etc */
uint64_t blk_pad[2]; /* Extra space for the future */
uint64_t blk_phys_birth; /* txg when block was allocated */
uint64_t blk_birth; /* transaction group at birth */
uint64_t blk_fill; /* fill count */
zio_cksum_t blk_cksum; /* 256-bit checksum */
} blkptr_t;
/*
* Macros to get and set fields in a bp or DVA.
*/
#define DVA_GET_ASIZE(dva) \
BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
#define DVA_SET_ASIZE(dva, x) \
BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
SPA_MINBLOCKSHIFT, 0, x)
#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
#define DVA_GET_OFFSET(dva) \
BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
#define DVA_SET_OFFSET(dva, x) \
BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
#define BP_GET_LSIZE(bp) \
(BP_IS_EMBEDDED(bp) ? \
(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define BP_SET_LSIZE(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, \
0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_PSIZE(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define BP_SET_PSIZE(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, \
16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_COMPRESS(bp) \
BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
#define BP_SET_COMPRESS(bp, x) \
BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)
#define BP_GET_CHECKSUM(bp) \
(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
BF64_GET((bp)->blk_prop, 40, 8))
#define BP_SET_CHECKSUM(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET((bp)->blk_prop, 40, 8, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
#define BP_PHYSICAL_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
#define BP_SET_BIRTH(bp, logical, physical) \
{ \
ASSERT(!BP_IS_EMBEDDED(bp)); \
(bp)->blk_birth = (logical); \
(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
}
#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
#define BP_IS_METADATA(bp) \
(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
#define BP_GET_ASIZE(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_GET_UCSIZE(bp) \
(BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_COUNT_GANG(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
DVA_GET_GANG(&(bp)->blk_dva[1]) + \
DVA_GET_GANG(&(bp)->blk_dva[2])))
#define DVA_EQUAL(dva1, dva2) \
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
(dva1)->dva_word[0] == (dva2)->dva_word[0])
#define BP_EQUAL(bp1, bp2) \
(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
(bp1)->blk_birth == (bp2)->blk_birth && \
DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
((zc1).zc_word[1] - (zc2).zc_word[1]) | \
((zc1).zc_word[2] - (zc2).zc_word[2]) | \
((zc1).zc_word[3] - (zc2).zc_word[3])))
#define ZIO_CHECKSUM_IS_ZERO(zc) \
(0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
(zc)->zc_word[2] | (zc)->zc_word[3]))
#define ZIO_CHECKSUM_BSWAP(zcp) \
{ \
(zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \
(zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \
(zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \
(zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \
}
#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
{ \
(zcp)->zc_word[0] = w0; \
(zcp)->zc_word[1] = w1; \
(zcp)->zc_word[2] = w2; \
(zcp)->zc_word[3] = w3; \
}
#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
#define BP_IS_GANG(bp) \
(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
(dva)->dva_word[1] == 0ULL)
#define BP_IS_HOLE(bp) \
(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
/* BP_IS_RAIDZ(bp) assumes no block compression */
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
BP_GET_PSIZE(bp))
#define BP_ZERO(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \
(bp)->blk_dva[1].dva_word[0] = 0; \
(bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
(bp)->blk_phys_birth = 0; \
(bp)->blk_birth = 0; \
(bp)->blk_fill = 0; \
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
#if BYTE_ORDER == _BIG_ENDIAN
#define ZFS_HOST_BYTEORDER (0ULL)
#else
#define ZFS_HOST_BYTEORDER (1ULL)
#endif
#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
#define BP_SPRINTF_LEN 320
/*
* This macro allows code sharing between zfs, libzpool, and mdb.
* 'func' is either snprintf() or mdb_snprintf().
* 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
*/
#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
{ \
static const char *copyname[] = \
{ "zero", "single", "double", "triple" }; \
int len = 0; \
int copies = 0; \
\
if (bp == NULL) { \
len += func(buf + len, size - len, "<NULL>"); \
} else if (BP_IS_HOLE(bp)) { \
len += func(buf + len, size - len, \
"HOLE [L%llu %s] " \
"size=%llxL birth=%lluL", \
(u_longlong_t)BP_GET_LEVEL(bp), \
type, \
(u_longlong_t)BP_GET_LSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
} else if (BP_IS_EMBEDDED(bp)) { \
len = func(buf + len, size - len, \
"EMBEDDED [L%llu %s] et=%u %s " \
"size=%llxL/%llxP birth=%lluL", \
(u_longlong_t)BP_GET_LEVEL(bp), \
type, \
(int)BPE_GET_ETYPE(bp), \
compress, \
(u_longlong_t)BPE_GET_LSIZE(bp), \
(u_longlong_t)BPE_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
} else { \
for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
const dva_t *dva = &bp->blk_dva[d]; \
if (DVA_IS_VALID(dva)) \
copies++; \
len += func(buf + len, size - len, \
"DVA[%d]=<%llu:%llx:%llx>%c", d, \
(u_longlong_t)DVA_GET_VDEV(dva), \
(u_longlong_t)DVA_GET_OFFSET(dva), \
(u_longlong_t)DVA_GET_ASIZE(dva), \
ws); \
} \
if (BP_IS_GANG(bp) && \
DVA_GET_ASIZE(&bp->blk_dva[2]) <= \
DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \
copies--; \
len += func(buf + len, size - len, \
"[L%llu %s] %s %s %s %s %s %s%c" \
"size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
"cksum=%llx:%llx:%llx:%llx", \
(u_longlong_t)BP_GET_LEVEL(bp), \
type, \
checksum, \
compress, \
BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
BP_IS_GANG(bp) ? "gang" : "contiguous", \
BP_GET_DEDUP(bp) ? "dedup" : "unique", \
copyname[copies], \
ws, \
(u_longlong_t)BP_GET_LSIZE(bp), \
(u_longlong_t)BP_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth, \
(u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
(u_longlong_t)BP_GET_FILL(bp), \
ws, \
(u_longlong_t)bp->blk_cksum.zc_word[0], \
(u_longlong_t)bp->blk_cksum.zc_word[1], \
(u_longlong_t)bp->blk_cksum.zc_word[2], \
(u_longlong_t)bp->blk_cksum.zc_word[3]); \
} \
ASSERT(len < size); \
}
#define BP_GET_BUFC_TYPE(bp) \
(BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
typedef enum spa_import_type {
SPA_IMPORT_EXISTING,
SPA_IMPORT_ASSEMBLE
} spa_import_type_t;
/* state manipulation functions */
extern int spa_open(const char *pool, spa_t **, void *tag);
extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
nvlist_t *policy, nvlist_t **config);
extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
size_t buflen);
extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
nvlist_t *zplprops);
#ifdef illumos
extern int spa_import_rootpool(char *devpath, char *devid);
#else
extern int spa_import_rootpool(const char *name);
#endif
extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
uint64_t flags);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
boolean_t hardforce);
extern int spa_reset(char *pool);
extern void spa_async_request(spa_t *spa, int flag);
extern void spa_async_unrequest(spa_t *spa, int flag);
extern void spa_async_suspend(spa_t *spa);
extern void spa_async_resume(spa_t *spa);
extern spa_t *spa_inject_addref(char *pool);
extern void spa_inject_delref(spa_t *spa);
extern void spa_scan_stat_init(spa_t *spa);
extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
#define SPA_ASYNC_REMOVE_DONE 0x40
#define SPA_ASYNC_REMOVE_STOP 0x80
/*
* Controls the behavior of spa_vdev_remove().
*/
#define SPA_REMOVE_UNSPARE 0x01
#define SPA_REMOVE_DONE 0x02
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
int replacing);
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern boolean_t spa_vdev_remove_active(spa_t *spa);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
nvlist_t *props, boolean_t exp);
/* spare state (which is global across all pools) */
extern void spa_spare_add(vdev_t *vd);
extern void spa_spare_remove(vdev_t *vd);
extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
extern void spa_spare_activate(vdev_t *vd);
/* L2ARC state (which is global across all pools) */
extern void spa_l2cache_add(vdev_t *vd);
extern void spa_l2cache_remove(vdev_t *vd);
extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
extern void spa_l2cache_activate(vdev_t *vd);
extern void spa_l2cache_drop(spa_t *spa);
/* scanning */
extern int spa_scan(spa_t *spa, pool_scan_func_t func);
extern int spa_scan_stop(spa_t *spa);
extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag);
/* spa syncing */
extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
extern void spa_sync_allpools(void);
/* spa namespace global mutex */
extern kmutex_t spa_namespace_lock;
/*
* SPA configuration functions in spa_config.c
*/
#define SPA_CONFIG_UPDATE_POOL 0
#define SPA_CONFIG_UPDATE_VDEVS 1
-extern void spa_config_sync(spa_t *, boolean_t, boolean_t);
+extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
extern void spa_config_load(void);
extern nvlist_t *spa_all_configs(uint64_t *);
extern void spa_config_set(spa_t *spa, nvlist_t *config);
extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
int getstats);
extern void spa_config_update(spa_t *spa, int what);
/*
* Miscellaneous SPA routines in spa_misc.c
*/
/* Namespace manipulation */
extern spa_t *spa_lookup(const char *name);
extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
extern void spa_remove(spa_t *spa);
extern spa_t *spa_next(spa_t *prev);
/* Refcount functions */
extern void spa_open_ref(spa_t *spa, void *tag);
extern void spa_close(spa_t *spa, void *tag);
extern void spa_async_close(spa_t *spa, void *tag);
extern boolean_t spa_refcount_zero(spa_t *spa);
#define SCL_NONE 0x00
#define SCL_CONFIG 0x01
#define SCL_STATE 0x02
#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */
#define SCL_ALLOC 0x08
#define SCL_ZIO 0x10
#define SCL_FREE 0x20
#define SCL_VDEV 0x40
#define SCL_LOCKS 7
#define SCL_ALL ((1 << SCL_LOCKS) - 1)
#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO)
/* Pool configuration locks */
extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);
extern void spa_config_exit(spa_t *spa, int locks, void *tag);
extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
/* Pool vdev add/remove lock */
extern uint64_t spa_vdev_enter(spa_t *spa);
extern uint64_t spa_vdev_config_enter(spa_t *spa);
extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
int error, char *tag);
extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
/* Pool vdev state change lock */
extern void spa_vdev_state_enter(spa_t *spa, int oplock);
extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
/* Log state */
typedef enum spa_log_state {
SPA_LOG_UNKNOWN = 0, /* unknown log state */
SPA_LOG_MISSING, /* missing log(s) */
SPA_LOG_CLEAR, /* clear the log(s) */
SPA_LOG_GOOD, /* log(s) are good */
} spa_log_state_t;
extern spa_log_state_t spa_get_log_state(spa_t *spa);
extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
-extern int spa_offline_log(spa_t *spa);
+extern int spa_reset_logs(spa_t *spa);
/* Log claim callback */
extern void spa_claim_notify(zio_t *zio);
/* Accessor functions */
extern boolean_t spa_shutting_down(spa_t *spa);
extern struct dsl_pool *spa_get_dsl(spa_t *spa);
extern boolean_t spa_is_initializing(spa_t *spa);
+extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
extern void spa_altroot(spa_t *, char *, size_t);
extern int spa_sync_pass(spa_t *spa);
extern char *spa_name(spa_t *spa);
extern uint64_t spa_guid(spa_t *spa);
extern uint64_t spa_load_guid(spa_t *spa);
extern uint64_t spa_last_synced_txg(spa_t *spa);
extern uint64_t spa_first_txg(spa_t *spa);
extern uint64_t spa_syncing_txg(spa_t *spa);
extern uint64_t spa_final_dirty_txg(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
extern pool_state_t spa_state(spa_t *spa);
extern spa_load_state_t spa_load_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa);
extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
extern uint64_t spa_get_dspace(spa_t *spa);
extern uint64_t spa_get_slop_space(spa_t *spa);
extern void spa_update_dspace(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
extern boolean_t spa_deflate(spa_t *spa);
extern metaslab_class_t *spa_normal_class(spa_t *spa);
extern metaslab_class_t *spa_log_class(spa_t *spa);
extern void spa_evicting_os_register(spa_t *, objset_t *os);
extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
extern void spa_evicting_os_wait(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
extern int spa_prev_software_version(spa_t *spa);
extern int spa_busy(void);
extern uint8_t spa_get_failmode(spa_t *spa);
extern boolean_t spa_suspended(spa_t *spa);
extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
/* Miscellaneous support routines */
extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
dmu_tx_t *tx);
extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
extern int spa_rename(const char *oldname, const char *newname);
extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
extern char *spa_strdup(const char *);
extern void spa_strfree(char *);
extern uint64_t spa_get_random(uint64_t range);
extern uint64_t spa_generate_guid(spa_t *spa);
extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern int spa_change_guid(spa_t *spa);
extern void spa_upgrade(spa_t *spa, uint64_t version);
extern void spa_evict_all(void);
extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
boolean_t l2cache);
extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
+typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
+ void *arg);
+extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
+ spa_remap_cb_t callback, void *arg);
+extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
extern char *spa_his_ievent_table[];
extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
char *his_buf);
extern int spa_history_log(spa_t *spa, const char *his_buf);
extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl);
extern void spa_history_log_version(spa_t *spa, const char *operation);
extern void spa_history_log_internal(spa_t *spa, const char *operation,
dmu_tx_t *tx, const char *fmt, ...);
extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
dmu_tx_t *tx, const char *fmt, ...);
extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
dmu_tx_t *tx, const char *fmt, ...);
/* error handling */
struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, zio_t *zio);
extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd,
zio_t *zio, uint64_t stateoroffset, uint64_t length);
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
extern uint64_t spa_get_errlog_size(spa_t *spa);
extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
extern void spa_errlog_rotate(spa_t *spa);
extern void spa_errlog_drain(spa_t *spa);
extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
/* vdev cache */
extern void vdev_cache_stat_init(void);
extern void vdev_cache_stat_fini(void);
/* Initialization and termination */
extern void spa_init(int flags);
extern void spa_fini(void);
extern void spa_boot_init(void);
/* properties */
extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
/* asynchronous event notification */
extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
const char *name);
+extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl,
+ const char *name);
+extern void spa_event_post(sysevent_t *ev);
+extern void spa_event_discard(sysevent_t *ev);
#ifdef ZFS_DEBUG
#define dprintf_bp(bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \
_NOTE(CONSTCOND) } while (0)
#else
#define dprintf_bp(bp, fmt, ...)
#endif
extern boolean_t spa_debug_enabled(spa_t *spa);
#define spa_dbgmsg(spa, ...) \
{ \
if (spa_debug_enabled(spa)) \
zfs_dbgmsg(__VA_ARGS__); \
}
extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
#ifdef __cplusplus
}
#endif
#endif /* _SYS_SPA_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h (revision 332525)
@@ -1,320 +1,389 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
*/
#ifndef _SYS_SPA_IMPL_H
#define _SYS_SPA_IMPL_H
#include <sys/spa.h>
#include <sys/vdev.h>
+#include <sys/vdev_removal.h>
#include <sys/metaslab.h>
#include <sys/dmu.h>
#include <sys/dsl_pool.h>
#include <sys/uberblock_impl.h>
#include <sys/zfs_context.h>
#include <sys/avl.h>
#include <sys/refcount.h>
#include <sys/bplist.h>
#include <sys/bpobj.h>
#include <sys/zfeature.h>
#include <zfeature_common.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct spa_error_entry {
zbookmark_phys_t se_bookmark;
char *se_name;
avl_node_t se_avl;
} spa_error_entry_t;
typedef struct spa_history_phys {
uint64_t sh_pool_create_len; /* ending offset of zpool create */
uint64_t sh_phys_max_off; /* physical EOF */
uint64_t sh_bof; /* logical BOF */
uint64_t sh_eof; /* logical EOF */
uint64_t sh_records_lost; /* num of records overwritten */
} spa_history_phys_t;
+/*
+ * All members must be uint64_t, for byteswap purposes.
+ */
+typedef struct spa_removing_phys {
+ uint64_t sr_state; /* dsl_scan_state_t */
+
+ /*
+ * The vdev ID that we most recently attempted to remove,
+ * or -1 if no removal has been attempted.
+ */
+ uint64_t sr_removing_vdev;
+
+ /*
+ * The vdev ID that we most recently successfully removed,
+ * or -1 if no devices have been removed.
+ */
+ uint64_t sr_prev_indirect_vdev;
+
+ uint64_t sr_start_time;
+ uint64_t sr_end_time;
+
+ /*
+ * Note that we can not use the space map's or indirect mapping's
+ * accounting as a substitute for these values, because we need to
+ * count frees of not-yet-copied data as though it did the copy.
+ * Otherwise, we could get into a situation where copied > to_copy,
+ * or we complete before copied == to_copy.
+ */
+ uint64_t sr_to_copy; /* bytes that need to be copied */
+ uint64_t sr_copied; /* bytes that have been copied or freed */
+} spa_removing_phys_t;
+
+/*
+ * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT
+ * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense
+ * of an indirect vdev's mapping object is in progress.
+ */
+typedef struct spa_condensing_indirect_phys {
+ /*
+ * The vdev ID of the indirect vdev whose indirect mapping is
+ * being condensed.
+ */
+ uint64_t scip_vdev;
+
+ /*
+ * The vdev's old obsolete spacemap. This spacemap's contents are
+ * being integrated into the new mapping.
+ */
+ uint64_t scip_prev_obsolete_sm_object;
+
+ /*
+ * The new mapping object that is being created.
+ */
+ uint64_t scip_next_mapping_object;
+} spa_condensing_indirect_phys_t;
+
struct spa_aux_vdev {
uint64_t sav_object; /* MOS object for device list */
nvlist_t *sav_config; /* cached device config */
vdev_t **sav_vdevs; /* devices */
int sav_count; /* number devices */
boolean_t sav_sync; /* sync the device list */
nvlist_t **sav_pending; /* pending device additions */
uint_t sav_npending; /* # pending devices */
};
typedef struct spa_config_lock {
kmutex_t scl_lock;
kthread_t *scl_writer;
int scl_write_wanted;
kcondvar_t scl_cv;
refcount_t scl_count;
} spa_config_lock_t;
typedef struct spa_config_dirent {
list_node_t scd_link;
char *scd_path;
} spa_config_dirent_t;
typedef enum zio_taskq_type {
ZIO_TASKQ_ISSUE = 0,
ZIO_TASKQ_ISSUE_HIGH,
ZIO_TASKQ_INTERRUPT,
ZIO_TASKQ_INTERRUPT_HIGH,
ZIO_TASKQ_TYPES
} zio_taskq_type_t;
/*
* State machine for the zpool-poolname process. The states transitions
* are done as follows:
*
* From To Routine
* PROC_NONE -> PROC_CREATED spa_activate()
* PROC_CREATED -> PROC_ACTIVE spa_thread()
* PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate()
* PROC_DEACTIVATE -> PROC_GONE spa_thread()
* PROC_GONE -> PROC_NONE spa_deactivate()
*/
typedef enum spa_proc_state {
SPA_PROC_NONE, /* spa_proc = &p0, no process created */
SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */
SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */
SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */
SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
} spa_proc_state_t;
typedef struct spa_taskqs {
uint_t stqs_count;
taskq_t **stqs_taskq;
} spa_taskqs_t;
typedef enum spa_all_vdev_zap_action {
AVZ_ACTION_NONE = 0,
AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */
AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */
AVZ_ACTION_INITIALIZE
} spa_avz_action_t;
struct spa {
/*
* Fields protected by spa_namespace_lock.
*/
char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */
char *spa_comment; /* comment */
avl_node_t spa_avl; /* node in spa_namespace_avl */
nvlist_t *spa_config; /* last synced config */
nvlist_t *spa_config_syncing; /* currently syncing config */
nvlist_t *spa_config_splitting; /* config for splitting */
nvlist_t *spa_load_info; /* info and errors from load */
uint64_t spa_config_txg; /* txg of last config change */
int spa_sync_pass; /* iterate-to-convergence */
pool_state_t spa_state; /* pool state */
int spa_inject_ref; /* injection references */
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
+ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */
uint64_t spa_import_flags; /* import specific flags */
spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
boolean_t spa_is_initializing; /* true while opening pool */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */
uint64_t spa_first_txg; /* first txg after spa_open() */
uint64_t spa_final_txg; /* txg of export/destroy */
uint64_t spa_freeze_txg; /* freeze pool at this txg */
uint64_t spa_load_max_txg; /* best initial ub_txg */
uint64_t spa_claim_max_txg; /* highest claimed birth txg */
timespec_t spa_loaded_ts; /* 1st successful open time */
objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */
list_t spa_evicting_os_list; /* Objsets being evicted. */
kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */
txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
vdev_t *spa_root_vdev; /* top-level vdev container */
int spa_min_ashift; /* of vdevs in normal class */
int spa_max_ashift; /* of vdevs in normal class */
uint64_t spa_config_guid; /* config pool guid */
uint64_t spa_load_guid; /* spa_load initialized guid */
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
kmutex_t spa_alloc_lock;
avl_tree_t spa_alloc_tree;
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */
uint64_t spa_config_object; /* MOS object for pool config */
uint64_t spa_config_generation; /* config generation number */
uint64_t spa_syncing_txg; /* txg currently syncing */
bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
/* checksum context templates */
kmutex_t spa_cksum_tmpls_lock;
void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
uint64_t spa_last_io; /* lbolt of last non-scan I/O */
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
uint8_t spa_scrub_finished; /* indicator to rotate logs */
uint8_t spa_scrub_started; /* started since last boot */
uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */
uint64_t spa_scan_pass_start; /* start time per pass/reboot */
uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */
uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
uint64_t spa_scan_pass_exam; /* examined bytes per pass */
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
kthread_t *spa_async_thread_vd; /* thread doing vd async task */
int spa_async_suspended; /* async tasks suspended */
kcondvar_t spa_async_cv; /* wait for thread_exit() */
uint16_t spa_async_tasks; /* async task mask */
+
+ spa_removing_phys_t spa_removing_phys;
+ spa_vdev_removal_t *spa_vdev_removal;
+
+ spa_condensing_indirect_phys_t spa_condensing_indirect_phys;
+ spa_condensing_indirect_t *spa_condensing_indirect;
+ kthread_t *spa_condense_thread; /* thread doing condense. */
+
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */
uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */
uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */
uint64_t spa_load_txg; /* ub txg that loaded */
uint64_t spa_load_txg_ts; /* timestamp from that ub */
uint64_t spa_load_meta_errors; /* verify metadata err count */
uint64_t spa_load_data_errors; /* verify data err count */
uint64_t spa_verify_min_txg; /* start txg of verify scrub */
kmutex_t spa_errlog_lock; /* error log lock */
uint64_t spa_errlog_last; /* last error log object */
uint64_t spa_errlog_scrub; /* scrub error log object */
kmutex_t spa_errlist_lock; /* error list/ereport lock */
avl_tree_t spa_errlist_last; /* last error list */
avl_tree_t spa_errlist_scrub; /* scrub error list */
uint64_t spa_deflate; /* should we deflate? */
uint64_t spa_history; /* history object */
kmutex_t spa_history_lock; /* history lock */
vdev_t *spa_pending_vdev; /* pending vdev additions */
kmutex_t spa_props_lock; /* property lock */
uint64_t spa_pool_props_object; /* object for properties */
uint64_t spa_bootfs; /* default boot filesystem */
uint64_t spa_failmode; /* failure mode for the pool */
uint64_t spa_delegation; /* delegation on/off */
list_t spa_config_list; /* previous cache file(s) */
/* per-CPU array of root of async I/O: */
zio_t **spa_async_zio_root;
zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
+ zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */
uint8_t spa_claiming; /* pool is doing zil_claim() */
boolean_t spa_debug; /* debug enabled? */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
spa_log_state_t spa_log_state; /* log state */
uint64_t spa_autoexpand; /* lun expansion on/off */
uint64_t spa_bootsize; /* efi system partition size */
ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
uint64_t spa_ddt_stat_object; /* DDT statistics */
uint64_t spa_dedup_ditto; /* dedup ditto threshold */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */
kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
spa_proc_state_t spa_proc_state; /* see definition */
struct proc *spa_proc; /* "zpool-poolname" process */
uint64_t spa_did; /* if procp != p0, did of t1 */
kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */
kmutex_t spa_trim_lock; /* protects spa_trim_cv */
kcondvar_t spa_trim_cv; /* used to notify TRIM thread */
boolean_t spa_autoreplace; /* autoreplace set in open */
int spa_vdev_locks; /* locks grabbed */
uint64_t spa_creation_version; /* version at pool creation */
uint64_t spa_prev_software_version; /* See ub_software_version */
uint64_t spa_feat_for_write_obj; /* required to write to pool */
uint64_t spa_feat_for_read_obj; /* required to read from pool */
uint64_t spa_feat_desc_obj; /* Feature descriptions */
uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */
/* cache feature refcounts */
uint64_t spa_feat_refcount_cache[SPA_FEATURES];
#ifdef illumos
cyclic_id_t spa_deadman_cycid; /* cyclic id */
#else /* !illumos */
#ifdef _KERNEL
struct callout spa_deadman_cycid; /* callout id */
struct task spa_deadman_task;
#endif
#endif /* illumos */
uint64_t spa_deadman_calls; /* number of deadman calls */
hrtime_t spa_sync_starttime; /* starting time fo spa_sync */
uint64_t spa_deadman_synctime; /* deadman expiration timer */
uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */
#ifdef illumos
/*
* spa_iokstat_lock protects spa_iokstat and
* spa_queue_stats[].
*/
kmutex_t spa_iokstat_lock;
struct kstat *spa_iokstat; /* kstat of io to this pool */
struct {
int spa_active;
int spa_queued;
} spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
#endif
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
/*
* spa_refcount & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
* In order for the MDB module to function correctly, the other
* fields must remain in the same location.
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
refcount_t spa_refcount; /* number of opens */
#ifndef illumos
boolean_t spa_splitting_newspa; /* creating new spa in split */
#endif
};
extern const char *spa_config_path;
extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
+extern void spa_load_spares(spa_t *spa);
+extern void spa_load_l2cache(spa_t *spa);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_SPA_IMPL_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h (revision 332525)
@@ -1,164 +1,171 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
#define _SYS_SPACE_MAP_H
#include <sys/avl.h>
#include <sys/range_tree.h>
#include <sys/dmu.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* The size of the space map object has increased to include a histogram.
* The SPACE_MAP_SIZE_V0 designates the original size and is used to
* maintain backward compatibility.
*/
#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
#define SPACE_MAP_HISTOGRAM_SIZE 32
/*
* The space_map_phys is the on-disk representation of the space map.
* Consumers of space maps should never reference any of the members of this
* structure directly. These members may only be updated in syncing context.
*
* Note the smp_object is no longer used but remains in the structure
* for backward compatibility.
*/
typedef struct space_map_phys {
uint64_t smp_object; /* on-disk space map object */
uint64_t smp_objsize; /* size of the object */
uint64_t smp_alloc; /* space allocated from the map */
uint64_t smp_pad[5]; /* reserved */
/*
* The smp_histogram maintains a histogram of free regions. Each
* bucket, smp_histogram[i], contains the number of free regions
* whose size is:
* 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
*/
uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
} space_map_phys_t;
/*
* The space map object defines a region of space, its size, how much is
* allocated, and the on-disk object that stores this information.
* Consumers of space maps may only access the members of this structure.
+ *
+ * Note: the space_map may not be accessed concurrently; consumers
+ * must provide external locking if required.
*/
typedef struct space_map {
uint64_t sm_start; /* start of map */
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
uint64_t sm_length; /* synced length */
uint64_t sm_alloc; /* synced space allocated */
objset_t *sm_os; /* objset for this map */
uint64_t sm_object; /* object id for this map */
uint32_t sm_blksz; /* block size for space map */
dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */
space_map_phys_t *sm_phys; /* on-disk space map */
- kmutex_t *sm_lock; /* pointer to lock that protects map */
} space_map_t;
/*
* debug entry
*
* 1 3 10 50
* ,---+--------+------------+---------------------------------.
* | 1 | action | syncpass | txg (lower bits) |
* `---+--------+------------+---------------------------------'
* 63 62 60 59 50 49 0
*
*
* non-debug entry
*
* 1 47 1 15
* ,-----------------------------------------------------------.
* | 0 | offset (sm_shift units) | type | run |
* `-----------------------------------------------------------'
* 63 62 17 16 15 0
*/
/* All this stuff takes and returns bytes */
#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)
#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)
#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
typedef enum {
SM_ALLOC,
SM_FREE
} maptype_t;
+typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size,
+ void *arg);
+
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
+int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
dmu_tx_t *tx);
void space_map_update(space_map_t *sm);
uint64_t space_map_object(space_map_t *sm);
uint64_t space_map_allocated(space_map_t *sm);
uint64_t space_map_length(space_map_t *sm);
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
dmu_tx_t *tx);
void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
void space_map_free(space_map_t *sm, dmu_tx_t *tx);
+void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx);
int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
- uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp);
+ uint64_t start, uint64_t size, uint8_t shift);
void space_map_close(space_map_t *sm);
int64_t space_map_alloc_delta(space_map_t *sm);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_SPACE_MAP_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h (revision 332525)
@@ -1,176 +1,180 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_H
#define _SYS_VDEV_H
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu.h>
#include <sys/space_map.h>
#include <sys/fs/zfs.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef enum vdev_dtl_type {
DTL_MISSING, /* 0% replication: no copies of the data */
DTL_PARTIAL, /* less than 100% replication: some copies missing */
DTL_SCRUB, /* unable to fully repair during scrub/resilver */
DTL_OUTAGE, /* temporarily missing (used to attempt detach) */
DTL_TYPES
} vdev_dtl_type_t;
extern boolean_t zfs_nocacheflush;
extern boolean_t zfs_trim_enabled;
extern int vdev_open(vdev_t *);
extern void vdev_open_children(vdev_t *);
extern boolean_t vdev_uses_zvols(vdev_t *);
extern int vdev_validate(vdev_t *, boolean_t);
extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
extern void vdev_reopen(vdev_t *);
extern int vdev_validate_aux(vdev_t *vd);
extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
-
+extern boolean_t vdev_is_concrete(vdev_t *vd);
extern boolean_t vdev_is_bootable(vdev_t *vd);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
extern int vdev_count_leaves(spa_t *spa);
extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
int scrub_done);
extern boolean_t vdev_dtl_required(vdev_t *vd);
extern boolean_t vdev_resilver_needed(vdev_t *vd,
uint64_t *minp, uint64_t *maxp);
extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
dmu_tx_t *tx);
extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
+extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
+extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
+ uint64_t size, uint64_t txg);
+extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
+ uint64_t offset, uint64_t size, dmu_tx_t *tx);
extern void vdev_hold(vdev_t *);
extern void vdev_rele(vdev_t *);
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_ashift_optimize(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd);
-
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
extern void vdev_scan_stat_init(vdev_t *vd);
extern void vdev_propagate_state(vdev_t *vd);
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
extern void vdev_space_update(vdev_t *vd,
int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
vdev_state_t *);
extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
extern void vdev_clear(spa_t *spa, vdev_t *vd);
extern boolean_t vdev_is_dead(vdev_t *vd);
extern boolean_t vdev_readable(vdev_t *vd);
extern boolean_t vdev_writeable(vdev_t *vd);
extern boolean_t vdev_allocatable(vdev_t *vd);
extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
extern void vdev_cache_init(vdev_t *vd);
extern void vdev_cache_fini(vdev_t *vd);
extern boolean_t vdev_cache_read(zio_t *zio);
extern void vdev_cache_write(zio_t *zio);
extern void vdev_cache_purge(vdev_t *vd);
extern void vdev_queue_init(vdev_t *vd);
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern int vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
typedef enum vdev_config_flag {
VDEV_CONFIG_SPARE = 1 << 0,
VDEV_CONFIG_L2CACHE = 1 << 1,
VDEV_CONFIG_REMOVING = 1 << 2,
VDEV_CONFIG_MOS = 1 << 3
} vdev_config_flag_t;
extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
boolean_t getstats, vdev_config_flag_t flags);
/*
* Label routines
*/
struct uberblock;
extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
extern int vdev_label_number(uint64_t psise, uint64_t offset);
extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
typedef enum {
VDEV_LABEL_CREATE, /* create/add a new device */
VDEV_LABEL_REPLACE, /* replace an existing device */
VDEV_LABEL_SPARE, /* add a new hot spare */
VDEV_LABEL_REMOVE, /* remove an existing device */
VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */
VDEV_LABEL_SPLIT /* generating new label for split-off dev */
} vdev_labeltype_t;
extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_VDEV_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h (revision 332525)
@@ -1,403 +1,496 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
#define _SYS_VDEV_IMPL_H
#include <sys/avl.h>
+#include <sys/bpobj.h>
#include <sys/dmu.h>
#include <sys/metaslab.h>
#include <sys/nvpair.h>
#include <sys/space_map.h>
#include <sys/vdev.h>
#include <sys/dkio.h>
#include <sys/uberblock_impl.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_removal.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Virtual device descriptors.
*
* All storage pool operations go through the virtual device framework,
* which provides data replication and I/O scheduling.
*/
/*
* Forward declarations that lots of things need.
*/
typedef struct vdev_queue vdev_queue_t;
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
struct abd;
extern int zfs_vdev_queue_depth_pct;
extern uint32_t zfs_vdev_async_write_max_active;
/*
* Virtual device operations
*/
typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
uint64_t *logical_ashift, uint64_t *physical_ashift);
typedef void vdev_close_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef void vdev_hold_func_t(vdev_t *vd);
typedef void vdev_rele_func_t(vdev_t *vd);
+typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
+ uint64_t offset, uint64_t size, void *arg);
+typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
+ vdev_remap_cb_t callback, void *arg);
+
typedef struct vdev_ops {
vdev_open_func_t *vdev_op_open;
vdev_close_func_t *vdev_op_close;
vdev_asize_func_t *vdev_op_asize;
vdev_io_start_func_t *vdev_op_io_start;
vdev_io_done_func_t *vdev_op_io_done;
vdev_state_change_func_t *vdev_op_state_change;
vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele;
+ vdev_remap_func_t *vdev_op_remap;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
/*
* Virtual device properties
*/
struct vdev_cache_entry {
struct abd *ve_abd;
uint64_t ve_offset;
uint64_t ve_lastused;
avl_node_t ve_offset_node;
avl_node_t ve_lastused_node;
uint32_t ve_hits;
uint16_t ve_missed_update;
zio_t *ve_fill_io;
};
struct vdev_cache {
avl_tree_t vc_offset_tree;
avl_tree_t vc_lastused_tree;
kmutex_t vc_lock;
};
typedef struct vdev_queue_class {
uint32_t vqc_active;
/*
* Sorted by offset or timestamp, depending on if the queue is
* LBA-ordered vs FIFO.
*/
avl_tree_t vqc_queued_tree;
} vdev_queue_class_t;
struct vdev_queue {
vdev_t *vq_vdev;
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
avl_tree_t vq_active_tree;
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
uint64_t vq_last_offset;
hrtime_t vq_io_complete_ts; /* time last i/o completed */
kmutex_t vq_lock;
uint64_t vq_lastoffset;
};
/*
+ * On-disk indirect vdev state.
+ *
+ * An indirect vdev is described exclusively in the MOS config of a pool.
+ * The config for an indirect vdev includes several fields, which are
+ * accessed in memory by a vdev_indirect_config_t.
+ */
+typedef struct vdev_indirect_config {
+ /*
+ * Object (in MOS) which contains the indirect mapping. This object
+ * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
+ * vimep_src. The bonus buffer for this object is a
+ * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
+ * removal is initiated.
+ *
+ * Note that this object can be empty if none of the data on the vdev
+ * has been copied yet.
+ */
+ uint64_t vic_mapping_object;
+
+ /*
+ * Object (in MOS) which contains the birth times for the mapping
+ * entries. This object contains an array of
+ * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
+ * buffer for this object is a vdev_indirect_birth_phys_t. This object
+ * is allocated when a vdev removal is initiated.
+ *
+ * Note that this object can be empty if none of the vdev has yet been
+ * copied.
+ */
+ uint64_t vic_births_object;
+
+ /*
+ * This is the vdev ID which was removed previous to this vdev, or
+ * UINT64_MAX if there are no previously removed vdevs.
+ */
+ uint64_t vic_prev_indirect_vdev;
+} vdev_indirect_config_t;
+
+/*
* Virtual device descriptor
*/
struct vdev {
/*
* Common to all vdev types.
*/
uint64_t vdev_id; /* child number in vdev parent */
uint64_t vdev_guid; /* unique ID for this vdev */
uint64_t vdev_guid_sum; /* self guid + all child guids */
uint64_t vdev_orig_guid; /* orig. guid prior to remove */
uint64_t vdev_asize; /* allocatable device capacity */
uint64_t vdev_min_asize; /* min acceptable asize */
uint64_t vdev_max_asize; /* max acceptable asize */
uint64_t vdev_ashift; /* block alignment shift */
/*
* Logical block alignment shift
*
* The smallest sized/aligned I/O supported by the device.
*/
uint64_t vdev_logical_ashift;
/*
* Physical block alignment shift
*
* The device supports logical I/Os with vdev_logical_ashift
* size/alignment, but optimum performance will be achieved by
* aligning/sizing requests to vdev_physical_ashift. Smaller
* requests may be inflated or incur device level read-modify-write
* operations.
*
* May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
*/
uint64_t vdev_physical_ashift;
uint64_t vdev_state; /* see VDEV_STATE_* #defines */
uint64_t vdev_prevstate; /* used when reopening a vdev */
vdev_ops_t *vdev_ops; /* vdev operations */
spa_t *vdev_spa; /* spa for this vdev */
void *vdev_tsd; /* type-specific data */
vnode_t *vdev_name_vp; /* vnode for pathname */
vnode_t *vdev_devid_vp; /* vnode for devid */
vdev_t *vdev_top; /* top-level vdev */
vdev_t *vdev_parent; /* parent vdev */
vdev_t **vdev_child; /* array of children */
uint64_t vdev_children; /* number of children */
vdev_stat_t vdev_stat; /* virtual device statistics */
boolean_t vdev_expanding; /* expand the vdev? */
boolean_t vdev_reopening; /* reopen in progress? */
int vdev_open_error; /* error on last open */
kthread_t *vdev_open_thread; /* thread opening children */
uint64_t vdev_crtxg; /* txg when top-level was added */
/*
* Top-level vdev state.
*/
uint64_t vdev_ms_array; /* metaslab array object */
uint64_t vdev_ms_shift; /* metaslab size shift */
uint64_t vdev_ms_count; /* number of metaslabs */
metaslab_group_t *vdev_mg; /* metaslab group */
metaslab_t **vdev_ms; /* metaslab array */
txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
boolean_t vdev_ishole; /* is a hole in the namespace */
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
/*
+ * Values stored in the config for an indirect or removing vdev.
+ */
+ vdev_indirect_config_t vdev_indirect_config;
+
+ /*
+ * The vdev_indirect_rwlock protects the vdev_indirect_mapping
+ * pointer from changing on indirect vdevs (when it is condensed).
+ * Note that removing (not yet indirect) vdevs have different
+ * access patterns (the mapping is not accessed from open context,
+ * e.g. from zio_read) and locking strategy (e.g. svr_lock).
+ */
+ krwlock_t vdev_indirect_rwlock;
+ vdev_indirect_mapping_t *vdev_indirect_mapping;
+ vdev_indirect_births_t *vdev_indirect_births;
+
+ /*
+ * In memory data structures used to manage the obsolete sm, for
+ * indirect or removing vdevs.
+ *
+ * The vdev_obsolete_segments is the in-core record of the segments
+ * that are no longer referenced anywhere in the pool (due to
+ * being freed or remapped and not referenced by any snapshots).
+ * During a sync, segments are added to vdev_obsolete_segments
+ * via vdev_indirect_mark_obsolete(); at the end of each sync
+ * pass, this is appended to vdev_obsolete_sm via
+ * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock
+ * protects against concurrent modifications of vdev_obsolete_segments
+ * from multiple zio threads.
+ */
+ kmutex_t vdev_obsolete_lock;
+ range_tree_t *vdev_obsolete_segments;
+ space_map_t *vdev_obsolete_sm;
+
+ /*
* The queue depth parameters determine how many async writes are
* still pending (i.e. allocated by net yet issued to disk) per
* top-level (vdev_async_write_queue_depth) and the maximum allowed
* (vdev_max_async_write_queue_depth). These values only apply to
* top-level vdevs.
*/
uint64_t vdev_async_write_queue_depth;
uint64_t vdev_max_async_write_queue_depth;
/*
* Leaf vdev state.
*/
range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
space_map_t *vdev_dtl_sm; /* dirty time log space map */
txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
uint64_t vdev_dtl_object; /* DTL object */
uint64_t vdev_psize; /* physical device capacity */
uint64_t vdev_wholedisk; /* true if this is a whole disk */
uint64_t vdev_offline; /* persistent offline state */
uint64_t vdev_faulted; /* persistent faulted state */
uint64_t vdev_degraded; /* persistent degraded state */
uint64_t vdev_removed; /* persistent removed state */
uint64_t vdev_resilver_txg; /* persistent resilvering state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
char *vdev_physpath; /* vdev device path (if any) */
char *vdev_fru; /* physical FRU location */
uint64_t vdev_not_present; /* not present during import */
uint64_t vdev_unspare; /* unspare when resilvering done */
boolean_t vdev_nowritecache; /* true if flushwritecache failed */
boolean_t vdev_notrim; /* true if trim failed */
boolean_t vdev_checkremove; /* temporary online test */
boolean_t vdev_forcefault; /* force online fault */
boolean_t vdev_splitting; /* split or repair in progress */
boolean_t vdev_delayed_close; /* delayed device close? */
boolean_t vdev_tmpoffline; /* device taken offline temporarily? */
boolean_t vdev_detached; /* device detached? */
boolean_t vdev_cant_read; /* vdev is failing all reads */
boolean_t vdev_cant_write; /* vdev is failing all writes */
boolean_t vdev_isspare; /* was a hot spare */
boolean_t vdev_isl2cache; /* was a l2cache device */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
vdev_cache_t vdev_cache; /* physical block cache */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
vdev_aux_t vdev_label_aux; /* on-disk aux state */
struct trim_map *vdev_trimmap; /* map on outstanding trims */
uint16_t vdev_rotation_rate; /* rotational rate of the media */
#define VDEV_RATE_UNKNOWN 0
#define VDEV_RATE_NON_ROTATING 1
uint64_t vdev_leaf_zap;
/*
* For DTrace to work in userland (libzpool) context, these fields must
* remain at the end of the structure. DTrace will use the kernel's
* CTF definition for 'struct vdev', and since the size of a kmutex_t is
* larger in userland, the offsets for the rest of the fields would be
* incorrect.
*/
kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
kmutex_t vdev_stat_lock; /* vdev_stat */
kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
};
#define VDEV_RAIDZ_MAXPARITY 3
#define VDEV_PAD_SIZE (8 << 10)
/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)
/* The largest uberblock we support is 8k. */
#define MAX_UBERBLOCK_SHIFT (13)
#define VDEV_UBERBLOCK_SHIFT(vd) \
MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
MAX_UBERBLOCK_SHIFT)
#define VDEV_UBERBLOCK_COUNT(vd) \
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define VDEV_UBERBLOCK_OFFSET(vd, n) \
offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
typedef struct vdev_phys {
char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
zio_eck_t vp_zbt;
} vdev_phys_t;
typedef struct vdev_label {
char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
char vl_pad2[VDEV_PAD_SIZE]; /* 8K */
vdev_phys_t vl_vdev_phys; /* 112K */
char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
} vdev_label_t; /* 256K total */
/*
* vdev_dirty() flags
*/
#define VDD_METASLAB 0x01
#define VDD_DTL 0x02
/* Offset of embedded boot loader region on each label */
#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
/*
* Size of embedded boot loader region on each label.
* The total size of the first two labels plus the boot area is 4MB.
*/
#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
/*
* Size of label regions at the start and end of each leaf device.
*/
#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
#define VDEV_LABELS 4
#define VDEV_BEST_LABEL VDEV_LABELS
#define VDEV_ALLOC_LOAD 0
#define VDEV_ALLOC_ADD 1
#define VDEV_ALLOC_SPARE 2
#define VDEV_ALLOC_L2CACHE 3
#define VDEV_ALLOC_ROOTPOOL 4
#define VDEV_ALLOC_SPLIT 5
#define VDEV_ALLOC_ATTACH 6
/*
* Allocate or free a vdev
*/
extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
vdev_ops_t *ops);
extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
vdev_t *parent, uint_t id, int alloctype);
extern void vdev_free(vdev_t *vd);
/*
* Add or remove children and parents
*/
extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
extern void vdev_compact_children(vdev_t *pvd);
extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
extern boolean_t vdev_log_state_valid(vdev_t *vd);
-extern void vdev_load(vdev_t *vd);
+extern int vdev_load(vdev_t *vd);
extern int vdev_dtl_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
/*
* Available vdev types.
*/
extern vdev_ops_t vdev_root_ops;
extern vdev_ops_t vdev_mirror_ops;
extern vdev_ops_t vdev_replacing_ops;
extern vdev_ops_t vdev_raidz_ops;
#ifdef _KERNEL
extern vdev_ops_t vdev_geom_ops;
#else
extern vdev_ops_t vdev_disk_ops;
#endif
extern vdev_ops_t vdev_file_ops;
extern vdev_ops_t vdev_missing_ops;
extern vdev_ops_t vdev_hole_ops;
extern vdev_ops_t vdev_spare_ops;
+extern vdev_ops_t vdev_indirect_ops;
/*
* Common size functions
*/
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);
/*
* Global variables
*/
/* zdb uses this tunable, so it must be declared here to make lint happy. */
extern int zfs_vdev_cache_size;
extern uint_t zfs_geom_probe_vdev_key;
+
+/*
+ * Functions from vdev_indirect.c
+ */
+extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx);
+extern boolean_t vdev_indirect_should_condense(vdev_t *vd);
+extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
+extern int vdev_obsolete_sm_object(vdev_t *vd);
+extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd);
#ifdef illumos
/*
* The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
*/
typedef struct vdev_buf {
buf_t vb_buf; /* buffer that describes the io */
zio_t *vb_io; /* pointer back to the original zio_t */
} vdev_buf_t;
#endif
#ifdef __cplusplus
}
#endif
#endif /* _SYS_VDEV_IMPL_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h (nonexistent)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h (revision 332525)
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H
+#define _SYS_VDEV_INDIRECT_BIRTHS_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_indirect_birth_entry_phys {
+ uint64_t vibe_offset;
+ uint64_t vibe_phys_birth_txg;
+} vdev_indirect_birth_entry_phys_t;
+
+typedef struct vdev_indirect_birth_phys {
+ uint64_t vib_count; /* count of v_i_b_entry_phys_t's */
+} vdev_indirect_birth_phys_t;
+
+typedef struct vdev_indirect_births {
+ uint64_t vib_object;
+
+ /*
+ * Each entry indicates that everything up to but not including
+ * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted
+ * by increasing phys_birth, and also by increasing offset. See
+ * vdev_indirect_births_physbirth for usage.
+ */
+ vdev_indirect_birth_entry_phys_t *vib_entries;
+
+ objset_t *vib_objset;
+
+ dmu_buf_t *vib_dbuf;
+ vdev_indirect_birth_phys_t *vib_phys;
+} vdev_indirect_births_t;
+
+extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os,
+ uint64_t object);
+extern void vdev_indirect_births_close(vdev_indirect_births_t *vib);
+extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib);
+extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx);
+extern void vdev_indirect_births_free(objset_t *os, uint64_t object,
+ dmu_tx_t *tx);
+
+extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib);
+extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib);
+
+extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
+ uint64_t offset, uint64_t txg, dmu_tx_t *tx);
+
+extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib,
+ uint64_t offset, uint64_t asize);
+
+extern uint64_t vdev_indirect_births_last_entry_txg(
+ vdev_indirect_births_t *vib);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */
Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h (nonexistent)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h (revision 332525)
@@ -0,0 +1,141 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INDIRECT_MAPPING_H
+#define _SYS_VDEV_INDIRECT_MAPPING_H
+
+#include <sys/dmu.h>
+#include <sys/list.h>
+#include <sys/spa.h>
+#include <sys/space_map.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_indirect_mapping_entry_phys {
+ /*
+ * Decode with DVA_MAPPING_* macros.
+ * Contains:
+ * the source offset (low 63 bits)
+ * the one-bit "mark", used for garbage collection (by zdb)
+ */
+ uint64_t vimep_src;
+
+ /*
+ * Note: the DVA's asize is 24 bits, and can thus store ranges
+ * up to 8GB.
+ */
+ dva_t vimep_dst;
+} vdev_indirect_mapping_entry_phys_t;
+
+#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \
+ BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \
+ BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+typedef struct vdev_indirect_mapping_entry {
+ vdev_indirect_mapping_entry_phys_t vime_mapping;
+ uint32_t vime_obsolete_count;
+ list_node_t vime_node;
+} vdev_indirect_mapping_entry_t;
+
+/*
+ * This is stored in the bonus buffer of the mapping object, see comment of
+ * vdev_indirect_config for more details.
+ */
+typedef struct vdev_indirect_mapping_phys {
+ uint64_t vimp_max_offset;
+ uint64_t vimp_bytes_mapped;
+ uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */
+
+ /*
+ * For each entry in the mapping object, this object contains an
+ * entry representing the number of bytes of that mapping entry
+ * that were no longer in use by the pool at the time this indirect
+ * vdev was last condensed.
+ */
+ uint64_t vimp_counts_object;
+} vdev_indirect_mapping_phys_t;
+
+#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t))
+
+typedef struct vdev_indirect_mapping {
+ uint64_t vim_object;
+ boolean_t vim_havecounts;
+
+ /*
+ * An ordered array of all mapping entries, sorted by source offset.
+ * Note that vim_entries is needed during a removal (and contains
+ * mappings that have been synced to disk so far) to handle frees
+ * from the removing device.
+ */
+ vdev_indirect_mapping_entry_phys_t *vim_entries;
+
+ objset_t *vim_objset;
+
+ dmu_buf_t *vim_dbuf;
+ vdev_indirect_mapping_phys_t *vim_phys;
+} vdev_indirect_mapping_t;
+
+extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os,
+ uint64_t object);
+extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx);
+extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj,
+ dmu_tx_t *tx);
+
+extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_bytes_mapped(
+ vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim);
+
+/*
+ * Writes the given list of vdev_indirect_mapping_entry_t to the mapping
+ * then updates internal state.
+ */
+extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
+ list_t *vime_list, dmu_tx_t *tx);
+
+extern vdev_indirect_mapping_entry_phys_t *
+ vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
+ uint64_t offset);
+
+extern vdev_indirect_mapping_entry_phys_t *
+ vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
+ uint64_t offset);
+
+extern uint32_t *vdev_indirect_mapping_load_obsolete_counts(
+ vdev_indirect_mapping_t *vim);
+extern void vdev_indirect_mapping_load_obsolete_spacemap(
+ vdev_indirect_mapping_t *vim,
+ uint32_t *counts, space_map_t *obsolete_space_sm);
+extern void vdev_indirect_mapping_increment_obsolete_count(
+ vdev_indirect_mapping_t *vim,
+ uint64_t offset, uint64_t asize, uint32_t *counts);
+extern void vdev_indirect_mapping_free_obsolete_counts(
+ vdev_indirect_mapping_t *vim, uint32_t *counts);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */
Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h (nonexistent)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h (revision 332525)
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_REMOVAL_H
+#define _SYS_VDEV_REMOVAL_H
+
+#include <sys/spa.h>
+#include <sys/bpobj.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_vdev_removal {
+ vdev_t *svr_vdev;
+ uint64_t svr_max_offset_to_sync[TXG_SIZE];
+ /* Thread performing a vdev removal. */
+ kthread_t *svr_thread;
+ /* Segments left to copy from the current metaslab. */
+ range_tree_t *svr_allocd_segs;
+ kmutex_t svr_lock;
+ kcondvar_t svr_cv;
+ boolean_t svr_thread_exit;
+
+ /*
+ * New mappings to write out each txg.
+ */
+ list_t svr_new_segments[TXG_SIZE];
+
+ /*
+ * Ranges that were freed while a mapping was in flight. This is
+ * a subset of the ranges covered by vdev_im_new_segments.
+ */
+ range_tree_t *svr_frees[TXG_SIZE];
+
+ /*
+ * Number of bytes which we have finished our work for
+ * in each txg. This could be data copied (which will be part of
+ * the mappings in vdev_im_new_segments), or data freed before
+ * we got around to copying it.
+ */
+ uint64_t svr_bytes_done[TXG_SIZE];
+
+ /* List of leaf zap objects to be unlinked */
+ nvlist_t *svr_zaplist;
+} spa_vdev_removal_t;
+
+typedef struct spa_condensing_indirect {
+ /*
+ * New mappings to write out each txg.
+ */
+ list_t sci_new_mapping_entries[TXG_SIZE];
+
+ vdev_indirect_mapping_t *sci_new_mapping;
+} spa_condensing_indirect_t;
+
+extern int spa_remove_init(spa_t *);
+extern void spa_restart_removal(spa_t *);
+extern int spa_condense_init(spa_t *);
+extern void spa_condense_fini(spa_t *);
+extern void spa_condense_indirect_restart(spa_t *);
+extern void spa_vdev_condense_suspend(spa_t *);
+extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
+extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
+extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
+extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
+extern void spa_vdev_remove_suspend(spa_t *);
+extern int spa_vdev_remove_cancel(spa_t *);
+extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_REMOVAL_H */
Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h (revision 332525)
@@ -1,98 +1,99 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
#define _SYS_ZFS_DEBUG_H
#ifdef __cplusplus
extern "C" {
#endif
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
/*
* ZFS debugging
*/
#if defined(DEBUG) || !defined(_KERNEL)
#if !defined(ZFS_DEBUG)
#define ZFS_DEBUG
#endif
#endif
extern int zfs_flags;
extern boolean_t zfs_recover;
extern boolean_t zfs_free_leak_on_eio;
#define ZFS_DEBUG_DPRINTF (1 << 0)
#define ZFS_DEBUG_DBUF_VERIFY (1 << 1)
#define ZFS_DEBUG_DNODE_VERIFY (1 << 2)
#define ZFS_DEBUG_SNAPNAMES (1 << 3)
#define ZFS_DEBUG_MODIFY (1 << 4)
#define ZFS_DEBUG_SPA (1 << 5)
#define ZFS_DEBUG_ZIO_FREE (1 << 6)
#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7)
#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8)
+#define ZFS_DEBUG_INDIRECT_REMAP (1 << 9)
#ifdef ZFS_DEBUG
extern void __dprintf(const char *file, const char *func,
int line, const char *fmt, ...);
#define dprintf(...) \
if (zfs_flags & ZFS_DEBUG_DPRINTF) \
__dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
#else
#define dprintf(...) ((void)0)
#endif /* ZFS_DEBUG */
extern void zfs_panic_recover(const char *fmt, ...);
typedef struct zfs_dbgmsg {
list_node_t zdm_node;
time_t zdm_timestamp;
char zdm_msg[1]; /* variable length allocation */
} zfs_dbgmsg_t;
extern void zfs_dbgmsg_init(void);
extern void zfs_dbgmsg_fini(void);
extern void zfs_dbgmsg(const char *fmt, ...);
extern void zfs_dbgmsg_print(const char *tag);
#ifdef illumos
#ifndef _KERNEL
extern int dprintf_find_string(const char *string);
#endif
#endif /* illumos */
#ifdef __cplusplus
}
#endif
#endif /* _SYS_ZFS_DEBUG_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h (revision 332525)
@@ -1,448 +1,448 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
#ifndef _SYS_ZIL_H
#define _SYS_ZIL_H
#include <sys/types.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu.h>
#ifdef __cplusplus
extern "C" {
#endif
struct dsl_pool;
struct dsl_dataset;
struct lwb;
/*
* Intent log format:
*
* Each objset has its own intent log. The log header (zil_header_t)
* for objset N's intent log is kept in the Nth object of the SPA's
* intent_log objset. The log header points to a chain of log blocks,
* each of which contains log records (i.e., transactions) followed by
* a log block trailer (zil_trailer_t). The format of a log record
* depends on the record (or transaction) type, but all records begin
* with a common structure that defines the type, length, and txg.
*/
/*
* Intent log header - this on disk structure holds fields to manage
* the log. All fields are 64 bit to easily handle cross architectures.
*/
typedef struct zil_header {
uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
uint64_t zh_replay_seq; /* highest replayed sequence number */
blkptr_t zh_log; /* log chain */
uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
uint64_t zh_flags; /* header flags */
uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
uint64_t zh_pad[3];
} zil_header_t;
/*
* zh_flags bit settings
*/
#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */
/*
* Log block chaining.
*
* Log blocks are chained together. Originally they were chained at the
* end of the block. For performance reasons the chain was moved to the
* beginning of the block which allows writes for only the data being used.
* The older position is supported for backwards compatability.
*
* The zio_eck_t contains a zec_cksum which for the intent log is
* the sequence number of this log block. A seq of 0 is invalid.
* The zec_cksum is checked by the SPA against the sequence
* number passed in the blk_cksum field of the blkptr_t
*/
typedef struct zil_chain {
uint64_t zc_pad;
blkptr_t zc_next_blk; /* next block in chain */
uint64_t zc_nused; /* bytes in log block used */
zio_eck_t zc_eck; /* block trailer */
} zil_chain_t;
#define ZIL_MIN_BLKSZ 4096ULL
/*
* ziltest is by and large an ugly hack, but very useful in
* checking replay without tedious work.
* When running ziltest we want to keep all itx's and so maintain
* a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
* We subtract TXG_CONCURRENT_STATES to allow for common code.
*/
#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
/*
* The words of a log block checksum.
*/
#define ZIL_ZC_GUID_0 0
#define ZIL_ZC_GUID_1 1
#define ZIL_ZC_OBJSET 2
#define ZIL_ZC_SEQ 3
typedef enum zil_create {
Z_FILE,
Z_DIR,
Z_XATTRDIR,
} zil_create_t;
/*
* size of xvattr log section.
* its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps
* for create time and a single 64 bit integer for all of the attributes,
* and 4 64 bit integers (32 bytes) for the scanstamp.
*
*/
#define ZIL_XVAT_SIZE(mapsize) \
sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \
(sizeof (uint64_t) * 7)
/*
* Size of ACL in log. The ACE data is padded out to properly align
* on 8 byte boundary.
*/
#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t)))
/*
* Intent log transaction types and record structures
*/
#define TX_COMMIT 0 /* Commit marker (no on-disk state) */
#define TX_CREATE 1 /* Create file */
#define TX_MKDIR 2 /* Make directory */
#define TX_MKXATTR 3 /* Make XATTR directory */
#define TX_SYMLINK 4 /* Create symbolic link to a file */
#define TX_REMOVE 5 /* Remove file */
#define TX_RMDIR 6 /* Remove directory */
#define TX_LINK 7 /* Create hard link to a file */
#define TX_RENAME 8 /* Rename a file */
#define TX_WRITE 9 /* File write */
#define TX_TRUNCATE 10 /* Truncate a file */
#define TX_SETATTR 11 /* Set file attributes */
#define TX_ACL_V0 12 /* Set old formatted ACL */
#define TX_ACL 13 /* Set ACL */
#define TX_CREATE_ACL 14 /* create with ACL */
#define TX_CREATE_ATTR 15 /* create + attrs */
#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
#define TX_MKDIR_ACL 17 /* mkdir with ACL */
#define TX_MKDIR_ATTR 18 /* mkdir with attr */
#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
#define TX_WRITE2 20 /* dmu_sync EALREADY write */
#define TX_MAX_TYPE 21 /* Max transaction type */
/*
* The transactions for mkdir, symlink, remove, rmdir, link, and rename
* may have the following bit set, indicating the original request
* specified case-insensitive handling of names.
*/
#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
/*
* Transactions for write, truncate, setattr, acl_v0, and acl can be logged
* out of order. For convenience in the code, all such records must have
* lr_foid at the same offset.
*/
#define TX_OOO(txtype) \
((txtype) == TX_WRITE || \
(txtype) == TX_TRUNCATE || \
(txtype) == TX_SETATTR || \
(txtype) == TX_ACL_V0 || \
(txtype) == TX_ACL || \
(txtype) == TX_WRITE2)
/*
* Format of log records.
* The fields are carefully defined to allow them to be aligned
* and sized the same on sparc & intel architectures.
* Each log record has a common structure at the beginning.
*
* The log record on disk (lrc_seq) holds the sequence number of all log
* records which is used to ensure we don't replay the same record.
*/
typedef struct { /* common log record header */
uint64_t lrc_txtype; /* intent log transaction type */
uint64_t lrc_reclen; /* transaction record length */
uint64_t lrc_txg; /* dmu transaction group number */
uint64_t lrc_seq; /* see comment above */
} lr_t;
/*
* Common start of all out-of-order record types (TX_OOO() above).
*/
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_foid; /* object id */
} lr_ooo_t;
/*
* Handle option extended vattr attributes.
*
* Whenever new attributes are added the version number
* will need to be updated as will code in
* zfs_log.c and zfs_replay.c
*/
typedef struct {
uint32_t lr_attr_masksize; /* number of elements in array */
uint32_t lr_attr_bitmap; /* First entry of array */
/* remainder of array and any additional fields */
} lr_attr_t;
/*
* log record for creates without optional ACL.
* This log record does support optional xvattr_t attributes.
*/
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_doid; /* object id of directory */
uint64_t lr_foid; /* object id of created file object */
uint64_t lr_mode; /* mode of object */
uint64_t lr_uid; /* uid of object */
uint64_t lr_gid; /* gid of object */
uint64_t lr_gen; /* generation (txg of creation) */
uint64_t lr_crtime[2]; /* creation time */
uint64_t lr_rdev; /* rdev of object to create */
/* name of object to create follows this */
/* for symlinks, link content follows name */
/* for creates with xvattr data, the name follows the xvattr info */
} lr_create_t;
/*
* FUID ACL record will be an array of ACEs from the original ACL.
* If this array includes ephemeral IDs, the record will also include
* an array of log-specific FUIDs to replace the ephemeral IDs.
* Only one copy of each unique domain will be present, so the log-specific
* FUIDs will use an index into a compressed domain table. On replay this
* information will be used to construct real FUIDs (and bypass idmap,
* since it may not be available).
*/
/*
* Log record for creates with optional ACL
* This log record is also used for recording any FUID
* information needed for replaying the create. If the
* file doesn't have any actual ACEs then the lr_aclcnt
* would be zero.
*
* After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's.
* If create is also setting xvattr's, then acl data follows xvattr.
* If ACE FUIDs are needed then they will follow the xvattr_t. Following
* the FUIDs will be the domain table information. The FUIDs for the owner
* and group will be in lr_create. Name follows ACL data.
*/
typedef struct {
lr_create_t lr_create; /* common create portion */
uint64_t lr_aclcnt; /* number of ACEs in ACL */
uint64_t lr_domcnt; /* number of unique domains */
uint64_t lr_fuidcnt; /* number of real fuids */
uint64_t lr_acl_bytes; /* number of bytes in ACL */
uint64_t lr_acl_flags; /* ACL flags */
} lr_acl_create_t;
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_doid; /* obj id of directory */
/* name of object to remove follows this */
} lr_remove_t;
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_doid; /* obj id of directory */
uint64_t lr_link_obj; /* obj id of link */
/* name of object to link follows this */
} lr_link_t;
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_sdoid; /* obj id of source directory */
uint64_t lr_tdoid; /* obj id of target directory */
/* 2 strings: names of source and destination follow this */
} lr_rename_t;
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_foid; /* file object to write */
uint64_t lr_offset; /* offset to write to */
uint64_t lr_length; /* user data length to write */
uint64_t lr_blkoff; /* no longer used */
blkptr_t lr_blkptr; /* spa block pointer for replay */
/* write data will follow for small writes */
} lr_write_t;
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_foid; /* object id of file to truncate */
uint64_t lr_offset; /* offset to truncate from */
uint64_t lr_length; /* length to truncate */
} lr_truncate_t;
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_foid; /* file object to change attributes */
uint64_t lr_mask; /* mask of attributes to set */
uint64_t lr_mode; /* mode to set */
uint64_t lr_uid; /* uid to set */
uint64_t lr_gid; /* gid to set */
uint64_t lr_size; /* size to set */
uint64_t lr_atime[2]; /* access time */
uint64_t lr_mtime[2]; /* modification time */
/* optional attribute lr_attr_t may be here */
} lr_setattr_t;
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_foid; /* obj id of file */
uint64_t lr_aclcnt; /* number of acl entries */
/* lr_aclcnt number of ace_t entries follow this */
} lr_acl_v0_t;
typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_foid; /* obj id of file */
uint64_t lr_aclcnt; /* number of ACEs in ACL */
uint64_t lr_domcnt; /* number of unique domains */
uint64_t lr_fuidcnt; /* number of real fuids */
uint64_t lr_acl_bytes; /* number of bytes in ACL */
uint64_t lr_acl_flags; /* ACL flags */
/* lr_acl_bytes number of variable sized ace's follows */
} lr_acl_t;
/*
* ZIL structure definitions, interface function prototype and globals.
*/
/*
* Writes are handled in three different ways:
*
* WR_INDIRECT:
* In this mode, if we need to commit the write later, then the block
* is immediately written into the file system (using dmu_sync),
* and a pointer to the block is put into the log record.
* When the txg commits the block is linked in.
* This saves additionally writing the data into the log record.
* There are a few requirements for this to occur:
* - write is greater than zfs/zvol_immediate_write_sz
* - not using slogs (as slogs are assumed to always be faster
* than writing into the main pool)
* - the write occupies only one block
* WR_COPIED:
* If we know we'll immediately be committing the
* transaction (FSYNC or FDSYNC), the we allocate a larger
* log record here for the data and copy the data in.
* WR_NEED_COPY:
* Otherwise we don't allocate a buffer, and *if* we need to
* flush the write later then a buffer is allocated and
* we retrieve the data using the dmu.
*/
typedef enum {
WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
/* and put blkptr in log, rather than actual data) */
WR_COPIED, /* immediate - data is copied into lr_write_t */
WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
WR_NUM_STATES /* number of states */
} itx_wr_state_t;
typedef struct itx {
list_node_t itx_node; /* linkage on zl_itx_list */
void *itx_private; /* type-specific opaque data */
itx_wr_state_t itx_wr_state; /* write state */
uint8_t itx_sync; /* synchronous transaction */
uint64_t itx_oid; /* object id */
lr_t itx_lr; /* common part of log record */
/* followed by type-specific part of lr_xx_t and its immediate data */
} itx_t;
typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
uint64_t txg);
typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
uint64_t txg);
typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap);
typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf,
struct lwb *lwb, zio_t *zio);
extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
extern void zil_init(void);
extern void zil_fini(void);
extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
extern void zil_free(zilog_t *zilog);
extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
extern void zil_close(zilog_t *zilog);
extern void zil_replay(objset_t *os, void *arg,
zil_replay_func_t *replay_func[TX_MAX_TYPE]);
extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
extern void zil_itx_destroy(itx_t *itx);
extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid);
extern void zil_commit(zilog_t *zilog, uint64_t oid);
extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
-extern int zil_vdev_offline(const char *osname, void *txarg);
+extern int zil_reset(const char *osname, void *txarg);
extern int zil_claim(struct dsl_pool *dp,
struct dsl_dataset *ds, void *txarg);
extern int zil_check_log_chain(struct dsl_pool *dp,
struct dsl_dataset *ds, void *tx);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
extern int zil_suspend(const char *osname, void **cookiep);
extern void zil_resume(void *cookie);
extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);
extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
extern int zil_replay_disable;
#ifdef __cplusplus
}
#endif
#endif /* _SYS_ZIL_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (revision 332525)
@@ -1,663 +1,663 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright 2016 Toomas Soome <tsoome@me.com>
*/
#ifndef _ZIO_H
#define _ZIO_H
#include <sys/zio_priority.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/kstat.h>
#include <sys/fs/zfs.h>
#include <sys/zio_impl.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Embedded checksum
*/
#define ZEC_MAGIC 0x210da7ab10c7a11ULL
typedef struct zio_eck {
uint64_t zec_magic; /* for validation, endianness */
zio_cksum_t zec_cksum; /* 256-bit checksum */
} zio_eck_t;
/*
* Gang block headers are self-checksumming and contain an array
* of block pointers.
*/
#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
sizeof (zio_eck_t)) / sizeof (blkptr_t))
#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
sizeof (zio_eck_t) - \
(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
sizeof (uint64_t))
typedef struct zio_gbh {
blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
uint64_t zg_filler[SPA_GBH_FILLER];
zio_eck_t zg_tail;
} zio_gbh_phys_t;
enum zio_checksum {
ZIO_CHECKSUM_INHERIT = 0,
ZIO_CHECKSUM_ON,
ZIO_CHECKSUM_OFF,
ZIO_CHECKSUM_LABEL,
ZIO_CHECKSUM_GANG_HEADER,
ZIO_CHECKSUM_ZILOG,
ZIO_CHECKSUM_FLETCHER_2,
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
ZIO_CHECKSUM_ZILOG2,
ZIO_CHECKSUM_NOPARITY,
ZIO_CHECKSUM_SHA512,
ZIO_CHECKSUM_SKEIN,
#ifdef illumos
ZIO_CHECKSUM_EDONR,
#endif
ZIO_CHECKSUM_FUNCTIONS
};
/*
* The number of "legacy" compression functions which can be set on individual
* objects.
*/
#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
#define ZIO_CHECKSUM_MASK 0xffULL
#define ZIO_CHECKSUM_VERIFY (1 << 8)
#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
#define ZIO_DEDUPDITTO_MIN 100
/*
* The number of "legacy" compression functions which can be set on individual
* objects.
*/
#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
/*
* The meaning of "compress = on" selected by the compression features enabled
* on a given pool.
*/
#define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB
#define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4
#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
#define BOOTFS_COMPRESS_VALID(compress) \
((compress) == ZIO_COMPRESS_LZJB || \
(compress) == ZIO_COMPRESS_LZ4 || \
(compress) == ZIO_COMPRESS_ON || \
(compress) == ZIO_COMPRESS_OFF)
#define ZIO_FAILURE_MODE_WAIT 0
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2
enum zio_flag {
/*
* Flags inherited by gang, ddt, and vdev children,
* and that must be equal for two zios to aggregate
*/
ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
ZIO_FLAG_IO_REPAIR = 1 << 1,
ZIO_FLAG_SELF_HEAL = 1 << 2,
ZIO_FLAG_RESILVER = 1 << 3,
ZIO_FLAG_SCRUB = 1 << 4,
ZIO_FLAG_SCAN_THREAD = 1 << 5,
ZIO_FLAG_PHYSICAL = 1 << 6,
#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
/*
* Flags inherited by ddt, gang, and vdev children.
*/
ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
ZIO_FLAG_SPECULATIVE = 1 << 8,
ZIO_FLAG_CONFIG_WRITER = 1 << 9,
ZIO_FLAG_DONT_RETRY = 1 << 10,
ZIO_FLAG_DONT_CACHE = 1 << 11,
ZIO_FLAG_NODATA = 1 << 12,
ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
ZIO_FLAG_IO_ALLOCATING = 1 << 14,
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
/*
* Flags inherited by vdev children.
*/
ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
ZIO_FLAG_PROBE = 1 << 16,
ZIO_FLAG_TRYHARD = 1 << 17,
ZIO_FLAG_OPTIONAL = 1 << 18,
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
/*
* Flags not inherited by any children.
*/
ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
ZIO_FLAG_IO_BYPASS = 1 << 21,
ZIO_FLAG_IO_REWRITE = 1 << 22,
ZIO_FLAG_RAW = 1 << 23,
ZIO_FLAG_GANG_CHILD = 1 << 24,
ZIO_FLAG_DDT_CHILD = 1 << 25,
ZIO_FLAG_GODFATHER = 1 << 26,
ZIO_FLAG_NOPWRITE = 1 << 27,
ZIO_FLAG_REEXECUTED = 1 << 28,
ZIO_FLAG_DELEGATED = 1 << 29,
};
#define ZIO_FLAG_MUSTSUCCEED 0
#define ZIO_DDT_CHILD_FLAGS(zio) \
(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
#define ZIO_GANG_CHILD_FLAGS(zio) \
(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
#define ZIO_VDEV_CHILD_FLAGS(zio) \
(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
- ZIO_FLAG_CANFAIL)
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
#define ZIO_CHILD_BIT(x) (1 << (x))
#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
enum zio_child {
ZIO_CHILD_VDEV = 0,
ZIO_CHILD_GANG,
ZIO_CHILD_DDT,
ZIO_CHILD_LOGICAL,
ZIO_CHILD_TYPES
};
#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
#define ZIO_CHILD_ALL_BITS \
(ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
enum zio_wait_type {
ZIO_WAIT_READY = 0,
ZIO_WAIT_DONE,
ZIO_WAIT_TYPES
};
/*
* We'll take the number 122 and 123 to indicate checksum errors and
* fragmentation. Those doesn't collide with any errno values as they
* are greater than ELAST.
*/
#define ECKSUM 122
#define EFRAGS 123
typedef void zio_done_func_t(zio_t *zio);
extern boolean_t zio_dva_throttle_enabled;
extern const char *zio_type_name[ZIO_TYPES];
/*
* A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
* identifies any block in the pool. By convention, the meta-objset (MOS)
* is objset 0, and the meta-dnode is object 0. This covers all blocks
* except root blocks and ZIL blocks, which are defined as follows:
*
* Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
* ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
* dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
* dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
*
* Note: this structure is called a bookmark because its original purpose
* was to remember where to resume a pool-wide traverse.
*
* Note: this structure is passed between userland and the kernel, and is
* stored on disk (by virtue of being incorporated into other on-disk
* structures, e.g. dsl_scan_phys_t).
*/
typedef struct zbookmark_phys {
uint64_t zb_objset;
uint64_t zb_object;
int64_t zb_level;
uint64_t zb_blkid;
} zbookmark_phys_t;
#define SET_BOOKMARK(zb, objset, object, level, blkid) \
{ \
(zb)->zb_objset = objset; \
(zb)->zb_object = object; \
(zb)->zb_level = level; \
(zb)->zb_blkid = blkid; \
}
#define ZB_DESTROYED_OBJSET (-1ULL)
#define ZB_ROOT_OBJECT (0ULL)
#define ZB_ROOT_LEVEL (-1LL)
#define ZB_ROOT_BLKID (0ULL)
#define ZB_ZIL_OBJECT (0ULL)
#define ZB_ZIL_LEVEL (-2LL)
#define ZB_DNODE_LEVEL (-3LL)
#define ZB_DNODE_BLKID (0ULL)
#define ZB_IS_ZERO(zb) \
((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
#define ZB_IS_ROOT(zb) \
((zb)->zb_object == ZB_ROOT_OBJECT && \
(zb)->zb_level == ZB_ROOT_LEVEL && \
(zb)->zb_blkid == ZB_ROOT_BLKID)
typedef struct zio_prop {
enum zio_checksum zp_checksum;
enum zio_compress zp_compress;
dmu_object_type_t zp_type;
uint8_t zp_level;
uint8_t zp_copies;
boolean_t zp_dedup;
boolean_t zp_dedup_verify;
boolean_t zp_nopwrite;
} zio_prop_t;
typedef struct zio_cksum_report zio_cksum_report_t;
typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
const void *good_data);
typedef void zio_cksum_free_f(void *cbdata, size_t size);
struct zio_bad_cksum; /* defined in zio_checksum.h */
struct dnode_phys;
struct abd;
struct zio_cksum_report {
struct zio_cksum_report *zcr_next;
nvlist_t *zcr_ereport;
nvlist_t *zcr_detector;
void *zcr_cbdata;
size_t zcr_cbinfo; /* passed to zcr_free() */
uint64_t zcr_align;
uint64_t zcr_length;
zio_cksum_finish_f *zcr_finish;
zio_cksum_free_f *zcr_free;
/* internal use only */
struct zio_bad_cksum *zcr_ckinfo; /* information from failure */
};
typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
void *arg);
zio_vsd_cksum_report_f zio_vsd_default_cksum_report;
typedef struct zio_vsd_ops {
zio_done_func_t *vsd_free;
zio_vsd_cksum_report_f *vsd_cksum_report;
} zio_vsd_ops_t;
typedef struct zio_gang_node {
zio_gbh_phys_t *gn_gbh;
struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS];
} zio_gang_node_t;
typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
zio_gang_node_t *gn, struct abd *data, uint64_t offset);
typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
typedef struct zio_transform {
struct abd *zt_orig_abd;
uint64_t zt_orig_size;
uint64_t zt_bufsize;
zio_transform_func_t *zt_transform;
struct zio_transform *zt_next;
} zio_transform_t;
typedef int zio_pipe_stage_t(zio_t *zio);
/*
* The io_reexecute flags are distinct from io_flags because the child must
* be able to propagate them to the parent. The normal io_flags are local
* to the zio, not protected by any lock, and not modifiable by children;
* the reexecute flags are protected by io_lock, modifiable by children,
* and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
*/
#define ZIO_REEXECUTE_NOW 0x01
#define ZIO_REEXECUTE_SUSPEND 0x02
typedef struct zio_alloc_list {
list_t zal_list;
uint64_t zal_size;
} zio_alloc_list_t;
typedef struct zio_link {
zio_t *zl_parent;
zio_t *zl_child;
list_node_t zl_parent_node;
list_node_t zl_child_node;
} zio_link_t;
/*
* Used for TRIM kstat.
*/
typedef struct zio_trim_stats {
/*
* Number of bytes successfully TRIMmed.
*/
kstat_named_t bytes;
/*
* Number of successful TRIM requests.
*/
kstat_named_t success;
/*
* Number of TRIM requests that failed because TRIM is not
* supported.
*/
kstat_named_t unsupported;
/*
* Number of TRIM requests that failed for other reasons.
*/
kstat_named_t failed;
} zio_trim_stats_t;
extern zio_trim_stats_t zio_trim_stats;
#define ZIO_TRIM_STAT_INCR(stat, val) \
atomic_add_64(&zio_trim_stats.stat.value.ui64, (val));
#define ZIO_TRIM_STAT_BUMP(stat) \
ZIO_TRIM_STAT_INCR(stat, 1);
struct zio {
/* Core information about this I/O */
zbookmark_phys_t io_bookmark;
zio_prop_t io_prop;
zio_type_t io_type;
enum zio_child io_child_type;
int io_cmd;
zio_priority_t io_priority;
uint8_t io_reexecute;
uint8_t io_state[ZIO_WAIT_TYPES];
uint64_t io_txg;
spa_t *io_spa;
blkptr_t *io_bp;
blkptr_t *io_bp_override;
blkptr_t io_bp_copy;
list_t io_parent_list;
list_t io_child_list;
zio_t *io_logical;
zio_transform_t *io_transform_stack;
/* Callback info */
zio_done_func_t *io_ready;
zio_done_func_t *io_children_ready;
zio_done_func_t *io_physdone;
zio_done_func_t *io_done;
void *io_private;
int64_t io_prev_space_delta; /* DMU private */
blkptr_t io_bp_orig;
/* Data represented by this I/O */
struct abd *io_abd;
struct abd *io_orig_abd;
uint64_t io_size;
uint64_t io_orig_size;
/* io_lsize != io_orig_size iff this is a raw write */
uint64_t io_lsize;
/* Stuff for the vdev stack */
vdev_t *io_vd;
void *io_vsd;
const zio_vsd_ops_t *io_vsd_ops;
uint64_t io_offset;
hrtime_t io_timestamp;
hrtime_t io_queued_timestamp;
hrtime_t io_target_timestamp;
avl_node_t io_queue_node;
avl_node_t io_offset_node;
avl_node_t io_alloc_node;
zio_alloc_list_t io_alloc_list;
#ifdef __FreeBSD__
struct bio *io_bio;
#endif
/* Internal pipeline state */
enum zio_flag io_flags;
enum zio_stage io_stage;
enum zio_stage io_pipeline;
enum zio_flag io_orig_flags;
enum zio_stage io_orig_stage;
enum zio_stage io_orig_pipeline;
enum zio_stage io_pipeline_trace;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
uint64_t io_child_count;
uint64_t io_phys_children;
uint64_t io_parent_count;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
void *io_executor;
void *io_waiter;
kmutex_t io_lock;
kcondvar_t io_cv;
/* FMA state */
zio_cksum_report_t *io_cksum_report;
uint64_t io_ena;
/* Taskq dispatching state */
taskq_ent_t io_tqent;
avl_node_t io_trim_node;
list_node_t io_trim_link;
};
extern int zio_bookmark_compare(const void *, const void *);
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
zio_done_func_t *done, void *priv, enum zio_flag flags);
extern zio_t *zio_root(spa_t *spa,
zio_done_func_t *done, void *priv, enum zio_flag flags);
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *priv, zio_priority_t priority, enum zio_flag flags,
const zbookmark_phys_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
boolean_t nopwrite);
extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp,
zio_done_func_t *done, void *priv, enum zio_flag flags);
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
zio_priority_t priority, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *priv, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *priv, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);
extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, uint64_t size, enum zio_flag flags);
extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
blkptr_t *old_bp, uint64_t size, boolean_t *slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
uint64_t size);
extern void zio_shrink(zio_t *zio, uint64_t size);
extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);
extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
uint64_t bufsize, zio_transform_func_t *transform);
extern void zio_pop_transforms(zio_t *zio);
extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
uint64_t offset, struct abd *data, uint64_t size, int type,
zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *priv);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
struct abd *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *priv);
extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);
extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);
extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
enum zio_checksum parent);
extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
enum zio_checksum child, enum zio_checksum parent);
extern enum zio_compress zio_compress_select(spa_t *spa,
enum zio_compress child, enum zio_compress parent);
extern void zio_suspend(spa_t *spa, zio_t *zio);
extern int zio_resume(spa_t *spa);
extern void zio_resume_wait(spa_t *spa);
/*
* Initial setup and teardown.
*/
extern void zio_init(void);
extern void zio_fini(void);
/*
* Fault injection
*/
struct zinject_record;
extern uint32_t zio_injection_enabled;
extern int zio_inject_fault(char *name, int flags, int *id,
struct zinject_record *record);
extern int zio_inject_list_next(int *id, char *name, size_t buflen,
struct zinject_record *record);
extern int zio_clear_fault(int id);
extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
extern void zio_handle_ignored_writes(zio_t *zio);
extern hrtime_t zio_handle_io_delay(zio_t *zio);
/*
* Checksum ereport functions
*/
extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
const void *good_data, const void *bad_data, boolean_t drop_if_identical);
extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
/* If we have the good data in hand, this function can be used */
extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
struct zio *zio, uint64_t offset, uint64_t length,
const void *good_data, const void *bad_data, struct zio_bad_cksum *info);
/* Called from spa_sync(), but primarily an injection handler */
extern void spa_handle_ignored_writes(spa_t *spa);
/* zbookmark_phys functions */
boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
#ifdef __cplusplus
}
#endif
#endif /* _ZIO_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h (revision 332525)
@@ -1,41 +1,42 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
#ifndef _ZIO_PRIORITY_H
#define _ZIO_PRIORITY_H
#ifdef __cplusplus
extern "C" {
#endif
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
+ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
#ifdef __cplusplus
}
#endif
#endif /* _ZIO_PRIORITY_H */
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c (revision 332525)
@@ -1,901 +1,903 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org>
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/zil.h>
#include <sys/callb.h>
/*
* ZFS Transaction Groups
* ----------------------
*
* ZFS transaction groups are, as the name implies, groups of transactions
* that act on persistent state. ZFS asserts consistency at the granularity of
* these transaction groups. Each successive transaction group (txg) is
* assigned a 64-bit consecutive identifier. There are three active
* transaction group states: open, quiescing, or syncing. At any given time,
* there may be an active txg associated with each state; each active txg may
* either be processing, or blocked waiting to enter the next state. There may
* be up to three active txgs, and there is always a txg in the open state
* (though it may be blocked waiting to enter the quiescing state). In broad
* strokes, transactions -- operations that change in-memory structures -- are
* accepted into the txg in the open state, and are completed while the txg is
* in the open or quiescing states. The accumulated changes are written to
* disk in the syncing state.
*
* Open
*
* When a new txg becomes active, it first enters the open state. New
* transactions -- updates to in-memory structures -- are assigned to the
* currently open txg. There is always a txg in the open state so that ZFS can
* accept new changes (though the txg may refuse new changes if it has hit
* some limit). ZFS advances the open txg to the next state for a variety of
* reasons such as it hitting a time or size threshold, or the execution of an
* administrative action that must be completed in the syncing state.
*
* Quiescing
*
* After a txg exits the open state, it enters the quiescing state. The
* quiescing state is intended to provide a buffer between accepting new
* transactions in the open state and writing them out to stable storage in
* the syncing state. While quiescing, transactions can continue their
* operation without delaying either of the other states. Typically, a txg is
* in the quiescing state very briefly since the operations are bounded by
* software latencies rather than, say, slower I/O latencies. After all
* transactions complete, the txg is ready to enter the next state.
*
* Syncing
*
* In the syncing state, the in-memory state built up during the open and (to
* a lesser degree) the quiescing states is written to stable storage. The
* process of writing out modified data can, in turn modify more data. For
* example when we write new blocks, we need to allocate space for them; those
* allocations modify metadata (space maps)... which themselves must be
* written to stable storage. During the sync state, ZFS iterates, writing out
* data until it converges and all in-memory changes have been written out.
* The first such pass is the largest as it encompasses all the modified user
* data (as opposed to filesystem metadata). Subsequent passes typically have
* far less data to write as they consist exclusively of filesystem metadata.
*
* To ensure convergence, after a certain number of passes ZFS begins
* overwriting locations on stable storage that had been allocated earlier in
* the syncing state (and subsequently freed). ZFS usually allocates new
* blocks to optimize for large, continuous, writes. For the syncing state to
* converge however it must complete a pass where no new blocks are allocated
* since each allocation requires a modification of persistent metadata.
* Further, to hasten convergence, after a prescribed number of passes, ZFS
* also defers frees, and stops compressing.
*
* In addition to writing out user data, we must also execute synctasks during
* the syncing context. A synctask is the mechanism by which some
* administrative activities work such as creating and destroying snapshots or
* datasets. Note that when a synctask is initiated it enters the open txg,
* and ZFS then pushes that txg as quickly as possible to completion of the
* syncing state in order to reduce the latency of the administrative
* activity. To complete the syncing state, ZFS writes out a new uberblock,
* the root of the tree of blocks that comprise all state stored on the ZFS
* pool. Finally, if there is a quiesced txg waiting, we signal that it can
* now transition to the syncing state.
*/
static void txg_sync_thread(void *arg);
static void txg_quiesce_thread(void *arg);
int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG");
SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0,
"Maximum seconds worth of delta per txg");
/*
* Prepare the txg subsystem.
*/
void
txg_init(dsl_pool_t *dp, uint64_t txg)
{
tx_state_t *tx = &dp->dp_tx;
int c;
bzero(tx, sizeof (tx_state_t));
tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
for (c = 0; c < max_ncpus; c++) {
int i;
mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
NULL);
for (i = 0; i < TXG_SIZE; i++) {
cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
NULL);
list_create(&tx->tx_cpu[c].tc_callbacks[i],
sizeof (dmu_tx_callback_t),
offsetof(dmu_tx_callback_t, dcb_node));
}
}
mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
tx->tx_open_txg = txg;
}
/*
* Close down the txg subsystem.
*/
void
txg_fini(dsl_pool_t *dp)
{
tx_state_t *tx = &dp->dp_tx;
int c;
ASSERT0(tx->tx_threads);
mutex_destroy(&tx->tx_sync_lock);
cv_destroy(&tx->tx_sync_more_cv);
cv_destroy(&tx->tx_sync_done_cv);
cv_destroy(&tx->tx_quiesce_more_cv);
cv_destroy(&tx->tx_quiesce_done_cv);
cv_destroy(&tx->tx_exit_cv);
for (c = 0; c < max_ncpus; c++) {
int i;
mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
mutex_destroy(&tx->tx_cpu[c].tc_lock);
for (i = 0; i < TXG_SIZE; i++) {
cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
}
}
if (tx->tx_commit_cb_taskq != NULL)
taskq_destroy(tx->tx_commit_cb_taskq);
kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
bzero(tx, sizeof (tx_state_t));
}
/*
* Start syncing transaction groups.
*/
void
txg_sync_start(dsl_pool_t *dp)
{
tx_state_t *tx = &dp->dp_tx;
mutex_enter(&tx->tx_sync_lock);
dprintf("pool %p\n", dp);
ASSERT0(tx->tx_threads);
tx->tx_threads = 2;
tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
dp, 0, &p0, TS_RUN, minclsyspri);
/*
* The sync thread can need a larger-than-default stack size on
* 32-bit x86. This is due in part to nested pools and
* scrub_visitbp() recursion.
*/
tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
dp, 0, &p0, TS_RUN, minclsyspri);
mutex_exit(&tx->tx_sync_lock);
}
static void
txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
{
CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
mutex_enter(&tx->tx_sync_lock);
}
static void
txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
{
ASSERT(*tpp != NULL);
*tpp = NULL;
tx->tx_threads--;
cv_broadcast(&tx->tx_exit_cv);
CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
thread_exit();
}
static void
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
CALLB_CPR_SAFE_BEGIN(cpr);
if (time)
(void) cv_timedwait(cv, &tx->tx_sync_lock, time);
else
cv_wait(cv, &tx->tx_sync_lock);
CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
}
/*
* Stop syncing transaction groups.
*/
void
txg_sync_stop(dsl_pool_t *dp)
{
tx_state_t *tx = &dp->dp_tx;
dprintf("pool %p\n", dp);
/*
* Finish off any work in progress.
*/
ASSERT3U(tx->tx_threads, ==, 2);
/*
* We need to ensure that we've vacated the deferred space_maps.
*/
txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
/*
* Wake all sync threads and wait for them to die.
*/
mutex_enter(&tx->tx_sync_lock);
ASSERT3U(tx->tx_threads, ==, 2);
tx->tx_exiting = 1;
cv_broadcast(&tx->tx_quiesce_more_cv);
cv_broadcast(&tx->tx_quiesce_done_cv);
cv_broadcast(&tx->tx_sync_more_cv);
while (tx->tx_threads != 0)
cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
tx->tx_exiting = 0;
mutex_exit(&tx->tx_sync_lock);
}
uint64_t
txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
{
tx_state_t *tx = &dp->dp_tx;
tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
uint64_t txg;
mutex_enter(&tc->tc_open_lock);
txg = tx->tx_open_txg;
mutex_enter(&tc->tc_lock);
tc->tc_count[txg & TXG_MASK]++;
mutex_exit(&tc->tc_lock);
th->th_cpu = tc;
th->th_txg = txg;
return (txg);
}
void
txg_rele_to_quiesce(txg_handle_t *th)
{
tx_cpu_t *tc = th->th_cpu;
ASSERT(!MUTEX_HELD(&tc->tc_lock));
mutex_exit(&tc->tc_open_lock);
}
void
txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
{
tx_cpu_t *tc = th->th_cpu;
int g = th->th_txg & TXG_MASK;
mutex_enter(&tc->tc_lock);
list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
mutex_exit(&tc->tc_lock);
}
void
txg_rele_to_sync(txg_handle_t *th)
{
tx_cpu_t *tc = th->th_cpu;
int g = th->th_txg & TXG_MASK;
mutex_enter(&tc->tc_lock);
ASSERT(tc->tc_count[g] != 0);
if (--tc->tc_count[g] == 0)
cv_broadcast(&tc->tc_cv[g]);
mutex_exit(&tc->tc_lock);
th->th_cpu = NULL; /* defensive */
}
/*
* Blocks until all transactions in the group are committed.
*
* On return, the transaction group has reached a stable state in which it can
* then be passed off to the syncing context.
*/
static __noinline void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
tx_state_t *tx = &dp->dp_tx;
int g = txg & TXG_MASK;
int c;
/*
* Grab all tc_open_locks so nobody else can get into this txg.
*/
for (c = 0; c < max_ncpus; c++)
mutex_enter(&tx->tx_cpu[c].tc_open_lock);
ASSERT(txg == tx->tx_open_txg);
tx->tx_open_txg++;
tx->tx_open_time = gethrtime();
DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
/*
* Now that we've incremented tx_open_txg, we can let threads
* enter the next transaction group.
*/
for (c = 0; c < max_ncpus; c++)
mutex_exit(&tx->tx_cpu[c].tc_open_lock);
/*
* Quiesce the transaction group by waiting for everyone to txg_exit().
*/
for (c = 0; c < max_ncpus; c++) {
tx_cpu_t *tc = &tx->tx_cpu[c];
mutex_enter(&tc->tc_lock);
while (tc->tc_count[g] != 0)
cv_wait(&tc->tc_cv[g], &tc->tc_lock);
mutex_exit(&tc->tc_lock);
}
}
static void
txg_do_callbacks(void *arg)
{
list_t *cb_list = arg;
dmu_tx_do_callbacks(cb_list, 0);
list_destroy(cb_list);
kmem_free(cb_list, sizeof (list_t));
}
/*
* Dispatch the commit callbacks registered on this txg to worker threads.
*
* If no callbacks are registered for a given TXG, nothing happens.
* This function creates a taskq for the associated pool, if needed.
*/
static void
txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
{
int c;
tx_state_t *tx = &dp->dp_tx;
list_t *cb_list;
for (c = 0; c < max_ncpus; c++) {
tx_cpu_t *tc = &tx->tx_cpu[c];
/*
* No need to lock tx_cpu_t at this point, since this can
* only be called once a txg has been synced.
*/
int g = txg & TXG_MASK;
if (list_is_empty(&tc->tc_callbacks[g]))
continue;
if (tx->tx_commit_cb_taskq == NULL) {
/*
* Commit callback taskq hasn't been created yet.
*/
tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
TASKQ_PREPOPULATE);
}
cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
list_create(cb_list, sizeof (dmu_tx_callback_t),
offsetof(dmu_tx_callback_t, dcb_node));
list_move_tail(cb_list, &tc->tc_callbacks[g]);
(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
txg_do_callbacks, cb_list, TQ_SLEEP);
}
}
static void
txg_sync_thread(void *arg)
{
dsl_pool_t *dp = arg;
spa_t *spa = dp->dp_spa;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
uint64_t start, delta;
txg_thread_enter(tx, &cpr);
start = delta = 0;
for (;;) {
uint64_t timeout = zfs_txg_timeout * hz;
uint64_t timer;
uint64_t txg;
/*
* We sync when we're scanning, there's someone waiting
* on us, or the quiesce thread has handed off a txg to
* us, or we have reached our timeout.
*/
timer = (delta >= timeout ? 0 : timeout - delta);
while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
tx->tx_quiesced_txg == 0 &&
dp->dp_dirty_total < zfs_dirty_data_sync) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
delta = ddi_get_lbolt() - start;
timer = (delta > timeout ? 0 : timeout - delta);
}
/*
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.
*/
while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
cv_broadcast(&tx->tx_quiesce_more_cv);
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
}
if (tx->tx_exiting)
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
/*
* Consume the quiesced txg which has been handed off to
* us. This may cause the quiescing thread to now be
* able to quiesce another txg, so we must signal it.
*/
txg = tx->tx_quiesced_txg;
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_quiesce_more_cv);
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
start = ddi_get_lbolt();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
mutex_enter(&tx->tx_sync_lock);
tx->tx_synced_txg = txg;
tx->tx_syncing_txg = 0;
DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_done_cv);
/*
* Dispatch commit callbacks to worker threads.
*/
txg_dispatch_callbacks(dp, txg);
}
}
static void
txg_quiesce_thread(void *arg)
{
dsl_pool_t *dp = arg;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
txg_thread_enter(tx, &cpr);
for (;;) {
uint64_t txg;
/*
* We quiesce when there's someone waiting on us.
* However, we can only have one txg in "quiescing" or
* "quiesced, waiting to sync" state. So we wait until
* the "quiesced, waiting to sync" txg has been consumed
* by the sync thread.
*/
while (!tx->tx_exiting &&
(tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
tx->tx_quiesced_txg != 0))
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
if (tx->tx_exiting)
txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
txg = tx->tx_open_txg;
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting,
tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
txg_quiesce(dp, txg);
mutex_enter(&tx->tx_sync_lock);
/*
* Hand this txg off to the sync thread.
*/
dprintf("quiesce done, handing off txg %llu\n", txg);
tx->tx_quiesced_txg = txg;
DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_more_cv);
cv_broadcast(&tx->tx_quiesce_done_cv);
}
}
/*
* Delay this thread by delay nanoseconds if we are still in the open
* transaction group and there is already a waiting txg quiesing or quiesced.
* Abort the delay if this txg stalls or enters the quiesing state.
*/
void
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
tx_state_t *tx = &dp->dp_tx;
hrtime_t start = gethrtime();
/* don't delay if this txg could transition to quiescing immediately */
if (tx->tx_open_txg > txg ||
tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
return;
mutex_enter(&tx->tx_sync_lock);
if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
mutex_exit(&tx->tx_sync_lock);
return;
}
while (gethrtime() - start < delay &&
tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
&tx->tx_sync_lock, delay, resolution, 0);
}
mutex_exit(&tx->tx_sync_lock);
}
void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
tx_state_t *tx = &dp->dp_tx;
ASSERT(!dsl_pool_config_held(dp));
mutex_enter(&tx->tx_sync_lock);
ASSERT3U(tx->tx_threads, ==, 2);
if (txg == 0)
txg = tx->tx_open_txg + TXG_DEFER_SIZE;
if (tx->tx_sync_txg_waiting < txg)
tx->tx_sync_txg_waiting = txg;
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
while (tx->tx_synced_txg < txg) {
dprintf("broadcasting sync more "
"tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
cv_broadcast(&tx->tx_sync_more_cv);
cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
}
mutex_exit(&tx->tx_sync_lock);
}
void
txg_wait_open(dsl_pool_t *dp, uint64_t txg)
{
tx_state_t *tx = &dp->dp_tx;
ASSERT(!dsl_pool_config_held(dp));
mutex_enter(&tx->tx_sync_lock);
ASSERT3U(tx->tx_threads, ==, 2);
if (txg == 0)
txg = tx->tx_open_txg + 1;
if (tx->tx_quiesce_txg_waiting < txg)
tx->tx_quiesce_txg_waiting = txg;
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
while (tx->tx_open_txg < txg) {
cv_broadcast(&tx->tx_quiesce_more_cv);
cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
}
mutex_exit(&tx->tx_sync_lock);
}
/*
* If there isn't a txg syncing or in the pipeline, push another txg through
* the pipeline by queiscing the open txg.
*/
void
txg_kick(dsl_pool_t *dp)
{
tx_state_t *tx = &dp->dp_tx;
ASSERT(!dsl_pool_config_held(dp));
mutex_enter(&tx->tx_sync_lock);
if (tx->tx_syncing_txg == 0 &&
tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
tx->tx_quiesced_txg <= tx->tx_synced_txg) {
tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
cv_broadcast(&tx->tx_quiesce_more_cv);
}
mutex_exit(&tx->tx_sync_lock);
}
boolean_t
txg_stalled(dsl_pool_t *dp)
{
tx_state_t *tx = &dp->dp_tx;
return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
}
boolean_t
txg_sync_waiting(dsl_pool_t *dp)
{
tx_state_t *tx = &dp->dp_tx;
return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
tx->tx_quiesced_txg != 0);
}
/*
* Verify that this txg is active (open, quiescing, syncing). Non-active
* txg's should not be manipulated.
*/
void
txg_verify(spa_t *spa, uint64_t txg)
{
dsl_pool_t *dp = spa_get_dsl(spa);
if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
return;
ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
}
/*
* Per-txg object lists.
*/
void
txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
{
int t;
mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
tl->tl_offset = offset;
tl->tl_spa = spa;
for (t = 0; t < TXG_SIZE; t++)
tl->tl_head[t] = NULL;
}
void
txg_list_destroy(txg_list_t *tl)
{
int t;
for (t = 0; t < TXG_SIZE; t++)
ASSERT(txg_list_empty(tl, t));
mutex_destroy(&tl->tl_lock);
}
boolean_t
txg_list_empty(txg_list_t *tl, uint64_t txg)
{
txg_verify(tl->tl_spa, txg);
return (tl->tl_head[txg & TXG_MASK] == NULL);
}
/*
* Returns true if all txg lists are empty.
*
* Warning: this is inherently racy (an item could be added immediately
* after this function returns). We don't bother with the lock because
* it wouldn't change the semantics.
*/
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
for (int i = 0; i < TXG_SIZE; i++) {
if (!txg_list_empty(tl, i)) {
return (B_FALSE);
}
}
return (B_TRUE);
}
/*
* Add an entry to the list (unless it's already on the list).
* Returns B_TRUE if it was actually added.
*/
boolean_t
txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
{
int t = txg & TXG_MASK;
txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
boolean_t add;
txg_verify(tl->tl_spa, txg);
mutex_enter(&tl->tl_lock);
add = (tn->tn_member[t] == 0);
if (add) {
tn->tn_member[t] = 1;
tn->tn_next[t] = tl->tl_head[t];
tl->tl_head[t] = tn;
}
mutex_exit(&tl->tl_lock);
return (add);
}
/*
* Add an entry to the end of the list, unless it's already on the list.
* (walks list to find end)
* Returns B_TRUE if it was actually added.
*/
boolean_t
txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
{
int t = txg & TXG_MASK;
txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
boolean_t add;
txg_verify(tl->tl_spa, txg);
mutex_enter(&tl->tl_lock);
add = (tn->tn_member[t] == 0);
if (add) {
txg_node_t **tp;
for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
continue;
tn->tn_member[t] = 1;
tn->tn_next[t] = NULL;
*tp = tn;
}
mutex_exit(&tl->tl_lock);
return (add);
}
/*
* Remove the head of the list and return it.
*/
void *
txg_list_remove(txg_list_t *tl, uint64_t txg)
{
int t = txg & TXG_MASK;
txg_node_t *tn;
void *p = NULL;
txg_verify(tl->tl_spa, txg);
mutex_enter(&tl->tl_lock);
if ((tn = tl->tl_head[t]) != NULL) {
+ ASSERT(tn->tn_member[t]);
+ ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
p = (char *)tn - tl->tl_offset;
tl->tl_head[t] = tn->tn_next[t];
tn->tn_next[t] = NULL;
tn->tn_member[t] = 0;
}
mutex_exit(&tl->tl_lock);
return (p);
}
/*
* Remove a specific item from the list and return it.
*/
void *
txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
{
int t = txg & TXG_MASK;
txg_node_t *tn, **tp;
txg_verify(tl->tl_spa, txg);
mutex_enter(&tl->tl_lock);
for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
if ((char *)tn - tl->tl_offset == p) {
*tp = tn->tn_next[t];
tn->tn_next[t] = NULL;
tn->tn_member[t] = 0;
mutex_exit(&tl->tl_lock);
return (p);
}
}
mutex_exit(&tl->tl_lock);
return (NULL);
}
boolean_t
txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
{
int t = txg & TXG_MASK;
txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
txg_verify(tl->tl_spa, txg);
return (tn->tn_member[t] != 0);
}
/*
* Walk a txg list -- only safe if you know it's not changing.
*/
void *
txg_list_head(txg_list_t *tl, uint64_t txg)
{
int t = txg & TXG_MASK;
txg_node_t *tn = tl->tl_head[t];
txg_verify(tl->tl_spa, txg);
return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}
void *
txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
{
int t = txg & TXG_MASK;
txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
txg_verify(tl->tl_spa, txg);
tn = tn->tn_next[t];
return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 332525)
@@ -1,3644 +1,3821 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
* Copyright 2017 Joyent, Inc.
*/
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
+#include <sys/bpobj.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/space_reftree.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/trim_map.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
/*
* Virtual device management.
*/
/*
* The limit for ZFS to automatically increase a top-level vdev's ashift
* from logical ashift to physical ashift.
*
* Example: one or more 512B emulation child vdevs
* child->vdev_ashift = 9 (512 bytes)
* child->vdev_physical_ashift = 12 (4096 bytes)
* zfs_max_auto_ashift = 11 (2048 bytes)
* zfs_min_auto_ashift = 9 (512 bytes)
*
* On pool creation or the addition of a new top-level vdev, ZFS will
* increase the ashift of the top-level vdev to 2048 as limited by
* zfs_max_auto_ashift.
*
* Example: one or more 512B emulation child vdevs
* child->vdev_ashift = 9 (512 bytes)
* child->vdev_physical_ashift = 12 (4096 bytes)
* zfs_max_auto_ashift = 13 (8192 bytes)
* zfs_min_auto_ashift = 9 (512 bytes)
*
* On pool creation or the addition of a new top-level vdev, ZFS will
* increase the ashift of the top-level vdev to 4096 to match the
* max vdev_physical_ashift.
*
* Example: one or more 512B emulation child vdevs
* child->vdev_ashift = 9 (512 bytes)
* child->vdev_physical_ashift = 9 (512 bytes)
* zfs_max_auto_ashift = 13 (8192 bytes)
* zfs_min_auto_ashift = 12 (4096 bytes)
*
* On pool creation or the addition of a new top-level vdev, ZFS will
* increase the ashift of the top-level vdev to 4096 to match the
* zfs_min_auto_ashift.
*/
static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;
static int
sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
{
uint64_t val;
int err;
val = zfs_max_auto_ashift;
err = sysctl_handle_64(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
return (EINVAL);
zfs_max_auto_ashift = val;
return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
sysctl_vfs_zfs_max_auto_ashift, "QU",
"Max ashift used when optimising for logical -> physical sectors size on "
"new top-level vdevs.");
static int
sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
{
uint64_t val;
int err;
val = zfs_min_auto_ashift;
err = sysctl_handle_64(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
return (EINVAL);
zfs_min_auto_ashift = val;
return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
sysctl_vfs_zfs_min_auto_ashift, "QU",
"Min ashift used when creating new top-level vdevs.");
static vdev_ops_t *vdev_ops_table[] = {
&vdev_root_ops,
&vdev_raidz_ops,
&vdev_mirror_ops,
&vdev_replacing_ops,
&vdev_spare_ops,
#ifdef _KERNEL
&vdev_geom_ops,
#else
&vdev_disk_ops,
#endif
&vdev_file_ops,
&vdev_missing_ops,
&vdev_hole_ops,
+ &vdev_indirect_ops,
NULL
};
/*
* When a vdev is added, it will be divided into approximately (but no
* more than) this number of metaslabs.
*/
int metaslabs_per_vdev = 200;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN,
&metaslabs_per_vdev, 0,
"When a vdev is added, how many metaslabs the vdev should be divided into");
/*
* Given a vdev type, return the appropriate ops vector.
*/
static vdev_ops_t *
vdev_getops(const char *type)
{
vdev_ops_t *ops, **opspp;
for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
if (strcmp(ops->vdev_op_type, type) == 0)
break;
return (ops);
}
/*
* Default asize function: return the MAX of psize with the asize of
* all children. This is what's used by anything other than RAID-Z.
*/
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
for (int c = 0; c < vd->vdev_children; c++) {
csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
asize = MAX(asize, csize);
}
return (asize);
}
/*
* Get the minimum allocatable size. We define the allocatable size as
* the vdev's asize rounded to the nearest metaslab. This allows us to
* replace or attach devices which don't have the same physical size but
* can still satisfy the same number of allocations.
*/
uint64_t
vdev_get_min_asize(vdev_t *vd)
{
vdev_t *pvd = vd->vdev_parent;
/*
* If our parent is NULL (inactive spare or cache) or is the root,
* just return our own asize.
*/
if (pvd == NULL)
return (vd->vdev_asize);
/*
* The top-level vdev just returns the allocatable size rounded
* to the nearest metaslab.
*/
if (vd == vd->vdev_top)
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
/*
* The allocatable space for a raidz vdev is N * sizeof(smallest child),
* so each child must provide at least 1/Nth of its asize.
*/
if (pvd->vdev_ops == &vdev_raidz_ops)
return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
pvd->vdev_children);
return (pvd->vdev_min_asize);
}
void
vdev_set_min_asize(vdev_t *vd)
{
vd->vdev_min_asize = vdev_get_min_asize(vd);
for (int c = 0; c < vd->vdev_children; c++)
vdev_set_min_asize(vd->vdev_child[c]);
}
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
vdev_t *rvd = spa->spa_root_vdev;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
if (vdev < rvd->vdev_children) {
ASSERT(rvd->vdev_child[vdev] != NULL);
return (rvd->vdev_child[vdev]);
}
return (NULL);
}
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
vdev_t *mvd;
if (vd->vdev_guid == guid)
return (vd);
for (int c = 0; c < vd->vdev_children; c++)
if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
NULL)
return (mvd);
return (NULL);
}
static int
vdev_count_leaves_impl(vdev_t *vd)
{
int n = 0;
if (vd->vdev_ops->vdev_op_leaf)
return (1);
for (int c = 0; c < vd->vdev_children; c++)
n += vdev_count_leaves_impl(vd->vdev_child[c]);
return (n);
}
int
vdev_count_leaves(spa_t *spa)
{
return (vdev_count_leaves_impl(spa->spa_root_vdev));
}
void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
size_t oldsize, newsize;
uint64_t id = cvd->vdev_id;
vdev_t **newchild;
spa_t *spa = cvd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
ASSERT(cvd->vdev_parent == NULL);
cvd->vdev_parent = pvd;
if (pvd == NULL)
return;
ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
oldsize = pvd->vdev_children * sizeof (vdev_t *);
pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
newsize = pvd->vdev_children * sizeof (vdev_t *);
newchild = kmem_zalloc(newsize, KM_SLEEP);
if (pvd->vdev_child != NULL) {
bcopy(pvd->vdev_child, newchild, oldsize);
kmem_free(pvd->vdev_child, oldsize);
}
pvd->vdev_child = newchild;
pvd->vdev_child[id] = cvd;
cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
/*
* Walk up all ancestors to update guid sum.
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}
void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
int c;
uint_t id = cvd->vdev_id;
ASSERT(cvd->vdev_parent == pvd);
if (pvd == NULL)
return;
ASSERT(id < pvd->vdev_children);
ASSERT(pvd->vdev_child[id] == cvd);
pvd->vdev_child[id] = NULL;
cvd->vdev_parent = NULL;
for (c = 0; c < pvd->vdev_children; c++)
if (pvd->vdev_child[c])
break;
if (c == pvd->vdev_children) {
kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
pvd->vdev_child = NULL;
pvd->vdev_children = 0;
}
/*
* Walk up all ancestors to update guid sum.
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}
/*
* Remove any holes in the child array.
*/
void
vdev_compact_children(vdev_t *pvd)
{
vdev_t **newchild, *cvd;
int oldc = pvd->vdev_children;
int newc;
ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
for (int c = newc = 0; c < oldc; c++)
if (pvd->vdev_child[c])
newc++;
newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
for (int c = newc = 0; c < oldc; c++) {
if ((cvd = pvd->vdev_child[c]) != NULL) {
newchild[newc] = cvd;
cvd->vdev_id = newc++;
}
}
kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
pvd->vdev_child = newchild;
pvd->vdev_children = newc;
}
/*
* Allocate and minimally initialize a vdev_t.
*/
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
vdev_t *vd;
+ vdev_indirect_config_t *vic;
vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+ vic = &vd->vdev_indirect_config;
if (spa->spa_root_vdev == NULL) {
ASSERT(ops == &vdev_root_ops);
spa->spa_root_vdev = vd;
spa->spa_load_guid = spa_generate_guid(NULL);
}
if (guid == 0 && ops != &vdev_hole_ops) {
if (spa->spa_root_vdev == vd) {
/*
* The root vdev's guid will also be the pool guid,
* which must be unique among all pools.
*/
guid = spa_generate_guid(NULL);
} else {
/*
* Any other vdev's guid must be unique within the pool.
*/
guid = spa_generate_guid(spa);
}
ASSERT(!spa_guid_exists(spa_guid(spa), guid));
}
vd->vdev_spa = spa;
vd->vdev_id = id;
vd->vdev_guid = guid;
vd->vdev_guid_sum = guid;
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
vd->vdev_ishole = (ops == &vdev_hole_ops);
+ vic->vic_prev_indirect_vdev = UINT64_MAX;
+ rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
+ vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
+
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
- vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
- &vd->vdev_dtl_lock);
+ vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
}
txg_list_create(&vd->vdev_ms_list, spa,
offsetof(struct metaslab, ms_txg_node));
txg_list_create(&vd->vdev_dtl_list, spa,
offsetof(struct vdev, vdev_dtl_node));
vd->vdev_stat.vs_timestamp = gethrtime();
vdev_queue_init(vd);
vdev_cache_init(vd);
return (vd);
}
/*
* Allocate a new vdev. The 'alloctype' is used to control whether we are
* creating a new vdev or loading an existing one - the behavior is slightly
* different for each case.
*/
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
int alloctype)
{
vdev_ops_t *ops;
char *type;
uint64_t guid = 0, islog, nparity;
vdev_t *vd;
+ vdev_indirect_config_t *vic;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
return (SET_ERROR(EINVAL));
if ((ops = vdev_getops(type)) == NULL)
return (SET_ERROR(EINVAL));
/*
* If this is a load, get the vdev guid from the nvlist.
* Otherwise, vdev_alloc_common() will generate one for us.
*/
if (alloctype == VDEV_ALLOC_LOAD) {
uint64_t label_id;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
label_id != id)
return (SET_ERROR(EINVAL));
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (SET_ERROR(EINVAL));
} else if (alloctype == VDEV_ALLOC_SPARE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (SET_ERROR(EINVAL));
} else if (alloctype == VDEV_ALLOC_L2CACHE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (SET_ERROR(EINVAL));
} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (SET_ERROR(EINVAL));
}
/*
* The first allocated vdev must be of type 'root'.
*/
if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
return (SET_ERROR(EINVAL));
/*
* Determine whether we're a log vdev.
*/
islog = 0;
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
return (SET_ERROR(ENOTSUP));
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
return (SET_ERROR(ENOTSUP));
/*
* Set the nparity property for RAID-Z vdevs.
*/
nparity = -1ULL;
if (ops == &vdev_raidz_ops) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (SET_ERROR(EINVAL));
/*
* Previous versions could only support 1 or 2 parity
* device.
*/
if (nparity > 1 &&
spa_version(spa) < SPA_VERSION_RAIDZ2)
return (SET_ERROR(ENOTSUP));
if (nparity > 2 &&
spa_version(spa) < SPA_VERSION_RAIDZ3)
return (SET_ERROR(ENOTSUP));
} else {
/*
* We require the parity to be specified for SPAs that
* support multiple parity levels.
*/
if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
return (SET_ERROR(EINVAL));
/*
* Otherwise, we default to 1 parity device for RAID-Z.
*/
nparity = 1;
}
} else {
nparity = 0;
}
ASSERT(nparity != -1ULL);
vd = vdev_alloc_common(spa, id, guid, ops);
+ vic = &vd->vdev_indirect_config;
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
vd->vdev_path = spa_strdup(vd->vdev_path);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
vd->vdev_devid = spa_strdup(vd->vdev_devid);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
&vd->vdev_physpath) == 0)
vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
vd->vdev_fru = spa_strdup(vd->vdev_fru);
/*
* Set the whole_disk property. If it's not specified, leave the value
* as -1.
*/
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&vd->vdev_wholedisk) != 0)
vd->vdev_wholedisk = -1ULL;
+ ASSERT0(vic->vic_mapping_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
+ &vic->vic_mapping_object);
+ ASSERT0(vic->vic_births_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
+ &vic->vic_births_object);
+ ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+ &vic->vic_prev_indirect_vdev);
+
/*
* Look for the 'not present' flag. This will only be set if the device
* was not present at the time of import.
*/
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
&vd->vdev_not_present);
/*
* Get the alignment requirement.
*/
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
/*
* Retrieve the vdev creation time.
*/
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
&vd->vdev_crtxg);
/*
* If we're a top-level vdev, try to load the allocation parameters.
*/
if (parent && !parent->vdev_parent &&
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
&vd->vdev_ms_shift);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
&vd->vdev_asize);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
&vd->vdev_removing);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
&vd->vdev_top_zap);
} else {
ASSERT0(vd->vdev_top_zap);
}
if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
ASSERT(alloctype == VDEV_ALLOC_LOAD ||
alloctype == VDEV_ALLOC_ADD ||
alloctype == VDEV_ALLOC_SPLIT ||
alloctype == VDEV_ALLOC_ROOTPOOL);
vd->vdev_mg = metaslab_group_create(islog ?
spa_log_class(spa) : spa_normal_class(spa), vd);
}
if (vd->vdev_ops->vdev_op_leaf &&
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv,
ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
} else {
ASSERT0(vd->vdev_leaf_zap);
}
/*
* If we're a leaf vdev, try to load the DTL object and other state.
*/
if (vd->vdev_ops->vdev_op_leaf &&
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
alloctype == VDEV_ALLOC_ROOTPOOL)) {
if (alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
&vd->vdev_dtl_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
&vd->vdev_unspare);
}
if (alloctype == VDEV_ALLOC_ROOTPOOL) {
uint64_t spare = 0;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
&spare) == 0 && spare)
spa_spare_add(vd);
}
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
&vd->vdev_resilver_txg);
/*
* When importing a pool, we want to ignore the persistent fault
* state, as the diagnosis made on another system may not be
* valid in the current context. Local vdevs will
* remain in the faulted state.
*/
if (spa_load_state(spa) == SPA_LOAD_OPEN) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
&vd->vdev_faulted);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
&vd->vdev_degraded);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
&vd->vdev_removed);
if (vd->vdev_faulted || vd->vdev_degraded) {
char *aux;
vd->vdev_label_aux =
VDEV_AUX_ERR_EXCEEDED;
if (nvlist_lookup_string(nv,
ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
strcmp(aux, "external") == 0)
vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
}
}
}
/*
* Add ourselves to the parent's list of children.
*/
vdev_add_child(parent, vd);
*vdp = vd;
return (0);
}
void
vdev_free(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
/*
* vdev_free() implies closing the vdev first. This is simpler than
* trying to ensure complicated semantics for all callers.
*/
vdev_close(vd);
ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
/*
* Free all children.
*/
for (int c = 0; c < vd->vdev_children; c++)
vdev_free(vd->vdev_child[c]);
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
/*
* Discard allocation state.
*/
if (vd->vdev_mg != NULL) {
vdev_metaslab_fini(vd);
metaslab_group_destroy(vd->vdev_mg);
}
ASSERT0(vd->vdev_stat.vs_space);
ASSERT0(vd->vdev_stat.vs_dspace);
ASSERT0(vd->vdev_stat.vs_alloc);
/*
* Remove this vdev from its parent's child list.
*/
vdev_remove_child(vd->vdev_parent, vd);
ASSERT(vd->vdev_parent == NULL);
/*
* Clean up vdev structure.
*/
vdev_queue_fini(vd);
vdev_cache_fini(vd);
if (vd->vdev_path)
spa_strfree(vd->vdev_path);
if (vd->vdev_devid)
spa_strfree(vd->vdev_devid);
if (vd->vdev_physpath)
spa_strfree(vd->vdev_physpath);
if (vd->vdev_fru)
spa_strfree(vd->vdev_fru);
if (vd->vdev_isspare)
spa_spare_remove(vd);
if (vd->vdev_isl2cache)
spa_l2cache_remove(vd);
txg_list_destroy(&vd->vdev_ms_list);
txg_list_destroy(&vd->vdev_dtl_list);
mutex_enter(&vd->vdev_dtl_lock);
space_map_close(vd->vdev_dtl_sm);
for (int t = 0; t < DTL_TYPES; t++) {
range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
range_tree_destroy(vd->vdev_dtl[t]);
}
mutex_exit(&vd->vdev_dtl_lock);
+ EQUIV(vd->vdev_indirect_births != NULL,
+ vd->vdev_indirect_mapping != NULL);
+ if (vd->vdev_indirect_births != NULL) {
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vdev_indirect_births_close(vd->vdev_indirect_births);
+ }
+
+ if (vd->vdev_obsolete_sm != NULL) {
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ }
+ range_tree_destroy(vd->vdev_obsolete_segments);
+ rw_destroy(&vd->vdev_indirect_rwlock);
+ mutex_destroy(&vd->vdev_obsolete_lock);
+
mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
if (vd == spa->spa_root_vdev)
spa->spa_root_vdev = NULL;
kmem_free(vd, sizeof (vdev_t));
}
/*
* Transfer top-level vdev state from svd to tvd.
*/
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
spa_t *spa = svd->vdev_spa;
metaslab_t *msp;
vdev_t *vd;
int t;
ASSERT(tvd == tvd->vdev_top);
tvd->vdev_ms_array = svd->vdev_ms_array;
tvd->vdev_ms_shift = svd->vdev_ms_shift;
tvd->vdev_ms_count = svd->vdev_ms_count;
tvd->vdev_top_zap = svd->vdev_top_zap;
svd->vdev_ms_array = 0;
svd->vdev_ms_shift = 0;
svd->vdev_ms_count = 0;
svd->vdev_top_zap = 0;
if (tvd->vdev_mg)
ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
tvd->vdev_mg = svd->vdev_mg;
tvd->vdev_ms = svd->vdev_ms;
svd->vdev_mg = NULL;
svd->vdev_ms = NULL;
if (tvd->vdev_mg != NULL)
tvd->vdev_mg->mg_vd = tvd;
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
svd->vdev_stat.vs_alloc = 0;
svd->vdev_stat.vs_space = 0;
svd->vdev_stat.vs_dspace = 0;
for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
}
if (list_link_active(&svd->vdev_config_dirty_node)) {
vdev_config_clean(svd);
vdev_config_dirty(tvd);
}
if (list_link_active(&svd->vdev_state_dirty_node)) {
vdev_state_clean(svd);
vdev_state_dirty(tvd);
}
tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
svd->vdev_deflate_ratio = 0;
tvd->vdev_islog = svd->vdev_islog;
svd->vdev_islog = 0;
}
static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
if (vd == NULL)
return;
vd->vdev_top = tvd;
for (int c = 0; c < vd->vdev_children; c++)
vdev_top_update(tvd, vd->vdev_child[c]);
}
/*
* Add a mirror/replacing vdev above an existing vdev.
*/
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
spa_t *spa = cvd->vdev_spa;
vdev_t *pvd = cvd->vdev_parent;
vdev_t *mvd;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
mvd->vdev_asize = cvd->vdev_asize;
mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_max_asize = cvd->vdev_max_asize;
+ mvd->vdev_psize = cvd->vdev_psize;
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
mvd->vdev_state = cvd->vdev_state;
mvd->vdev_crtxg = cvd->vdev_crtxg;
vdev_remove_child(pvd, cvd);
vdev_add_child(pvd, mvd);
cvd->vdev_id = mvd->vdev_children;
vdev_add_child(mvd, cvd);
vdev_top_update(cvd->vdev_top, cvd->vdev_top);
if (mvd == mvd->vdev_top)
vdev_top_transfer(cvd, mvd);
return (mvd);
}
/*
* Remove a 1-way mirror/replacing vdev from the tree.
*/
void
vdev_remove_parent(vdev_t *cvd)
{
vdev_t *mvd = cvd->vdev_parent;
vdev_t *pvd = mvd->vdev_parent;
ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
ASSERT(mvd->vdev_children == 1);
ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
mvd->vdev_ops == &vdev_replacing_ops ||
mvd->vdev_ops == &vdev_spare_ops);
cvd->vdev_ashift = mvd->vdev_ashift;
cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
vdev_remove_child(mvd, cvd);
vdev_remove_child(pvd, mvd);
/*
* If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
* Otherwise, we could have detached an offline device, and when we
* go to import the pool we'll think we have two top-level vdevs,
* instead of a different version of the same top-level vdev.
*/
if (mvd->vdev_top == mvd) {
uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
cvd->vdev_orig_guid = cvd->vdev_guid;
cvd->vdev_guid += guid_delta;
cvd->vdev_guid_sum += guid_delta;
}
cvd->vdev_id = mvd->vdev_id;
vdev_add_child(pvd, cvd);
vdev_top_update(cvd->vdev_top, cvd->vdev_top);
if (cvd == cvd->vdev_top)
vdev_top_transfer(mvd, cvd);
ASSERT(mvd->vdev_children == 0);
vdev_free(mvd);
}
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
uint64_t m;
uint64_t oldc = vd->vdev_ms_count;
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
/*
* This vdev is not being allocated from yet or is a hole.
*/
if (vd->vdev_ms_shift == 0)
return (0);
ASSERT(!vd->vdev_ishole);
- /*
- * Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the "typical" blocksize.
- * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
- * otherwise it would inconsistently account for existing bp's.
- */
- vd->vdev_deflate_ratio = (1 << 17) /
- (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
-
ASSERT(oldc <= newc);
mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
if (oldc != 0) {
bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
}
vd->vdev_ms = mspp;
vd->vdev_ms_count = newc;
for (m = oldc; m < newc; m++) {
uint64_t object = 0;
- if (txg == 0) {
+ /*
+ * vdev_ms_array may be 0 if we are creating the "fake"
+ * metaslabs for an indirect vdev for zdb's leak detection.
+ * See zdb_leak_init().
+ */
+ if (txg == 0 && vd->vdev_ms_array != 0) {
error = dmu_read(mos, vd->vdev_ms_array,
m * sizeof (uint64_t), sizeof (uint64_t), &object,
DMU_READ_PREFETCH);
if (error)
return (error);
}
error = metaslab_init(vd->vdev_mg, m, object, txg,
&(vd->vdev_ms[m]));
if (error)
return (error);
}
if (txg == 0)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
/*
* If the vdev is being removed we don't activate
* the metaslabs since we want to ensure that no new
* allocations are performed on this device.
*/
if (oldc == 0 && !vd->vdev_removing)
metaslab_group_activate(vd->vdev_mg);
if (txg == 0)
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (0);
}
void
vdev_metaslab_fini(vdev_t *vd)
{
- uint64_t m;
- uint64_t count = vd->vdev_ms_count;
-
if (vd->vdev_ms != NULL) {
+ uint64_t count = vd->vdev_ms_count;
+
metaslab_group_passivate(vd->vdev_mg);
- for (m = 0; m < count; m++) {
+ for (uint64_t m = 0; m < count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp != NULL)
metaslab_fini(msp);
}
kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
vd->vdev_ms = NULL;
+
+ vd->vdev_ms_count = 0;
}
+ ASSERT0(vd->vdev_ms_count);
}
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
int vps_flags;
} vdev_probe_stats_t;
static void
vdev_probe_done(zio_t *zio)
{
spa_t *spa = zio->io_spa;
vdev_t *vd = zio->io_vd;
vdev_probe_stats_t *vps = zio->io_private;
ASSERT(vd->vdev_probe_zio != NULL);
if (zio->io_type == ZIO_TYPE_READ) {
if (zio->io_error == 0)
vps->vps_readable = 1;
if (zio->io_error == 0 && spa_writeable(spa)) {
zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
zio->io_offset, zio->io_size, zio->io_abd,
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
} else {
abd_free(zio->io_abd);
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_error == 0)
vps->vps_writeable = 1;
abd_free(zio->io_abd);
} else if (zio->io_type == ZIO_TYPE_NULL) {
zio_t *pio;
vd->vdev_cant_read |= !vps->vps_readable;
vd->vdev_cant_write |= !vps->vps_writeable;
if (vdev_readable(vd) &&
(vdev_writeable(vd) || !spa_writeable(spa))) {
zio->io_error = 0;
} else {
ASSERT(zio->io_error != 0);
+ zfs_dbgmsg("failed probe on vdev %llu",
+ (longlong_t)vd->vdev_id);
zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, 0, 0);
zio->io_error = SET_ERROR(ENXIO);
}
mutex_enter(&vd->vdev_probe_lock);
ASSERT(vd->vdev_probe_zio == zio);
vd->vdev_probe_zio = NULL;
mutex_exit(&vd->vdev_probe_lock);
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
if (!vdev_accessible(vd, pio))
pio->io_error = SET_ERROR(ENXIO);
kmem_free(vps, sizeof (*vps));
}
}
/*
* Determine whether this device is accessible.
*
* Read and write to several known locations: the pad regions of each
* vdev label but the first, which we leave alone in case it contains
* a VTOC.
*/
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
spa_t *spa = vd->vdev_spa;
vdev_probe_stats_t *vps = NULL;
zio_t *pio;
ASSERT(vd->vdev_ops->vdev_op_leaf);
/*
* Don't probe the probe.
*/
if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
return (NULL);
/*
* To prevent 'probe storms' when a device fails, we create
* just one probe i/o at a time. All zios that want to probe
* this vdev will become parents of the probe io.
*/
mutex_enter(&vd->vdev_probe_lock);
if ((pio = vd->vdev_probe_zio) == NULL) {
vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
ZIO_FLAG_TRYHARD;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
* vdev_cant_read and vdev_cant_write can only
* transition from TRUE to FALSE when we have the
* SCL_ZIO lock as writer; otherwise they can only
* transition from FALSE to TRUE. This ensures that
* any zio looking at these values can assume that
* failures persist for the life of the I/O. That's
* important because when a device has intermittent
* connectivity problems, we want to ensure that
* they're ascribed to the device (ENXIO) and not
* the zio (EIO).
*
* Since we hold SCL_ZIO as writer here, clear both
* values so the probe can reevaluate from first
* principles.
*/
vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
}
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
/*
* We can't change the vdev state in this context, so we
* kick off an async task to do it on our behalf.
*/
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
}
}
if (zio != NULL)
zio_add_child(zio, pio);
mutex_exit(&vd->vdev_probe_lock);
if (vps == NULL) {
ASSERT(zio != NULL);
return (NULL);
}
for (int l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
}
if (zio == NULL)
return (pio);
zio_nowait(pio);
return (NULL);
}
static void
vdev_open_child(void *arg)
{
vdev_t *vd = arg;
vd->vdev_open_thread = curthread;
vd->vdev_open_error = vdev_open(vd);
vd->vdev_open_thread = NULL;
}
boolean_t
vdev_uses_zvols(vdev_t *vd)
{
if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
strlen(ZVOL_DIR)) == 0)
return (B_TRUE);
for (int c = 0; c < vd->vdev_children; c++)
if (vdev_uses_zvols(vd->vdev_child[c]))
return (B_TRUE);
return (B_FALSE);
}
void
vdev_open_children(vdev_t *vd)
{
taskq_t *tq;
int children = vd->vdev_children;
/*
* in order to handle pools on top of zvols, do the opens
* in a single thread so that the same thread holds the
* spa_namespace_lock
*/
if (B_TRUE || vdev_uses_zvols(vd)) {
for (int c = 0; c < children; c++)
vd->vdev_child[c]->vdev_open_error =
vdev_open(vd->vdev_child[c]);
return;
}
tq = taskq_create("vdev_open", children, minclsyspri,
children, children, TASKQ_PREPOPULATE);
for (int c = 0; c < children; c++)
VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
TQ_SLEEP) != 0);
taskq_destroy(tq);
}
/*
+ * Compute the raidz-deflation ratio. Note, we hard-code
+ * in 128k (1 << 17) because it is the "typical" blocksize.
+ * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
+ * otherwise it would inconsistently account for existing bp's.
+ */
+static void
+vdev_set_deflate_ratio(vdev_t *vd)
+{
+ if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
+ vd->vdev_deflate_ratio = (1 << 17) /
+ (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+ }
+}
+
+/*
* Prepare a virtual device for access.
*/
int
vdev_open(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
int error;
uint64_t osize = 0;
uint64_t max_osize = 0;
uint64_t asize, max_asize, psize;
uint64_t logical_ashift = 0;
uint64_t physical_ashift = 0;
ASSERT(vd->vdev_open_thread == curthread ||
spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
vd->vdev_state == VDEV_STATE_CANT_OPEN ||
vd->vdev_state == VDEV_STATE_OFFLINE);
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
vd->vdev_notrim = B_FALSE;
vd->vdev_min_asize = vdev_get_min_asize(vd);
/*
* If this vdev is not removed, check its fault status. If it's
* faulted, bail out of the open.
*/
if (!vd->vdev_removed && vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0);
ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
vd->vdev_label_aux);
return (SET_ERROR(ENXIO));
} else if (vd->vdev_offline) {
ASSERT(vd->vdev_children == 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
return (SET_ERROR(ENXIO));
}
error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
&logical_ashift, &physical_ashift);
/*
* Reset the vdev_reopening flag so that we actually close
* the vdev on error.
*/
vd->vdev_reopening = B_FALSE;
if (zio_injection_enabled && error == 0)
error = zio_handle_device_injection(vd, NULL, ENXIO);
if (error) {
if (vd->vdev_removed &&
vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
vd->vdev_removed = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
vd->vdev_stat.vs_aux);
return (error);
}
vd->vdev_removed = B_FALSE;
/*
* Recheck the faulted flag now that we have confirmed that
* the vdev is accessible. If we're faulted, bail.
*/
if (vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0);
ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
vd->vdev_label_aux);
return (SET_ERROR(ENXIO));
}
if (vd->vdev_degraded) {
ASSERT(vd->vdev_children == 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_ERR_EXCEEDED);
} else {
vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
}
/*
* For hole or missing vdevs we just return success.
*/
if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
return (0);
if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
trim_map_create(vd);
for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_NONE);
break;
}
}
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
if (vd->vdev_children == 0) {
if (osize < SPA_MINDEVSIZE) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_TOO_SMALL);
return (SET_ERROR(EOVERFLOW));
}
psize = osize;
asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
max_asize = max_osize - (VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE);
} else {
if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_TOO_SMALL);
return (SET_ERROR(EOVERFLOW));
}
psize = 0;
asize = osize;
max_asize = max_osize;
}
vd->vdev_psize = psize;
/*
* Make sure the allocatable size hasn't shrunk too much.
*/
if (asize < vd->vdev_min_asize) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (SET_ERROR(EINVAL));
}
vd->vdev_physical_ashift =
MAX(physical_ashift, vd->vdev_physical_ashift);
vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_ASHIFT_TOO_BIG);
return (EINVAL);
}
if (vd->vdev_asize == 0) {
/*
* This is the first-ever open, so use the computed values.
* For testing purposes, a higher ashift can be requested.
*/
vd->vdev_asize = asize;
vd->vdev_max_asize = max_asize;
} else {
/*
* Make sure the alignment requirement hasn't increased.
*/
if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
vd->vdev_ops->vdev_op_leaf) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
vd->vdev_max_asize = max_asize;
}
/*
* If all children are healthy we update asize if either:
* The asize has increased, due to a device expansion caused by dynamic
* LUN growth or vdev replacement, and automatic expansion is enabled;
* making the additional space available.
*
* The asize has decreased, due to a device shrink usually caused by a
* vdev replace with a smaller device. This ensures that calculations
* based of max_asize and asize e.g. esize are always valid. It's safe
* to do this as we've already validated that asize is greater than
* vdev_min_asize.
*/
if (vd->vdev_state == VDEV_STATE_HEALTHY &&
((asize > vd->vdev_asize &&
(vd->vdev_expanding || spa->spa_autoexpand)) ||
(asize < vd->vdev_asize)))
vd->vdev_asize = asize;
vdev_set_min_asize(vd);
/*
* Ensure we can issue some IO before declaring the
* vdev open for business.
*/
if (vd->vdev_ops->vdev_op_leaf &&
(error = zio_wait(vdev_probe(vd, NULL))) != 0) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
VDEV_AUX_ERR_EXCEEDED);
return (error);
}
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ !vd->vdev_isl2cache && !vd->vdev_islog) {
+ if (vd->vdev_ashift > spa->spa_max_ashift)
+ spa->spa_max_ashift = vd->vdev_ashift;
+ if (vd->vdev_ashift < spa->spa_min_ashift)
+ spa->spa_min_ashift = vd->vdev_ashift;
+ }
+
/*
* Track the min and max ashift values for normal data devices.
*/
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
!vd->vdev_islog && vd->vdev_aux == NULL) {
if (vd->vdev_ashift > spa->spa_max_ashift)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
spa->spa_min_ashift = vd->vdev_ashift;
}
/*
* If a leaf vdev has a DTL, and seems healthy, then kick off a
* resilver. But don't do this if we are doing a reopen for a scrub,
* since this would just restart the scrub we are already doing.
*/
if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
vdev_resilver_needed(vd, NULL, NULL))
spa_async_request(spa, SPA_ASYNC_RESILVER);
return (0);
}
/*
* Called once the vdevs are all opened, this routine validates the label
* contents. This needs to be done before vdev_load() so that we don't
* inadvertently do repair I/Os to the wrong device.
*
* If 'strict' is false ignore the spa guid check. This is necessary because
* if the machine crashed during a re-guid the new guid might have been written
* to all of the vdev labels, but not the cached config. The strict check
* will be performed when the pool is opened again using the mos config.
*
* This function will only return failure if one of the vdevs indicates that it
* has since been destroyed or exported. This is only possible if
* /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
* will be updated but the function will return 0.
*/
int
vdev_validate(vdev_t *vd, boolean_t strict)
{
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
uint64_t guid = 0, top_guid;
uint64_t state;
for (int c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c], strict) != 0)
return (SET_ERROR(EBADF));
/*
* If the device has already failed, or was marked offline, don't do
* any further validation. Otherwise, label I/O will fail and we will
* overwrite the previous state.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
uint64_t aux_guid = 0;
nvlist_t *nvl;
uint64_t txg = spa_last_synced_txg(spa) != 0 ?
spa_last_synced_txg(spa) : -1ULL;
if ((label = vdev_label_read_config(vd, txg)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (0);
}
/*
* Determine if this vdev has been split off into another
* pool. If so, then refuse to open it.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
&aux_guid) == 0 && aux_guid == spa_guid(spa)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_SPLIT_POOL);
nvlist_free(label);
return (0);
}
if (strict && (nvlist_lookup_uint64(label,
ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
guid != spa_guid(spa))) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
}
if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
!= 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
&aux_guid) != 0)
aux_guid = 0;
/*
* If this vdev just became a top-level vdev because its
* sibling was detached, it will have adopted the parent's
* vdev guid -- but the label may or may not be on disk yet.
* Fortunately, either version of the label will have the
* same top guid, so if we're a top-level vdev, we can
* safely compare to that instead.
*
* If we split this vdev off instead, then we also check the
* original pool's guid. We don't want to consider the vdev
* corrupt if it is partway through a split operation.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
&guid) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
&top_guid) != 0 ||
((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
(vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
}
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
&state) != 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
}
nvlist_free(label);
/*
* If this is a verbatim import, no need to check the
* state of the pool.
*/
if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
spa_load_state(spa) == SPA_LOAD_OPEN &&
state != POOL_STATE_ACTIVE)
return (SET_ERROR(EBADF));
/*
* If we were able to open and validate a vdev that was
* previously marked permanently unavailable, clear that state
* now.
*/
if (vd->vdev_not_present)
vd->vdev_not_present = 0;
}
return (0);
}
/*
* Close a virtual device.
*/
void
vdev_close(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
vdev_t *pvd = vd->vdev_parent;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
/*
* If our parent is reopening, then we are as well, unless we are
* going offline.
*/
if (pvd != NULL && pvd->vdev_reopening)
vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
vd->vdev_ops->vdev_op_close(vd);
vdev_cache_purge(vd);
if (vd->vdev_ops->vdev_op_leaf)
trim_map_destroy(vd);
/*
* We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
* it's still faulted.
*/
vd->vdev_prevstate = vd->vdev_state;
if (vd->vdev_offline)
vd->vdev_state = VDEV_STATE_OFFLINE;
else
vd->vdev_state = VDEV_STATE_CLOSED;
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}
void
vdev_hold(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_is_root(spa));
if (spa->spa_state == POOL_STATE_UNINITIALIZED)
return;
for (int c = 0; c < vd->vdev_children; c++)
vdev_hold(vd->vdev_child[c]);
if (vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_hold(vd);
}
void
vdev_rele(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_is_root(spa));
for (int c = 0; c < vd->vdev_children; c++)
vdev_rele(vd->vdev_child[c]);
if (vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_rele(vd);
}
/*
* Reopen all interior vdevs and any unopened leaves. We don't actually
* reopen leaf vdevs which had previously been opened as they might deadlock
* on the spa_config_lock. Instead we only obtain the leaf's physical size.
* If the leaf has never been opened then open it, as usual.
*/
void
vdev_reopen(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
/* set the reopening flag unless we're taking the vdev offline */
vd->vdev_reopening = !vd->vdev_offline;
vdev_close(vd);
(void) vdev_open(vd);
/*
* Call vdev_validate() here to make sure we have the same device.
* Otherwise, a device with an invalid label could be successfully
* opened in response to vdev_reopen().
*/
if (vd->vdev_aux) {
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
vd->vdev_aux == &spa->spa_l2cache &&
!l2arc_vdev_present(vd))
l2arc_add_vdev(spa, vd);
} else {
(void) vdev_validate(vd, B_TRUE);
}
/*
* Reassess parent vdev's health.
*/
vdev_propagate_state(vd);
}
int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
int error;
/*
* Normally, partial opens (e.g. of a mirror) are allowed.
* For a create, however, we want to fail the request if
* there are any components we can't open.
*/
error = vdev_open(vd);
if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
vdev_close(vd);
return (error ? error : ENXIO);
}
/*
* Recursively load DTLs and initialize all labels.
*/
if ((error = vdev_dtl_load(vd)) != 0 ||
(error = vdev_label_init(vd, txg, isreplacing ?
VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
vdev_close(vd);
return (error);
}
return (0);
}
void
vdev_metaslab_set_size(vdev_t *vd)
{
/*
* Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
*/
vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}
/*
* Maximize performance by inflating the configured ashift for top level
* vdevs to be as close to the physical ashift as possible while maintaining
* administrator defined limits and ensuring it doesn't go below the
* logical ashift.
*/
void
vdev_ashift_optimize(vdev_t *vd)
{
if (vd == vd->vdev_top) {
if (vd->vdev_ashift < vd->vdev_physical_ashift) {
vd->vdev_ashift = MIN(
MAX(zfs_max_auto_ashift, vd->vdev_ashift),
MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
} else {
/*
* Unusual case where logical ashift > physical ashift
* so we can't cap the calculated ashift based on max
* ashift as that would cause failures.
* We still check if we need to increase it to match
* the min ashift.
*/
vd->vdev_ashift = MAX(zfs_min_auto_ashift,
vd->vdev_ashift);
}
}
}
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
- ASSERT(!vd->vdev_ishole);
+ /* indirect vdevs don't have metaslabs or dtls */
+ ASSERT(vdev_is_concrete(vd) || flags == 0);
ASSERT(ISP2(flags));
ASSERT(spa_writeable(vd->vdev_spa));
if (flags & VDD_METASLAB)
(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
if (flags & VDD_DTL)
(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}
void
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
{
for (int c = 0; c < vd->vdev_children; c++)
vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
if (vd->vdev_ops->vdev_op_leaf)
vdev_dirty(vd->vdev_top, flags, vd, txg);
}
/*
* DTLs.
*
* A vdev's DTL (dirty time log) is the set of transaction groups for which
* the vdev has less than perfect replication. There are four kinds of DTL:
*
* DTL_MISSING: txgs for which the vdev has no valid copies of the data
*
* DTL_PARTIAL: txgs for which data is available, but not fully replicated
*
* DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
* scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
* txgs that was scrubbed.
*
* DTL_OUTAGE: txgs which cannot currently be read, whether due to
* persistent errors or just some device being offline.
* Unlike the other three, the DTL_OUTAGE map is not generally
* maintained; it's only computed when needed, typically to
* determine whether a device can be detached.
*
* For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
* either has the data or it doesn't.
*
* For interior vdevs such as mirror and RAID-Z the picture is more complex.
* A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
* if any child is less than fully replicated, then so is its parent.
* A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
* comprising only those txgs which appear in 'maxfaults' or more children;
* those are the txgs we don't have enough replication to read. For example,
* double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
* thus, its DTL_MISSING consists of the set of txgs that appear in more than
* two child DTL_MISSING maps.
*
* It should be clear from the above that to compute the DTLs and outage maps
* for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
* Therefore, that is all we keep on disk. When loading the pool, or after
* a configuration change, we generate all other DTLs from first principles.
*/
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
range_tree_t *rt = vd->vdev_dtl[t];
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
ASSERT(spa_writeable(vd->vdev_spa));
- mutex_enter(rt->rt_lock);
+ mutex_enter(&vd->vdev_dtl_lock);
if (!range_tree_contains(rt, txg, size))
range_tree_add(rt, txg, size);
- mutex_exit(rt->rt_lock);
+ mutex_exit(&vd->vdev_dtl_lock);
}
boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
range_tree_t *rt = vd->vdev_dtl[t];
boolean_t dirty = B_FALSE;
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
- mutex_enter(rt->rt_lock);
+ /*
+ * While we are loading the pool, the DTLs have not been loaded yet.
+ * Ignore the DTLs and try all devices. This avoids a recursive
+ * mutex enter on the vdev_dtl_lock, and also makes us try hard
+ * when loading the pool (relying on the checksum to ensure that
+ * we get the right data -- note that we while loading, we are
+ * only reading the MOS, which is always checksummed).
+ */
+ if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+
+ mutex_enter(&vd->vdev_dtl_lock);
if (range_tree_space(rt) != 0)
dirty = range_tree_contains(rt, txg, size);
- mutex_exit(rt->rt_lock);
+ mutex_exit(&vd->vdev_dtl_lock);
return (dirty);
}
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
range_tree_t *rt = vd->vdev_dtl[t];
boolean_t empty;
- mutex_enter(rt->rt_lock);
+ mutex_enter(&vd->vdev_dtl_lock);
empty = (range_tree_space(rt) == 0);
- mutex_exit(rt->rt_lock);
+ mutex_exit(&vd->vdev_dtl_lock);
return (empty);
}
/*
* Returns the lowest txg in the DTL range.
*/
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
range_seg_t *rs;
ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
ASSERT0(vd->vdev_children);
rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
return (rs->rs_start - 1);
}
/*
* Returns the highest txg in the DTL.
*/
static uint64_t
vdev_dtl_max(vdev_t *vd)
{
range_seg_t *rs;
ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
ASSERT0(vd->vdev_children);
rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
return (rs->rs_end);
}
/*
* Determine if a resilvering vdev should remove any DTL entries from
* its range. If the vdev was resilvering for the entire duration of the
* scan then it should excise that range from its DTLs. Otherwise, this
* vdev is considered partially resilvered and should leave its DTL
* entries intact. The comment in vdev_dtl_reassess() describes how we
* excise the DTLs.
*/
static boolean_t
vdev_dtl_should_excise(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
ASSERT0(scn->scn_phys.scn_errors);
ASSERT0(vd->vdev_children);
if (vd->vdev_state < VDEV_STATE_DEGRADED)
return (B_FALSE);
if (vd->vdev_resilver_txg == 0 ||
range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
return (B_TRUE);
/*
* When a resilver is initiated the scan will assign the scn_max_txg
* value to the highest txg value that exists in all DTLs. If this
* device's max DTL is not part of this scan (i.e. it is not in
* the range (scn_min_txg, scn_max_txg] then it is not eligible
* for excision.
*/
if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Reassess DTLs after a config change or scrub completion.
*/
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
spa_t *spa = vd->vdev_spa;
avl_tree_t reftree;
int minref;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
for (int c = 0; c < vd->vdev_children; c++)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
- if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
+ if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
return;
if (vd->vdev_ops->vdev_op_leaf) {
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
mutex_enter(&vd->vdev_dtl_lock);
/*
* If we've completed a scan cleanly then determine
* if this vdev should remove any DTLs. We only want to
* excise regions on vdevs that were available during
* the entire duration of this scan.
*/
if (scrub_txg != 0 &&
(spa->spa_scrub_started ||
(scn != NULL && scn->scn_phys.scn_errors == 0)) &&
vdev_dtl_should_excise(vd)) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
* will be valid, so excise the old region and
* fold in the scrub dtl. Otherwise, leave the
* dtl as-is if there was an error.
*
* There's little trick here: to excise the beginning
* of the DTL_MISSING map, we put it into a reference
* tree and then add a segment with refcnt -1 that
* covers the range [0, scrub_txg). This means
* that each txg in that range has refcnt -1 or 0.
* We then add DTL_SCRUB with a refcnt of 2, so that
* entries in the range [0, scrub_txg) will have a
* positive refcnt -- either 1 or 2. We then convert
* the reference tree into the new DTL_MISSING map.
*/
space_reftree_create(&reftree);
space_reftree_add_map(&reftree,
vd->vdev_dtl[DTL_MISSING], 1);
space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
space_reftree_add_map(&reftree,
vd->vdev_dtl[DTL_SCRUB], 2);
space_reftree_generate_map(&reftree,
vd->vdev_dtl[DTL_MISSING], 1);
space_reftree_destroy(&reftree);
}
range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
range_tree_walk(vd->vdev_dtl[DTL_MISSING],
range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
if (scrub_done)
range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
if (!vdev_readable(vd))
range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
else
range_tree_walk(vd->vdev_dtl[DTL_MISSING],
range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
/*
* If the vdev was resilvering and no longer has any
* DTLs then reset its resilvering flag and dirty
* the top level so that we persist the change.
*/
if (vd->vdev_resilver_txg != 0 &&
range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
vd->vdev_resilver_txg = 0;
vdev_config_dirty(vd->vdev_top);
}
mutex_exit(&vd->vdev_dtl_lock);
if (txg != 0)
vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
return;
}
mutex_enter(&vd->vdev_dtl_lock);
for (int t = 0; t < DTL_TYPES; t++) {
/* account for child's outage in parent's missing map */
int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
if (t == DTL_SCRUB)
continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
minref = 1; /* i.e. non-zero */
else if (vd->vdev_nparity != 0)
minref = vd->vdev_nparity + 1; /* RAID-Z */
else
minref = vd->vdev_children; /* any kind of mirror */
space_reftree_create(&reftree);
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
mutex_enter(&cvd->vdev_dtl_lock);
space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
mutex_exit(&cvd->vdev_dtl_lock);
}
space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
space_reftree_destroy(&reftree);
}
mutex_exit(&vd->vdev_dtl_lock);
}
int
vdev_dtl_load(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
int error = 0;
if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
- ASSERT(!vd->vdev_ishole);
+ ASSERT(vdev_is_concrete(vd));
error = space_map_open(&vd->vdev_dtl_sm, mos,
- vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+ vd->vdev_dtl_object, 0, -1ULL, 0);
if (error)
return (error);
ASSERT(vd->vdev_dtl_sm != NULL);
mutex_enter(&vd->vdev_dtl_lock);
/*
* Now that we've opened the space_map we need to update
* the in-core DTL.
*/
space_map_update(vd->vdev_dtl_sm);
error = space_map_load(vd->vdev_dtl_sm,
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
mutex_exit(&vd->vdev_dtl_lock);
return (error);
}
for (int c = 0; c < vd->vdev_children; c++) {
error = vdev_dtl_load(vd->vdev_child[c]);
if (error != 0)
break;
}
return (error);
}
void
vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
{
spa_t *spa = vd->vdev_spa;
VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
zapobj, tx));
}
uint64_t
vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
{
spa_t *spa = vd->vdev_spa;
uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
DMU_OT_NONE, 0, tx);
ASSERT(zap != 0);
VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
zap, tx));
return (zap);
}
void
vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
{
if (vd->vdev_ops != &vdev_hole_ops &&
vd->vdev_ops != &vdev_missing_ops &&
vd->vdev_ops != &vdev_root_ops &&
!vd->vdev_top->vdev_removing) {
if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
}
if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
}
}
for (uint64_t i = 0; i < vd->vdev_children; i++) {
vdev_construct_zaps(vd->vdev_child[i], tx);
}
}
void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
objset_t *mos = spa->spa_meta_objset;
range_tree_t *rtsync;
- kmutex_t rtlock;
dmu_tx_t *tx;
uint64_t object = space_map_object(vd->vdev_dtl_sm);
- ASSERT(!vd->vdev_ishole);
+ ASSERT(vdev_is_concrete(vd));
ASSERT(vd->vdev_ops->vdev_op_leaf);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
mutex_enter(&vd->vdev_dtl_lock);
space_map_free(vd->vdev_dtl_sm, tx);
space_map_close(vd->vdev_dtl_sm);
vd->vdev_dtl_sm = NULL;
mutex_exit(&vd->vdev_dtl_lock);
/*
* We only destroy the leaf ZAP for detached leaves or for
* removed log devices. Removed data devices handle leaf ZAP
* cleanup later, once cancellation is no longer possible.
*/
if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
vd->vdev_top->vdev_islog)) {
vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
vd->vdev_leaf_zap = 0;
}
dmu_tx_commit(tx);
return;
}
if (vd->vdev_dtl_sm == NULL) {
uint64_t new_object;
new_object = space_map_alloc(mos, tx);
VERIFY3U(new_object, !=, 0);
VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
- 0, -1ULL, 0, &vd->vdev_dtl_lock));
+ 0, -1ULL, 0));
ASSERT(vd->vdev_dtl_sm != NULL);
}
- bzero(&rtlock, sizeof(rtlock));
- mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
+ rtsync = range_tree_create(NULL, NULL);
- rtsync = range_tree_create(NULL, NULL, &rtlock);
-
- mutex_enter(&rtlock);
-
mutex_enter(&vd->vdev_dtl_lock);
range_tree_walk(rt, range_tree_add, rtsync);
mutex_exit(&vd->vdev_dtl_lock);
space_map_truncate(vd->vdev_dtl_sm, tx);
space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
range_tree_vacate(rtsync, NULL, NULL);
range_tree_destroy(rtsync);
- mutex_exit(&rtlock);
- mutex_destroy(&rtlock);
-
/*
* If the object for the space map has changed then dirty
* the top level so that we update the config.
*/
if (object != space_map_object(vd->vdev_dtl_sm)) {
zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
"new object %llu", txg, spa_name(spa), object,
space_map_object(vd->vdev_dtl_sm));
vdev_config_dirty(vd->vdev_top);
}
dmu_tx_commit(tx);
mutex_enter(&vd->vdev_dtl_lock);
space_map_update(vd->vdev_dtl_sm);
mutex_exit(&vd->vdev_dtl_lock);
}
/*
* Determine whether the specified vdev can be offlined/detached/removed
* without losing data.
*/
boolean_t
vdev_dtl_required(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
vdev_t *tvd = vd->vdev_top;
uint8_t cant_read = vd->vdev_cant_read;
boolean_t required;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
if (vd == spa->spa_root_vdev || vd == tvd)
return (B_TRUE);
/*
* Temporarily mark the device as unreadable, and then determine
* whether this results in any DTL outages in the top-level vdev.
* If not, we can safely offline/detach/remove the device.
*/
vd->vdev_cant_read = B_TRUE;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
vd->vdev_cant_read = cant_read;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
if (!required && zio_injection_enabled)
required = !!zio_handle_device_injection(vd, NULL, ECHILD);
return (required);
}
/*
* Determine if resilver is needed, and if so the txg range.
*/
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
boolean_t needed = B_FALSE;
uint64_t thismin = UINT64_MAX;
uint64_t thismax = 0;
if (vd->vdev_children == 0) {
mutex_enter(&vd->vdev_dtl_lock);
if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
vdev_writeable(vd)) {
thismin = vdev_dtl_min(vd);
thismax = vdev_dtl_max(vd);
needed = B_TRUE;
}
mutex_exit(&vd->vdev_dtl_lock);
} else {
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
uint64_t cmin, cmax;
if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
thismin = MIN(thismin, cmin);
thismax = MAX(thismax, cmax);
needed = B_TRUE;
}
}
}
if (needed && minp) {
*minp = thismin;
*maxp = thismax;
}
return (needed);
}
-void
+int
vdev_load(vdev_t *vd)
{
+ int error = 0;
/*
* Recursively load all children.
*/
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_load(vd->vdev_child[c]);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ error = vdev_load(vd->vdev_child[c]);
+ if (error != 0) {
+ return (error);
+ }
+ }
+ vdev_set_deflate_ratio(vd);
+
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
- if (vd == vd->vdev_top && !vd->vdev_ishole &&
- (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
- vdev_metaslab_init(vd, 0) != 0))
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
+ if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
+ if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (SET_ERROR(ENXIO));
+ } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (error);
+ }
+ }
/*
* If this is a leaf vdev, load its DTL.
*/
- if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
+ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
+ return (error);
+ }
+
+ uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
+ if (obsolete_sm_object != 0) {
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT(vd->vdev_obsolete_sm == NULL);
+
+ if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
+ obsolete_sm_object, 0, vd->vdev_asize, 0))) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (error);
+ }
+ space_map_update(vd->vdev_obsolete_sm);
+ }
+
+ return (0);
}
/*
* The special vdev case is used for hot spares and l2cache devices. Its
* sole purpose it to set the vdev state for the associated vdev. To do this,
* we make sure that we can open the underlying device, then try to read the
* label, and make sure that the label is sane and that it hasn't been
* repurposed to another pool.
*/
int
vdev_validate_aux(vdev_t *vd)
{
nvlist_t *label;
uint64_t guid, version;
uint64_t state;
if (!vdev_readable(vd))
return (0);
if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
return (-1);
}
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
!SPA_VERSION_IS_SUPPORTED(version) ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
guid != vd->vdev_guid ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (-1);
}
/*
* We don't actually check the pool state here. If it's in fact in
* use by another pool, we update this fact on the fly when requested.
*/
nvlist_free(label);
return (0);
}
+/*
+ * Free the objects used to store this vdev's spacemaps, and the array
+ * that points to them.
+ */
void
-vdev_remove(vdev_t *vd, uint64_t txg)
+vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
{
+ if (vd->vdev_ms_array == 0)
+ return;
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
+ size_t array_bytes = array_count * sizeof (uint64_t);
+ uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
+ VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
+ array_bytes, smobj_array, 0));
+
+ for (uint64_t i = 0; i < array_count; i++) {
+ uint64_t smobj = smobj_array[i];
+ if (smobj == 0)
+ continue;
+
+ space_map_free_obj(mos, smobj, tx);
+ }
+
+ kmem_free(smobj_array, array_bytes);
+ VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
+ vd->vdev_ms_array = 0;
+}
+
+static void
+vdev_remove_empty(vdev_t *vd, uint64_t txg)
+{
spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
dmu_tx_t *tx;
- tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
ASSERT(vd == vd->vdev_top);
ASSERT3U(txg, ==, spa_syncing_txg(spa));
if (vd->vdev_ms != NULL) {
metaslab_group_t *mg = vd->vdev_mg;
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp == NULL || msp->ms_sm == NULL)
continue;
mutex_enter(&msp->ms_lock);
/*
* If the metaslab was not loaded when the vdev
* was removed then the histogram accounting may
* not be accurate. Update the histogram information
* here so that we ensure that the metaslab group
* and metaslab class are up-to-date.
*/
metaslab_group_histogram_remove(mg, msp);
VERIFY0(space_map_allocated(msp->ms_sm));
- space_map_free(msp->ms_sm, tx);
space_map_close(msp->ms_sm);
msp->ms_sm = NULL;
mutex_exit(&msp->ms_lock);
}
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
-
}
- if (vd->vdev_ms_array) {
- (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
- vd->vdev_ms_array = 0;
- }
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+ vdev_destroy_spacemaps(vd, tx);
if (vd->vdev_islog && vd->vdev_top_zap != 0) {
vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
vd->vdev_top_zap = 0;
}
dmu_tx_commit(tx);
}
void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
- ASSERT(!vd->vdev_ishole);
+ ASSERT(vdev_is_concrete(vd));
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
if (reassess)
metaslab_sync_reassess(vd->vdev_mg);
}
void
vdev_sync(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
dmu_tx_t *tx;
- ASSERT(!vd->vdev_ishole);
+ if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
+ dmu_tx_t *tx;
- if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ vdev_indirect_sync_obsolete(vd, tx);
+ dmu_tx_commit(tx);
+
+ /*
+ * If the vdev is indirect, it can't have dirty
+ * metaslabs or DTLs.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
+ ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
+ return;
+ }
+ }
+
+ ASSERT(vdev_is_concrete(vd));
+
+ if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
+ !vd->vdev_removing) {
ASSERT(vd == vd->vdev_top);
+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
dmu_tx_commit(tx);
}
- /*
- * Remove the metadata associated with this vdev once it's empty.
- */
- if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
- vdev_remove(vd, txg);
-
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
metaslab_sync(msp, txg);
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
}
while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
vdev_dtl_sync(lvd, txg);
+ /*
+ * Remove the metadata associated with this vdev once it's empty.
+ * Note that this is typically used for log/cache device removal;
+ * we don't empty toplevel vdevs when removing them. But if
+ * a toplevel happens to be emptied, this is not harmful.
+ */
+ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) {
+ vdev_remove_empty(vd, txg);
+ }
+
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
return (vd->vdev_ops->vdev_op_asize(vd, psize));
}
/*
* Mark the given vdev faulted. A faulted vdev behaves as if the device could
* not be opened, and no I/O is attempted.
*/
int
vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
vdev_t *vd, *tvd;
spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
/*
* We don't directly use the aux state here, but if we do a
* vdev_reopen(), we need this value to be present to remember why we
* were faulted.
*/
vd->vdev_label_aux = aux;
/*
* Faulted state takes precedence over degraded.
*/
vd->vdev_delayed_close = B_FALSE;
vd->vdev_faulted = 1ULL;
vd->vdev_degraded = 0ULL;
vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
/*
* If this device has the only valid copy of the data, then
* back off and simply mark the vdev as degraded instead.
*/
if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL;
/*
* If we reopen the device and it's not dead, only then do we
* mark it degraded.
*/
vdev_reopen(tvd);
if (vdev_readable(vd))
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
}
return (spa_vdev_state_exit(spa, vd, 0));
}
/*
* Mark the given vdev degraded. A degraded vdev is purely an indication to the
* user that something is wrong. The vdev continues to operate as normal as far
* as I/O is concerned.
*/
int
vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
vdev_t *vd;
spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
/*
* If the vdev is already faulted, then don't do anything.
*/
if (vd->vdev_faulted || vd->vdev_degraded)
return (spa_vdev_state_exit(spa, NULL, 0));
vd->vdev_degraded = 1ULL;
if (!vdev_is_dead(vd))
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
aux);
return (spa_vdev_state_exit(spa, vd, 0));
}
/*
* Online the given vdev.
*
* If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
* spare device should be detached when the device finishes resilvering.
* Second, the online should be treated like a 'test' online case, so no FMA
* events are generated if the device fails to open.
*/
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
boolean_t wasoffline;
vdev_state_t oldstate;
spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
oldstate = vd->vdev_state;
tvd = vd->vdev_top;
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
/* XXX - L2ARC 1.0 does not support expansion */
if (!vd->vdev_aux) {
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
}
vdev_reopen(tvd);
vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
if (!vd->vdev_aux) {
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
pvd->vdev_expanding = B_FALSE;
}
if (newstate)
*newstate = vd->vdev_state;
if ((flags & ZFS_ONLINE_UNSPARE) &&
!vdev_is_dead(vd) && vd->vdev_parent &&
vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE;
if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
/* XXX - L2ARC 1.0 does not support expansion */
if (vd->vdev_aux)
return (spa_vdev_state_exit(spa, vd, ENOTSUP));
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
vd->vdev_state >= VDEV_STATE_DEGRADED))
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
return (spa_vdev_state_exit(spa, vd, 0));
}
static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
vdev_t *vd, *tvd;
int error = 0;
uint64_t generation;
metaslab_group_t *mg;
top:
spa_vdev_state_enter(spa, SCL_ALLOC);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
mg = tvd->vdev_mg;
generation = spa->spa_config_generation + 1;
/*
* If the device isn't already offline, try to offline it.
*/
if (!vd->vdev_offline) {
/*
* If this device has the only valid copy of some data,
* don't allow it to be offlined. Log devices are always
* expendable.
*/
if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
vdev_dtl_required(vd))
return (spa_vdev_state_exit(spa, NULL, EBUSY));
/*
* If the top-level is a slog and it has had allocations
* then proceed. We check that the vdev's metaslab group
* is not NULL since it's possible that we may have just
* added this vdev but not yet initialized its metaslabs.
*/
if (tvd->vdev_islog && mg != NULL) {
/*
* Prevent any future allocations.
*/
metaslab_group_passivate(mg);
(void) spa_vdev_state_exit(spa, vd, 0);
- error = spa_offline_log(spa);
+ error = spa_reset_logs(spa);
spa_vdev_state_enter(spa, SCL_ALLOC);
/*
* Check to see if the config has changed.
*/
if (error || generation != spa->spa_config_generation) {
metaslab_group_activate(mg);
if (error)
return (spa_vdev_state_exit(spa,
vd, error));
(void) spa_vdev_state_exit(spa, vd, 0);
goto top;
}
ASSERT0(tvd->vdev_stat.vs_alloc);
}
/*
* Offline this device and reopen its top-level vdev.
* If the top-level vdev is a log device then just offline
* it. Otherwise, if this action results in the top-level
* vdev becoming unusable, undo it and fail the request.
*/
vd->vdev_offline = B_TRUE;
vdev_reopen(tvd);
if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
vdev_is_dead(tvd)) {
vd->vdev_offline = B_FALSE;
vdev_reopen(tvd);
return (spa_vdev_state_exit(spa, NULL, EBUSY));
}
/*
* Add the device back into the metaslab rotor so that
* once we online the device it's open for business.
*/
if (tvd->vdev_islog && mg != NULL)
metaslab_group_activate(mg);
}
vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
return (spa_vdev_state_exit(spa, vd, 0));
}
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
int error;
mutex_enter(&spa->spa_vdev_top_lock);
error = vdev_offline_locked(spa, guid, flags);
mutex_exit(&spa->spa_vdev_top_lock);
return (error);
}
/*
* Clear the error counts associated with this vdev. Unlike vdev_online() and
* vdev_offline(), we assume the spa config is locked. We also clear all
* children. If 'vd' is NULL, then the user wants to clear all vdevs.
*/
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
vdev_t *rvd = spa->spa_root_vdev;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
if (vd == NULL)
vd = rvd;
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
if (vd == rvd) {
for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
for (int c = 0; c < spa->spa_spares.sav_count; c++)
vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
}
/*
+ * It makes no sense to "clear" an indirect vdev.
+ */
+ if (!vdev_is_concrete(vd))
+ return;
+
+ /*
* If we're in the FAULTED state or have experienced failed I/O, then
* clear the persistent state and attempt to reopen the device. We
* also mark the vdev config dirty, so that the new faulted state is
* written out to disk.
*/
if (vd->vdev_faulted || vd->vdev_degraded ||
!vdev_readable(vd) || !vdev_writeable(vd)) {
/*
* When reopening in reponse to a clear event, it may be due to
* a fmadm repair request. In this case, if the device is
* still broken, we want to still post the ereport again.
*/
vd->vdev_forcefault = B_TRUE;
vd->vdev_faulted = vd->vdev_degraded = 0ULL;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
vd->vdev_forcefault = B_FALSE;
if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
spa_async_request(spa, SPA_ASYNC_RESILVER);
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
}
/*
* When clearing a FMA-diagnosed fault, we always want to
* unspare the device, as we assume that the original spare was
* done in response to the FMA fault.
*/
if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE;
}
boolean_t
vdev_is_dead(vdev_t *vd)
{
/*
* Holes and missing devices are always considered "dead".
* This simplifies the code since we don't have to check for
* these types of devices in the various code paths.
* Instead we rely on the fact that we skip over dead devices
* before issuing I/O to them.
*/
- return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+ return (vd->vdev_state < VDEV_STATE_DEGRADED ||
+ vd->vdev_ops == &vdev_hole_ops ||
vd->vdev_ops == &vdev_missing_ops);
}
boolean_t
vdev_readable(vdev_t *vd)
{
return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
}
boolean_t
vdev_writeable(vdev_t *vd)
{
- return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
+ return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
+ vdev_is_concrete(vd));
}
boolean_t
vdev_allocatable(vdev_t *vd)
{
uint64_t state = vd->vdev_state;
/*
* We currently allow allocations from vdevs which may be in the
* process of reopening (i.e. VDEV_STATE_CLOSED). If the device
* fails to reopen then we'll catch it later when we're holding
* the proper locks. Note that we have to get the vdev state
* in a local variable because although it changes atomically,
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write && !vd->vdev_ishole &&
+ !vd->vdev_cant_write && vdev_is_concrete(vd) &&
vd->vdev_mg->mg_initialized);
}
boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
{
ASSERT(zio->io_vd == vd);
if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
return (B_FALSE);
if (zio->io_type == ZIO_TYPE_READ)
return (!vd->vdev_cant_read);
if (zio->io_type == ZIO_TYPE_WRITE)
return (!vd->vdev_cant_write);
return (B_TRUE);
}
/*
* Get statistics for the given vdev.
*/
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *tvd = vd->vdev_top;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
if (vd->vdev_ops->vdev_op_leaf)
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
/*
* Report expandable space on top-level, non-auxillary devices only.
* The expandable space is reported in terms of metaslab sized units
* since that determines how much space the pool can expand.
*/
if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
}
vs->vs_configured_ashift = vd->vdev_top != NULL
? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
vs->vs_logical_ashift = vd->vdev_logical_ashift;
vs->vs_physical_ashift = vd->vdev_physical_ashift;
- if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
+ if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
+ vdev_is_concrete(vd)) {
vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
}
/*
* If we're getting stats on the root vdev, aggregate the I/O counts
* over all top-level vdevs (i.e. the direct children of the root).
*/
if (vd == rvd) {
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *cvd = rvd->vdev_child[c];
vdev_stat_t *cvs = &cvd->vdev_stat;
for (int t = 0; t < ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
cvs->vs_scan_removing = cvd->vdev_removing;
}
}
mutex_exit(&vd->vdev_stat_lock);
}
void
vdev_clear_stats(vdev_t *vd)
{
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_space = 0;
vd->vdev_stat.vs_dspace = 0;
vd->vdev_stat.vs_alloc = 0;
mutex_exit(&vd->vdev_stat_lock);
}
void
vdev_scan_stat_init(vdev_t *vd)
{
vdev_stat_t *vs = &vd->vdev_stat;
for (int c = 0; c < vd->vdev_children; c++)
vdev_scan_stat_init(vd->vdev_child[c]);
mutex_enter(&vd->vdev_stat_lock);
vs->vs_scan_processed = 0;
mutex_exit(&vd->vdev_stat_lock);
}
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
spa_t *spa = zio->io_spa;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
vdev_t *pvd;
uint64_t txg = zio->io_txg;
vdev_stat_t *vs = &vd->vdev_stat;
zio_type_t type = zio->io_type;
int flags = zio->io_flags;
/*
* If this i/o is a gang leader, it didn't do any actual work.
*/
if (zio->io_gang_tree)
return;
if (zio->io_error == 0) {
/*
* If this is a root i/o, don't count it -- we've already
* counted the top-level vdevs, and vdev_get_stats() will
* aggregate them when asked. This reduces contention on
* the root vdev_stat_lock and implicitly handles blocks
* that compress away to holes, for which there is no i/o.
* (Holes never create vdev children, so all the counters
* remain zero, which is what we want.)
*
* Note: this only applies to successful i/o (io_error == 0)
* because unlike i/o counts, errors are not additive.
* When reading a ditto block, for example, failure of
* one top-level vdev does not imply a root-level error.
*/
if (vd == rvd)
return;
ASSERT(vd == zio->io_vd);
if (flags & ZIO_FLAG_IO_BYPASS)
return;
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
if (flags & ZIO_FLAG_SCAN_THREAD) {
dsl_scan_phys_t *scn_phys =
&spa->spa_dsl_pool->dp_scan->scn_phys;
uint64_t *processed = &scn_phys->scn_processed;
/* XXX cleanup? */
if (vd->vdev_ops->vdev_op_leaf)
atomic_add_64(processed, psize);
vs->vs_scan_processed += psize;
}
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
}
vs->vs_ops[type]++;
vs->vs_bytes[type] += psize;
mutex_exit(&vd->vdev_stat_lock);
return;
}
if (flags & ZIO_FLAG_SPECULATIVE)
return;
/*
* If this is an I/O error that is going to be retried, then ignore the
* error. Otherwise, the user may interpret B_FAILFAST I/O errors as
* hard errors, when in reality they can happen for any number of
* innocuous reasons (bus resets, MPxIO link failure, etc).
*/
if (zio->io_error == EIO &&
!(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
/*
* Intent logs writes won't propagate their error to the root
* I/O so don't mark these types of failures as pool-level
* errors.
*/
if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
return;
mutex_enter(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM)
vs->vs_checksum_errors++;
else
vs->vs_read_errors++;
}
if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
vs->vs_write_errors++;
mutex_exit(&vd->vdev_stat_lock);
- if (type == ZIO_TYPE_WRITE && txg != 0 &&
+ if (spa->spa_load_state == SPA_LOAD_NONE &&
+ type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
(flags & ZIO_FLAG_SCAN_THREAD) ||
spa->spa_claiming)) {
/*
* This is either a normal write (not a repair), or it's
* a repair induced by the scrub thread, or it's a repair
* made by zil_claim() during spa_load() in the first txg.
* In the normal case, we commit the DTL change in the same
* txg as the block was born. In the scrub-induced repair
* case, we know that scrubs run in first-pass syncing context,
* so we commit the DTL change in spa_syncing_txg(spa).
* In the zil_claim() case, we commit in spa_first_txg(spa).
*
* We currently do not make DTL entries for failed spontaneous
* self-healing writes triggered by normal (non-scrubbing)
* reads, because we have no transactional context in which to
* do so -- and it's not clear that it'd be desirable anyway.
*/
if (vd->vdev_ops->vdev_op_leaf) {
uint64_t commit_txg = txg;
if (flags & ZIO_FLAG_SCAN_THREAD) {
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
ASSERT(spa_sync_pass(spa) == 1);
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
commit_txg = spa_syncing_txg(spa);
} else if (spa->spa_claiming) {
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
commit_txg = spa_first_txg(spa);
}
ASSERT(commit_txg >= spa_syncing_txg(spa));
if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
return;
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
}
if (vd != rvd)
vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
}
}
/*
* Update the in-core space usage stats for this vdev, its metaslab class,
* and the root vdev.
*/
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
int64_t space_delta)
{
int64_t dspace_delta = space_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
metaslab_group_t *mg = vd->vdev_mg;
metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
/*
* Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
* factor. We must calculate this here and not at the root vdev
* because the root vdev's psize-to-asize is simply the max of its
* childrens', thus not accurate enough for us.
*/
ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
vd->vdev_deflate_ratio;
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_alloc += alloc_delta;
vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
if (mc == spa_normal_class(spa)) {
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
if (mc != NULL) {
ASSERT(rvd == vd->vdev_parent);
ASSERT(vd->vdev_ms_count != 0);
metaslab_class_space_update(mc,
alloc_delta, defer_delta, space_delta, dspace_delta);
}
}
/*
* Mark a top-level vdev's config as dirty, placing it on the dirty list
* so that it will be written out next time the vdev configuration is synced.
* If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
*/
void
vdev_config_dirty(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
int c;
ASSERT(spa_writeable(spa));
/*
* If this is an aux vdev (as with l2cache and spare devices), then we
* update the vdev config manually and set the sync flag.
*/
if (vd->vdev_aux != NULL) {
spa_aux_vdev_t *sav = vd->vdev_aux;
nvlist_t **aux;
uint_t naux;
for (c = 0; c < sav->sav_count; c++) {
if (sav->sav_vdevs[c] == vd)
break;
}
if (c == sav->sav_count) {
/*
* We're being removed. There's nothing more to do.
*/
ASSERT(sav->sav_sync == B_TRUE);
return;
}
sav->sav_sync = B_TRUE;
if (nvlist_lookup_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
}
ASSERT(c < naux);
/*
* Setting the nvlist in the middle if the array is a little
* sketchy, but it will work.
*/
nvlist_free(aux[c]);
aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
return;
}
/*
* The dirty list is protected by the SCL_CONFIG lock. The caller
* must either hold SCL_CONFIG as writer, or must be the sync thread
* (which holds SCL_CONFIG as reader). There's only one sync thread,
* so this is sufficient to ensure mutual exclusion.
*/
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_CONFIG, RW_READER)));
if (vd == rvd) {
for (c = 0; c < rvd->vdev_children; c++)
vdev_config_dirty(rvd->vdev_child[c]);
} else {
ASSERT(vd == vd->vdev_top);
if (!list_link_active(&vd->vdev_config_dirty_node) &&
- !vd->vdev_ishole)
+ vdev_is_concrete(vd)) {
list_insert_head(&spa->spa_config_dirty_list, vd);
+ }
}
}
void
vdev_config_clean(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_CONFIG, RW_READER)));
ASSERT(list_link_active(&vd->vdev_config_dirty_node));
list_remove(&spa->spa_config_dirty_list, vd);
}
/*
* Mark a top-level vdev's state as dirty, so that the next pass of
* spa_sync() can convert this into vdev_config_dirty(). We distinguish
* the state changes from larger config changes because they require
* much less locking, and are often needed for administrative actions.
*/
void
vdev_state_dirty(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_writeable(spa));
ASSERT(vd == vd->vdev_top);
/*
* The state list is protected by the SCL_STATE lock. The caller
* must either hold SCL_STATE as writer, or must be the sync thread
* (which holds SCL_STATE as reader). There's only one sync thread,
* so this is sufficient to ensure mutual exclusion.
*/
ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_STATE, RW_READER)));
- if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
+ if (!list_link_active(&vd->vdev_state_dirty_node) &&
+ vdev_is_concrete(vd))
list_insert_head(&spa->spa_state_dirty_list, vd);
}
void
vdev_state_clean(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_STATE, RW_READER)));
ASSERT(list_link_active(&vd->vdev_state_dirty_node));
list_remove(&spa->spa_state_dirty_list, vd);
}
/*
* Propagate vdev state up from children to parent.
*/
void
vdev_propagate_state(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
int degraded = 0, faulted = 0;
int corrupted = 0;
vdev_t *child;
if (vd->vdev_children > 0) {
for (int c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
/*
- * Don't factor holes into the decision.
+ * Don't factor holes or indirect vdevs into the
+ * decision.
*/
- if (child->vdev_ishole)
+ if (!vdev_is_concrete(child))
continue;
if (!vdev_readable(child) ||
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
* Root special: if there is a top-level log
* device, treat the root vdev as if it were
* degraded.
*/
if (child->vdev_islog && vd == rvd)
degraded++;
else
faulted++;
} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
degraded++;
}
if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
corrupted++;
}
vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
/*
* Root special: if there is a top-level vdev that cannot be
* opened due to corrupted metadata, then propagate the root
* vdev's aux state as 'corrupt' rather than 'insufficient
* replicas'.
*/
if (corrupted && vd == rvd &&
rvd->vdev_state == VDEV_STATE_CANT_OPEN)
vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
}
if (vd->vdev_parent)
vdev_propagate_state(vd->vdev_parent);
}
/*
* Set a vdev's state. If this is during an open, we don't update the parent
* state, because we're in the process of opening children depth-first.
* Otherwise, we propagate the change to the parent.
*
* If this routine places a device in a faulted state, an appropriate ereport is
* generated.
*/
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
uint64_t save_state;
spa_t *spa = vd->vdev_spa;
if (state == vd->vdev_state) {
vd->vdev_stat.vs_aux = aux;
return;
}
save_state = vd->vdev_state;
vd->vdev_state = state;
vd->vdev_stat.vs_aux = aux;
/*
* If we are setting the vdev state to anything but an open state, then
* always close the underlying device unless the device has requested
* a delayed close (i.e. we're about to remove or fault the device).
* Otherwise, we keep accessible but invalid devices open forever.
* We don't call vdev_close() itself, because that implies some extra
* checks (offline, etc) that we don't want here. This is limited to
* leaf devices, because otherwise closing the device will affect other
* children.
*/
if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_close(vd);
if (vd->vdev_removed &&
state == VDEV_STATE_CANT_OPEN &&
(aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
/*
* If the previous state is set to VDEV_STATE_REMOVED, then this
* device was previously marked removed and someone attempted to
* reopen it. If this failed due to a nonexistent device, then
* keep the device in the REMOVED state. We also let this be if
* it is one of our special test online cases, which is only
* attempting to online the device and shouldn't generate an FMA
* fault.
*/
vd->vdev_state = VDEV_STATE_REMOVED;
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
} else if (state == VDEV_STATE_REMOVED) {
vd->vdev_removed = B_TRUE;
} else if (state == VDEV_STATE_CANT_OPEN) {
/*
* If we fail to open a vdev during an import or recovery, we
* mark it as "not available", which signifies that it was
* never there to begin with. Failure to open such a device
* is not considered an error.
*/
if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
spa_load_state(spa) == SPA_LOAD_RECOVER) &&
vd->vdev_ops->vdev_op_leaf)
vd->vdev_not_present = 1;
/*
* Post the appropriate ereport. If the 'prevstate' field is
* set to something other than VDEV_STATE_UNKNOWN, it indicates
* that this is part of a vdev_reopen(). In this case, we don't
* want to post the ereport if the device was already in the
* CANT_OPEN state beforehand.
*
* If the 'checkremove' flag is set, then this is an attempt to
* online the device in response to an insertion event. If we
* hit this case, then we have detected an insertion event for a
* faulted or offline device that wasn't in the removed state.
* In this scenario, we don't post an ereport because we are
* about to replace the device, or attempt an online with
* vdev_forcefault, which will generate the fault for us.
*/
if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
!vd->vdev_not_present && !vd->vdev_checkremove &&
vd != spa->spa_root_vdev) {
const char *class;
switch (aux) {
case VDEV_AUX_OPEN_FAILED:
class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
break;
case VDEV_AUX_CORRUPT_DATA:
class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
break;
case VDEV_AUX_NO_REPLICAS:
class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
break;
case VDEV_AUX_BAD_GUID_SUM:
class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
break;
case VDEV_AUX_TOO_SMALL:
class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
break;
case VDEV_AUX_BAD_LABEL:
class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
break;
default:
class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
}
zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
}
/* Erase any notion of persistent removed state */
vd->vdev_removed = B_FALSE;
} else {
vd->vdev_removed = B_FALSE;
}
/*
* Notify the fmd of the state change. Be verbose and post
* notifications even for stuff that's not important; the fmd agent can
* sort it out. Don't emit state change events for non-leaf vdevs since
* they can't change state on their own. The FMD can check their state
* if it wants to when it sees that a leaf vdev had a state change.
*/
if (vd->vdev_ops->vdev_op_leaf)
zfs_post_state_change(spa, vd);
if (!isopen && vd->vdev_parent)
vdev_propagate_state(vd->vdev_parent);
}
/*
* Check the vdev configuration to ensure that it's capable of supporting
* a root pool. We do not support partial configuration.
* In addition, only a single top-level vdev is allowed.
*
* FreeBSD does not have above limitations.
*/
boolean_t
vdev_is_bootable(vdev_t *vd)
{
#ifdef illumos
if (!vd->vdev_ops->vdev_op_leaf) {
char *vdev_type = vd->vdev_ops->vdev_op_type;
if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
vd->vdev_children > 1) {
return (B_FALSE);
- } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
+ } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
+ strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
return (B_FALSE);
}
}
for (int c = 0; c < vd->vdev_children; c++) {
if (!vdev_is_bootable(vd->vdev_child[c]))
return (B_FALSE);
}
#endif /* illumos */
return (B_TRUE);
}
+boolean_t
+vdev_is_concrete(vdev_t *vd)
+{
+ vdev_ops_t *ops = vd->vdev_ops;
+ if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
+ ops == &vdev_missing_ops || ops == &vdev_root_ops) {
+ return (B_FALSE);
+ } else {
+ return (B_TRUE);
+ }
+}
+
/*
* Load the state from the original vdev tree (ovd) which
* we've retrieved from the MOS config object. If the original
* vdev was offline or faulted then we transfer that state to the
* device in the current vdev tree (nvd).
*/
void
vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
spa_t *spa = nvd->vdev_spa;
ASSERT(nvd->vdev_top->vdev_islog);
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
for (int c = 0; c < nvd->vdev_children; c++)
vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
if (nvd->vdev_ops->vdev_op_leaf) {
/*
* Restore the persistent vdev state
*/
nvd->vdev_offline = ovd->vdev_offline;
nvd->vdev_faulted = ovd->vdev_faulted;
nvd->vdev_degraded = ovd->vdev_degraded;
nvd->vdev_removed = ovd->vdev_removed;
}
}
/*
* Determine if a log device has valid content. If the vdev was
* removed or faulted in the MOS config then we know that
* the content on the log device has already been written to the pool.
*/
boolean_t
vdev_log_state_valid(vdev_t *vd)
{
if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
!vd->vdev_removed)
return (B_TRUE);
for (int c = 0; c < vd->vdev_children; c++)
if (vdev_log_state_valid(vd->vdev_child[c]))
return (B_TRUE);
return (B_FALSE);
}
/*
* Expand a vdev if possible.
*/
void
vdev_expand(vdev_t *vd, uint64_t txg)
{
ASSERT(vd->vdev_top == vd);
ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+ vdev_set_deflate_ratio(vd);
+
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ vdev_is_concrete(vd)) {
VERIFY(vdev_metaslab_init(vd, txg) == 0);
vdev_config_dirty(vd);
}
}
/*
* Split a vdev.
*/
void
vdev_split(vdev_t *vd)
{
vdev_t *cvd, *pvd = vd->vdev_parent;
vdev_remove_child(pvd, vd);
vdev_compact_children(pvd);
cvd = pvd->vdev_child[0];
if (pvd->vdev_children == 1) {
vdev_remove_parent(cvd);
cvd->vdev_splitting = B_TRUE;
}
vdev_propagate_state(cvd);
}
void
vdev_deadman(vdev_t *vd)
{
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
vdev_deadman(cvd);
}
if (vd->vdev_ops->vdev_op_leaf) {
vdev_queue_t *vq = &vd->vdev_queue;
mutex_enter(&vq->vq_lock);
if (avl_numnodes(&vq->vq_active_tree) > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;
/*
* Look at the head of all the pending queues,
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime we panic the system.
*/
fio = avl_first(&vq->vq_active_tree);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa)) {
zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
"delta %lluns, last io %lluns",
fio->io_timestamp, delta,
vq->vq_io_complete_ts);
fm_panic("I/O to pool '%s' appears to be "
"hung on vdev guid %llu at '%s'.",
spa_name(spa),
(long long unsigned int) vd->vdev_guid,
vd->vdev_path);
}
}
mutex_exit(&vq->vq_lock);
}
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c (revision 332525)
@@ -1,918 +1,919 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <sys/efi_partition.h>
#include <sys/fm/fs/zfs.h>
/*
* Virtual device vector for disks.
*/
extern ldi_ident_t zfs_li;
static void vdev_disk_close(vdev_t *);
typedef struct vdev_disk_ldi_cb {
list_node_t lcb_next;
ldi_callback_id_t lcb_id;
} vdev_disk_ldi_cb_t;
static void
vdev_disk_alloc(vdev_t *vd)
{
vdev_disk_t *dvd;
dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
/*
* Create the LDI event callback list.
*/
list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
offsetof(vdev_disk_ldi_cb_t, lcb_next));
}
static void
vdev_disk_free(vdev_t *vd)
{
vdev_disk_t *dvd = vd->vdev_tsd;
vdev_disk_ldi_cb_t *lcb;
if (dvd == NULL)
return;
/*
* We have already closed the LDI handle. Clean up the LDI event
* callbacks and free vd->vdev_tsd.
*/
while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
list_remove(&dvd->vd_ldi_cbs, lcb);
(void) ldi_ev_remove_callbacks(lcb->lcb_id);
kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
}
list_destroy(&dvd->vd_ldi_cbs);
kmem_free(dvd, sizeof (vdev_disk_t));
vd->vdev_tsd = NULL;
}
/* ARGSUSED */
static int
vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
void *ev_data)
{
vdev_t *vd = (vdev_t *)arg;
vdev_disk_t *dvd = vd->vdev_tsd;
/*
* Ignore events other than offline.
*/
if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
return (LDI_EV_SUCCESS);
/*
* All LDI handles must be closed for the state change to succeed, so
* call on vdev_disk_close() to do this.
*
* We inform vdev_disk_close that it is being called from offline
* notify context so it will defer cleanup of LDI event callbacks and
* freeing of vd->vdev_tsd to the offline finalize or a reopen.
*/
dvd->vd_ldi_offline = B_TRUE;
vdev_disk_close(vd);
/*
* Now that the device is closed, request that the spa_async_thread
* mark the device as REMOVED and notify FMA of the removal.
*/
zfs_post_remove(vd->vdev_spa, vd);
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
return (LDI_EV_SUCCESS);
}
/* ARGSUSED */
static void
vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
int ldi_result, void *arg, void *ev_data)
{
vdev_t *vd = (vdev_t *)arg;
/*
* Ignore events other than offline.
*/
if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
return;
/*
* We have already closed the LDI handle in notify.
* Clean up the LDI event callbacks and free vd->vdev_tsd.
*/
vdev_disk_free(vd);
/*
* Request that the vdev be reopened if the offline state change was
* unsuccessful.
*/
if (ldi_result != LDI_EV_SUCCESS) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
}
}
static ldi_ev_callback_t vdev_disk_off_callb = {
.cb_vers = LDI_EV_CB_VERS,
.cb_notify = vdev_disk_off_notify,
.cb_finalize = vdev_disk_off_finalize
};
/* ARGSUSED */
static void
vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
int ldi_result, void *arg, void *ev_data)
{
vdev_t *vd = (vdev_t *)arg;
/*
* Ignore events other than degrade.
*/
if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
return;
/*
* Degrade events always succeed. Mark the vdev as degraded.
* This status is purely informative for the user.
*/
(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
}
static ldi_ev_callback_t vdev_disk_dgrd_callb = {
.cb_vers = LDI_EV_CB_VERS,
.cb_notify = NULL,
.cb_finalize = vdev_disk_dgrd_finalize
};
static void
vdev_disk_hold(vdev_t *vd)
{
ddi_devid_t devid;
char *minor;
ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
/*
* We must have a pathname, and it must be absolute.
*/
if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
return;
/*
* Only prefetch path and devid info if the device has
* never been opened.
*/
if (vd->vdev_tsd != NULL)
return;
if (vd->vdev_wholedisk == -1ULL) {
size_t len = strlen(vd->vdev_path) + 3;
char *buf = kmem_alloc(len, KM_SLEEP);
(void) snprintf(buf, len, "%ss0", vd->vdev_path);
(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
kmem_free(buf, len);
}
if (vd->vdev_name_vp == NULL)
(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
if (vd->vdev_devid != NULL &&
ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
ddi_devid_str_free(minor);
ddi_devid_free(devid);
}
}
static void
vdev_disk_rele(vdev_t *vd)
{
ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
if (vd->vdev_name_vp) {
VN_RELE_ASYNC(vd->vdev_name_vp,
dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
vd->vdev_name_vp = NULL;
}
if (vd->vdev_devid_vp) {
VN_RELE_ASYNC(vd->vdev_devid_vp,
dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
vd->vdev_devid_vp = NULL;
}
}
/*
* We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
* even a fallback to DKIOCGMEDIAINFO fails.
*/
#ifdef DEBUG
#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__)
#else
#define VDEV_DEBUG(...) /* Nothing... */
#endif
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *ashift)
{
spa_t *spa = vd->vdev_spa;
vdev_disk_t *dvd = vd->vdev_tsd;
ldi_ev_cookie_t ecookie;
vdev_disk_ldi_cb_t *lcb;
union {
struct dk_minfo_ext ude;
struct dk_minfo ud;
} dks;
struct dk_minfo_ext *dkmext = &dks.ude;
struct dk_minfo *dkm = &dks.ud;
int error;
dev_t dev;
int otyp;
boolean_t validate_devid = B_FALSE;
ddi_devid_t devid;
uint64_t capacity = 0, blksz = 0, pbsize;
/*
* We must have a pathname, and it must be absolute.
*/
if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (SET_ERROR(EINVAL));
}
/*
* Reopen the device if it's not currently open. Otherwise,
* just update the physical size of the device.
*/
if (dvd != NULL) {
if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
/*
* If we are opening a device in its offline notify
* context, the LDI handle was just closed. Clean
* up the LDI event callbacks and free vd->vdev_tsd.
*/
vdev_disk_free(vd);
} else {
ASSERT(vd->vdev_reopening);
goto skip_open;
}
}
/*
* Create vd->vdev_tsd.
*/
vdev_disk_alloc(vd);
dvd = vd->vdev_tsd;
/*
* When opening a disk device, we want to preserve the user's original
* intent. We always want to open the device by the path the user gave
* us, even if it is one of multiple paths to the save device. But we
* also want to be able to survive disks being removed/recabled.
* Therefore the sequence of opening devices is:
*
* 1. Try opening the device by path. For legacy pools without the
* 'whole_disk' property, attempt to fix the path by appending 's0'.
*
* 2. If the devid of the device matches the stored value, return
* success.
*
* 3. Otherwise, the device may have moved. Try opening the device
* by the devid instead.
*/
if (vd->vdev_devid != NULL) {
if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
&dvd->vd_minor) != 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (SET_ERROR(EINVAL));
}
}
error = EINVAL; /* presume failure */
if (vd->vdev_path != NULL) {
if (vd->vdev_wholedisk == -1ULL) {
size_t len = strlen(vd->vdev_path) + 3;
char *buf = kmem_alloc(len, KM_SLEEP);
(void) snprintf(buf, len, "%ss0", vd->vdev_path);
error = ldi_open_by_name(buf, spa_mode(spa), kcred,
&dvd->vd_lh, zfs_li);
if (error == 0) {
spa_strfree(vd->vdev_path);
vd->vdev_path = buf;
vd->vdev_wholedisk = 1ULL;
} else {
kmem_free(buf, len);
}
}
/*
* If we have not yet opened the device, try to open it by the
* specified path.
*/
if (error != 0) {
error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
kcred, &dvd->vd_lh, zfs_li);
}
/*
* Compare the devid to the stored value.
*/
if (error == 0 && vd->vdev_devid != NULL &&
ldi_get_devid(dvd->vd_lh, &devid) == 0) {
if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
error = SET_ERROR(EINVAL);
(void) ldi_close(dvd->vd_lh, spa_mode(spa),
kcred);
dvd->vd_lh = NULL;
}
ddi_devid_free(devid);
}
/*
* If we succeeded in opening the device, but 'vdev_wholedisk'
* is not yet set, then this must be a slice.
*/
if (error == 0 && vd->vdev_wholedisk == -1ULL)
vd->vdev_wholedisk = 0;
}
/*
* If we were unable to open by path, or the devid check fails, open by
* devid instead.
*/
if (error != 0 && vd->vdev_devid != NULL) {
error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
}
/*
* If all else fails, then try opening by physical path (if available)
* or the logical path (if we failed due to the devid check). While not
* as reliable as the devid, this will give us something, and the higher
* level vdev validation will prevent us from opening the wrong device.
*/
if (error) {
if (vd->vdev_devid != NULL)
validate_devid = B_TRUE;
if (vd->vdev_physpath != NULL &&
(dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
kcred, &dvd->vd_lh, zfs_li);
/*
* Note that we don't support the legacy auto-wholedisk support
* as above. This hasn't been used in a very long time and we
* don't need to propagate its oddities to this edge condition.
*/
if (error && vd->vdev_path != NULL)
error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
kcred, &dvd->vd_lh, zfs_li);
}
if (error) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
return (error);
}
/*
* Now that the device has been successfully opened, update the devid
* if necessary.
*/
if (validate_devid && spa_writeable(spa) &&
ldi_get_devid(dvd->vd_lh, &devid) == 0) {
if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
char *vd_devid;
vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
zfs_dbgmsg("vdev %s: update devid from %s, "
"to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
spa_strfree(vd->vdev_devid);
vd->vdev_devid = spa_strdup(vd_devid);
ddi_devid_str_free(vd_devid);
}
ddi_devid_free(devid);
}
/*
* Once a device is opened, verify that the physical device path (if
* available) is up to date.
*/
if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
char *physpath, *minorname;
physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
minorname = NULL;
if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
(vd->vdev_physpath == NULL ||
strcmp(vd->vdev_physpath, physpath) != 0)) {
if (vd->vdev_physpath)
spa_strfree(vd->vdev_physpath);
(void) strlcat(physpath, ":", MAXPATHLEN);
(void) strlcat(physpath, minorname, MAXPATHLEN);
vd->vdev_physpath = spa_strdup(physpath);
}
if (minorname)
kmem_free(minorname, strlen(minorname) + 1);
kmem_free(physpath, MAXPATHLEN);
}
/*
* Register callbacks for the LDI offline event.
*/
if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
LDI_EV_SUCCESS) {
lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
list_insert_tail(&dvd->vd_ldi_cbs, lcb);
(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
&vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
}
/*
* Register callbacks for the LDI degrade event.
*/
if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
LDI_EV_SUCCESS) {
lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
list_insert_tail(&dvd->vd_ldi_cbs, lcb);
(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
&vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
}
skip_open:
/*
* Determine the actual size of the device.
*/
if (ldi_get_size(dvd->vd_lh, psize) != 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
return (SET_ERROR(EINVAL));
}
*max_psize = *psize;
/*
* Determine the device's minimum transfer size.
* If the ioctl isn't supported, assume DEV_BSIZE.
*/
if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
(intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
capacity = dkmext->dki_capacity - 1;
blksz = dkmext->dki_lbsize;
pbsize = dkmext->dki_pbsize;
} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
(intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
VDEV_DEBUG(
"vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
vd->vdev_path);
capacity = dkm->dki_capacity - 1;
blksz = dkm->dki_lbsize;
pbsize = blksz;
} else {
VDEV_DEBUG("vdev_disk_open(\"%s\"): "
"both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
vd->vdev_path, error);
pbsize = DEV_BSIZE;
}
*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
if (vd->vdev_wholedisk == 1) {
int wce = 1;
if (error == 0) {
/*
* If we have the capability to expand, we'd have
* found out via success from DKIOCGMEDIAINFO{,EXT}.
* Adjust max_psize upward accordingly since we know
* we own the whole disk now.
*/
*max_psize = capacity * blksz;
}
/*
* Since we own the whole disk, try to enable disk write
* caching. We ignore errors because it's OK if we can't do it.
*/
(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
FKIOCTL, kcred, NULL);
}
/*
* Clear the nowritecache bit, so that on a vdev_reopen() we will
* try again.
*/
vd->vdev_nowritecache = B_FALSE;
return (0);
}
static void
vdev_disk_close(vdev_t *vd)
{
vdev_disk_t *dvd = vd->vdev_tsd;
if (vd->vdev_reopening || dvd == NULL)
return;
if (dvd->vd_minor != NULL) {
ddi_devid_str_free(dvd->vd_minor);
dvd->vd_minor = NULL;
}
if (dvd->vd_devid != NULL) {
ddi_devid_free(dvd->vd_devid);
dvd->vd_devid = NULL;
}
if (dvd->vd_lh != NULL) {
(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
dvd->vd_lh = NULL;
}
vd->vdev_delayed_close = B_FALSE;
/*
* If we closed the LDI handle due to an offline notify from LDI,
* don't free vd->vdev_tsd or unregister the callbacks here;
* the offline finalize callback or a reopen will take care of it.
*/
if (dvd->vd_ldi_offline)
return;
vdev_disk_free(vd);
}
int
vdev_disk_physio(vdev_t *vd, caddr_t data,
size_t size, uint64_t offset, int flags, boolean_t isdump)
{
vdev_disk_t *dvd = vd->vdev_tsd;
/*
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
* Nothing to be done here but return failure.
*/
if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
return (EIO);
ASSERT(vd->vdev_ops == &vdev_disk_ops);
/*
* If in the context of an active crash dump, use the ldi_dump(9F)
* call instead of ldi_strategy(9F) as usual.
*/
if (isdump) {
ASSERT3P(dvd, !=, NULL);
return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
lbtodb(size)));
}
return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
}
int
vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
size_t size, uint64_t offset, int flags)
{
buf_t *bp;
int error = 0;
if (vd_lh == NULL)
return (SET_ERROR(EINVAL));
ASSERT(flags & B_READ || flags & B_WRITE);
bp = getrbuf(KM_SLEEP);
bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
bp->b_bcount = size;
bp->b_un.b_addr = (void *)data;
bp->b_lblkno = lbtodb(offset);
bp->b_bufsize = size;
error = ldi_strategy(vd_lh, bp);
ASSERT(error == 0);
if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
error = SET_ERROR(EIO);
freerbuf(bp);
return (error);
}
static void
vdev_disk_io_intr(buf_t *bp)
{
vdev_buf_t *vb = (vdev_buf_t *)bp;
zio_t *zio = vb->vb_io;
/*
* The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
* Rather than teach the rest of the stack about other error
* possibilities (EFAULT, etc), we normalize the error value here.
*/
zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
if (zio->io_error == 0 && bp->b_resid != 0)
zio->io_error = SET_ERROR(EIO);
if (zio->io_type == ZIO_TYPE_READ) {
abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
} else {
abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
}
kmem_free(vb, sizeof (vdev_buf_t));
zio_delay_interrupt(zio);
}
static void
vdev_disk_ioctl_free(zio_t *zio)
{
kmem_free(zio->io_vsd, sizeof (struct dk_callback));
}
static const zio_vsd_ops_t vdev_disk_vsd_ops = {
vdev_disk_ioctl_free,
zio_vsd_default_cksum_report
};
static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
zio_t *zio = zio_arg;
zio->io_error = error;
zio_interrupt(zio);
}
static void
vdev_disk_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_disk_t *dvd = vd->vdev_tsd;
vdev_buf_t *vb;
struct dk_callback *dkc;
buf_t *bp;
int error;
/*
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
* Nothing to be done here but return failure.
*/
if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return;
}
if (zio->io_type == ZIO_TYPE_IOCTL) {
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return;
}
switch (zio->io_cmd) {
case DKIOCFLUSHWRITECACHE:
if (zfs_nocacheflush)
break;
if (vd->vdev_nowritecache) {
zio->io_error = SET_ERROR(ENOTSUP);
break;
}
zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
zio->io_vsd_ops = &vdev_disk_vsd_ops;
dkc->dkc_callback = vdev_disk_ioctl_done;
dkc->dkc_flag = FLUSH_VOLATILE;
dkc->dkc_cookie = zio;
error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
(uintptr_t)dkc, FKIOCTL, kcred, NULL);
if (error == 0) {
/*
* The ioctl will be done asychronously,
* and will call vdev_disk_ioctl_done()
* upon completion.
*/
return;
}
zio->io_error = error;
break;
default:
zio->io_error = SET_ERROR(ENOTSUP);
}
zio_execute(zio);
return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
zio->io_target_timestamp = zio_handle_io_delay(zio);
vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
vb->vb_io = zio;
bp = &vb->vb_buf;
bioinit(bp);
bp->b_flags = B_BUSY | B_NOCACHE |
(zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
bp->b_flags |= B_FAILFAST;
bp->b_bcount = zio->io_size;
if (zio->io_type == ZIO_TYPE_READ) {
bp->b_un.b_addr =
abd_borrow_buf(zio->io_abd, zio->io_size);
} else {
bp->b_un.b_addr =
abd_borrow_buf_copy(zio->io_abd, zio->io_size);
}
bp->b_lblkno = lbtodb(zio->io_offset);
bp->b_bufsize = zio->io_size;
bp->b_iodone = (int (*)())vdev_disk_io_intr;
/* ldi_strategy() will return non-zero only on programming errors */
VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
}
static void
vdev_disk_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
/*
* If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
* the device has been removed. If this is the case, then we trigger an
* asynchronous removal of the device. Otherwise, probe the device and
* make sure it's still accessible.
*/
if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
vdev_disk_t *dvd = vd->vdev_tsd;
int state = DKIO_NONE;
if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
/*
* We post the resource as soon as possible, instead of
* when the async removal actually happens, because the
* DE is using this information to discard previous I/O
* errors.
*/
zfs_post_remove(zio->io_spa, vd);
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
} else if (!vd->vdev_delayed_close) {
vd->vdev_delayed_close = B_TRUE;
}
}
}
vdev_ops_t vdev_disk_ops = {
vdev_disk_open,
vdev_disk_close,
vdev_default_asize,
vdev_disk_io_start,
vdev_disk_io_done,
NULL,
vdev_disk_hold,
vdev_disk_rele,
+ NULL,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
/*
* Given the root disk device devid or pathname, read the label from
* the device, and construct a configuration nvlist.
*/
int
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
{
ldi_handle_t vd_lh;
vdev_label_t *label;
uint64_t s, size;
int l;
ddi_devid_t tmpdevid;
int error = -1;
char *minor_name;
/*
* Read the device label and build the nvlist.
*/
if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
&minor_name) == 0) {
error = ldi_open_by_devid(tmpdevid, minor_name,
FREAD, kcred, &vd_lh, zfs_li);
ddi_devid_free(tmpdevid);
ddi_devid_str_free(minor_name);
}
if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
zfs_li)))
return (error);
if (ldi_get_size(vd_lh, &s)) {
(void) ldi_close(vd_lh, FREAD, kcred);
return (SET_ERROR(EIO));
}
size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
*config = NULL;
for (l = 0; l < VDEV_LABELS; l++) {
uint64_t offset, state, txg = 0;
/* read vdev label */
offset = vdev_label_offset(size, l, 0);
if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
continue;
if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
*config = NULL;
continue;
}
if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
&state) != 0 || state >= POOL_STATE_DESTROYED) {
nvlist_free(*config);
*config = NULL;
continue;
}
if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
&txg) != 0 || txg == 0) {
nvlist_free(*config);
*config = NULL;
continue;
}
break;
}
kmem_free(label, sizeof (vdev_label_t));
(void) ldi_close(vd_lh, FREAD, kcred);
if (*config == NULL)
error = SET_ERROR(EIDRM);
return (error);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 332525)
@@ -1,294 +1,296 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_file.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/abd.h>
/*
* Virtual device vector for files.
*/
static taskq_t *vdev_file_taskq;
void
vdev_file_init(void)
{
vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
minclsyspri, max_ncpus, INT_MAX, 0);
}
void
vdev_file_fini(void)
{
taskq_destroy(vdev_file_taskq);
}
static void
vdev_file_hold(vdev_t *vd)
{
ASSERT(vd->vdev_path != NULL);
}
static void
vdev_file_rele(vdev_t *vd)
{
ASSERT(vd->vdev_path != NULL);
}
static int
vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
vdev_file_t *vf;
vnode_t *vp;
vattr_t vattr;
int error;
/*
* We must have a pathname, and it must be absolute.
*/
if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (SET_ERROR(EINVAL));
}
/*
* Reopen the device if it's not currently open. Otherwise,
* just update the physical size of the device.
*/
if (vd->vdev_tsd != NULL) {
ASSERT(vd->vdev_reopening);
vf = vd->vdev_tsd;
vp = vf->vf_vnode;
goto skip_open;
}
vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
/*
* We always open the files from the root of the global zone, even if
* we're in a local zone. If the user has gotten to this point, the
* administrator has already decided that the pool should be available
* to local zone users, so the underlying devices should be as well.
*/
ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
if (error) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
vd->vdev_tsd = NULL;
return (error);
}
vf->vf_vnode = vp;
#ifdef _KERNEL
/*
* Make sure it's a regular file.
*/
if (vp->v_type != VREG) {
#ifdef __FreeBSD__
(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
#endif
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
#ifdef __FreeBSD__
kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
vd->vdev_tsd = NULL;
#endif
return (SET_ERROR(ENODEV));
}
#endif /* _KERNEL */
skip_open:
/*
* Determine the physical size of the file.
*/
vattr.va_mask = AT_SIZE;
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &vattr, kcred);
VOP_UNLOCK(vp, 0);
if (error) {
(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
vd->vdev_tsd = NULL;
return (error);
}
vd->vdev_notrim = B_TRUE;
*max_psize = *psize = vattr.va_size;
*logical_ashift = SPA_MINBLOCKSHIFT;
*physical_ashift = SPA_MINBLOCKSHIFT;
return (0);
}
static void
vdev_file_close(vdev_t *vd)
{
vdev_file_t *vf = vd->vdev_tsd;
if (vd->vdev_reopening || vf == NULL)
return;
if (vf->vf_vnode != NULL) {
(void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
kcred, NULL);
}
vd->vdev_delayed_close = B_FALSE;
kmem_free(vf, sizeof (vdev_file_t));
vd->vdev_tsd = NULL;
}
/*
* Implements the interrupt side for file vdev types. This routine will be
* called when the I/O completes allowing us to transfer the I/O to the
* interrupt taskqs. For consistency, the code structure mimics disk vdev
* types.
*/
static void
vdev_file_io_intr(zio_t *zio)
{
zio_delay_interrupt(zio);
}
static void
vdev_file_io_strategy(void *arg)
{
zio_t *zio = arg;
vdev_t *vd = zio->io_vd;
vdev_file_t *vf;
vnode_t *vp;
void *addr;
ssize_t resid;
vf = vd->vdev_tsd;
vp = vf->vf_vnode;
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
if (zio->io_type == ZIO_TYPE_READ) {
addr = abd_borrow_buf(zio->io_abd, zio->io_size);
} else {
addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
}
zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
UIO_READ : UIO_WRITE, vp, addr, zio->io_size,
zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
if (zio->io_type == ZIO_TYPE_READ) {
abd_return_buf_copy(zio->io_abd, addr, zio->io_size);
} else {
abd_return_buf(zio->io_abd, addr, zio->io_size);
}
if (resid != 0 && zio->io_error == 0)
zio->io_error = ENOSPC;
vdev_file_io_intr(zio);
}
static void
vdev_file_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_file_t *vf = vd->vdev_tsd;
if (zio->io_type == ZIO_TYPE_IOCTL) {
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return;
}
switch (zio->io_cmd) {
case DKIOCFLUSHWRITECACHE:
zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
kcred, NULL);
break;
default:
zio->io_error = SET_ERROR(ENOTSUP);
}
zio_execute(zio);
return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
zio->io_target_timestamp = zio_handle_io_delay(zio);
VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
TQ_SLEEP), !=, 0);
}
/* ARGSUSED */
static void
vdev_file_io_done(zio_t *zio)
{
}
vdev_ops_t vdev_file_ops = {
vdev_file_open,
vdev_file_close,
vdev_default_asize,
vdev_file_io_start,
vdev_file_io_done,
NULL,
vdev_file_hold,
vdev_file_rele,
+ NULL,
VDEV_TYPE_FILE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
/*
* From userland we access disks just like files.
*/
#ifndef _KERNEL
vdev_ops_t vdev_disk_ops = {
vdev_file_open,
vdev_file_close,
vdev_default_asize,
vdev_file_io_start,
vdev_file_io_done,
NULL,
vdev_file_hold,
vdev_file_rele,
+ NULL,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
#endif
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 332525)
@@ -1,1152 +1,1153 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
*/
#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
/*
* Virtual device vector for GEOM.
*/
static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
.name = "ZFS::VDEV",
.version = G_VERSION,
.attrchanged = vdev_geom_attrchanged,
};
struct consumer_vdev_elem {
SLIST_ENTRY(consumer_vdev_elem) elems;
vdev_t *vd;
};
SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
== sizeof(struct consumer_priv_t*),
"consumer_priv_t* can't be stored in g_consumer.private");
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
&vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
&vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
/*
* Thread local storage used to indicate when a thread is probing geoms
* for their guids. If NULL, this thread is not tasting geoms. If non NULL,
* it is looking for a replacement for the vdev_t* that is its value.
*/
uint_t zfs_geom_probe_vdev_key;
static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
int error;
uint16_t rate;
error = g_getattr("GEOM::rotation_rate", cp, &rate);
if (error == 0)
vd->vdev_rotation_rate = rate;
else
vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}
static void
vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
boolean_t do_null_update)
{
boolean_t needs_update = B_FALSE;
char *physpath;
int error, physpath_len;
physpath_len = MAXPATHLEN;
physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
if (error == 0) {
char *old_physpath;
/* g_topology lock ensures that vdev has not been closed */
g_topology_assert();
old_physpath = vd->vdev_physpath;
vd->vdev_physpath = spa_strdup(physpath);
if (old_physpath != NULL) {
needs_update = (strcmp(old_physpath,
vd->vdev_physpath) != 0);
spa_strfree(old_physpath);
} else
needs_update = do_null_update;
}
g_free(physpath);
/*
* If the physical path changed, update the config.
* Only request an update for previously unset physpaths if
* requested by the caller.
*/
if (needs_update)
spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
char *old_physpath;
struct consumer_priv_t *priv;
struct consumer_vdev_elem *elem;
int error;
priv = (struct consumer_priv_t*)&cp->private;
if (SLIST_EMPTY(priv))
return;
SLIST_FOREACH(elem, priv, elems) {
vdev_t *vd = elem->vd;
if (strcmp(attr, "GEOM::rotation_rate") == 0) {
vdev_geom_set_rotation_rate(vd, cp);
return;
}
if (strcmp(attr, "GEOM::physpath") == 0) {
vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
return;
}
}
}
static void
vdev_geom_orphan(struct g_consumer *cp)
{
struct consumer_priv_t *priv;
struct consumer_vdev_elem *elem;
g_topology_assert();
priv = (struct consumer_priv_t*)&cp->private;
if (SLIST_EMPTY(priv))
/* Vdev close in progress. Ignore the event. */
return;
/*
* Orphan callbacks occur from the GEOM event thread.
* Concurrent with this call, new I/O requests may be
* working their way through GEOM about to find out
* (only once executed by the g_down thread) that we've
* been orphaned from our disk provider. These I/Os
* must be retired before we can detach our consumer.
* This is most easily achieved by acquiring the
* SPA ZIO configuration lock as a writer, but doing
* so with the GEOM topology lock held would cause
* a lock order reversal. Instead, rely on the SPA's
* async removal support to invoke a close on this
* vdev once it is safe to do so.
*/
SLIST_FOREACH(elem, priv, elems) {
vdev_t *vd = elem->vd;
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}
}
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
{
struct g_geom *gp;
struct g_consumer *cp;
int error;
g_topology_assert();
ZFS_LOG(1, "Attaching to %s.", pp->name);
if (sanity) {
if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
ZFS_LOG(1, "Failing attach of %s. "
"Incompatible sectorsize %d\n",
pp->name, pp->sectorsize);
return (NULL);
} else if (pp->mediasize < SPA_MINDEVSIZE) {
ZFS_LOG(1, "Failing attach of %s. "
"Incompatible mediasize %ju\n",
pp->name, pp->mediasize);
return (NULL);
}
}
/* Do we have geom already? No? Create one. */
LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
if (gp->flags & G_GEOM_WITHER)
continue;
if (strcmp(gp->name, "zfs::vdev") != 0)
continue;
break;
}
if (gp == NULL) {
gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
gp->orphan = vdev_geom_orphan;
gp->attrchanged = vdev_geom_attrchanged;
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
__LINE__, error);
vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
error = g_access(cp, 1, 0, 1);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
__LINE__, error);
vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
} else {
/* Check if we are already connected to this provider. */
LIST_FOREACH(cp, &gp->consumer, consumer) {
if (cp->provider == pp) {
ZFS_LOG(1, "Found consumer for %s.", pp->name);
break;
}
}
if (cp == NULL) {
cp = g_new_consumer(gp);
error = g_attach(cp, pp);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
__func__, __LINE__, error);
vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
error = g_access(cp, 1, 0, 1);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
__func__, __LINE__, error);
vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
ZFS_LOG(1, "Created consumer for %s.", pp->name);
} else {
error = g_access(cp, 1, 0, 1);
if (error != 0) {
ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
__func__, __LINE__, error);
return (NULL);
}
ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
}
}
if (vd != NULL)
vd->vdev_tsd = cp;
cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
return (cp);
}
static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
struct g_geom *gp;
g_topology_assert();
ZFS_LOG(1, "Detaching from %s.",
cp->provider && cp->provider->name ? cp->provider->name : "NULL");
gp = cp->geom;
if (open_for_read)
g_access(cp, -1, 0, -1);
/* Destroy consumer on last close. */
if (cp->acr == 0 && cp->ace == 0) {
if (cp->acw > 0)
g_access(cp, 0, -cp->acw, 0);
if (cp->provider != NULL) {
ZFS_LOG(1, "Destroying consumer for %s.",
cp->provider->name ? cp->provider->name : "NULL");
g_detach(cp);
}
g_destroy_consumer(cp);
}
/* Destroy geom if there are no consumers left. */
if (LIST_EMPTY(&gp->consumer)) {
ZFS_LOG(1, "Destroyed geom %s.", gp->name);
g_wither_geom(gp, ENXIO);
}
}
static void
vdev_geom_close_locked(vdev_t *vd)
{
struct g_consumer *cp;
struct consumer_priv_t *priv;
struct consumer_vdev_elem *elem, *elem_temp;
g_topology_assert();
cp = vd->vdev_tsd;
vd->vdev_delayed_close = B_FALSE;
if (cp == NULL)
return;
ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
priv = (struct consumer_priv_t*)&cp->private;
vd->vdev_tsd = NULL;
SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
if (elem->vd == vd) {
SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
g_free(elem);
}
}
vdev_geom_detach(cp, B_TRUE);
}
/*
* Issue one or more bios to the vdev in parallel
* cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO
* operation is described by parallel entries from each array. There may be
* more bios actually issued than entries in the array
*/
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
off_t *sizes, int *errors, int ncmds)
{
struct bio **bios;
u_char *p;
off_t off, maxio, s, end;
int i, n_bios, j;
size_t bios_size;
maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
n_bios = 0;
/* How many bios are required for all commands ? */
for (i = 0; i < ncmds; i++)
n_bios += (sizes[i] + maxio - 1) / maxio;
/* Allocate memory for the bios */
bios_size = n_bios * sizeof(struct bio*);
bios = kmem_zalloc(bios_size, KM_SLEEP);
/* Prepare and issue all of the bios */
for (i = j = 0; i < ncmds; i++) {
off = offsets[i];
p = datas[i];
s = sizes[i];
end = off + s;
ASSERT((off % cp->provider->sectorsize) == 0);
ASSERT((s % cp->provider->sectorsize) == 0);
for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
bios[j] = g_alloc_bio();
bios[j]->bio_cmd = cmds[i];
bios[j]->bio_done = NULL;
bios[j]->bio_offset = off;
bios[j]->bio_length = MIN(s, maxio);
bios[j]->bio_data = p;
g_io_request(bios[j], cp);
}
}
ASSERT(j == n_bios);
/* Wait for all of the bios to complete, and clean them up */
for (i = j = 0; i < ncmds; i++) {
off = offsets[i];
s = sizes[i];
end = off + s;
for (; off < end; off += maxio, s -= maxio, j++) {
errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
g_destroy_bio(bios[j]);
}
}
kmem_free(bios, bios_size);
}
/*
* Read the vdev config from a device. Return the number of valid labels that
* were found. The vdev config will be returned in config if and only if at
* least one valid label was found.
*/
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
struct g_provider *pp;
vdev_phys_t *vdev_lists[VDEV_LABELS];
char *buf;
size_t buflen;
uint64_t psize, state, txg;
off_t offsets[VDEV_LABELS];
off_t size;
off_t sizes[VDEV_LABELS];
int cmds[VDEV_LABELS];
int errors[VDEV_LABELS];
int l, nlabels;
g_topology_assert_not();
pp = cp->provider;
ZFS_LOG(1, "Reading config from %s...", pp->name);
psize = pp->mediasize;
psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
size = sizeof(*vdev_lists[0]) + pp->sectorsize -
((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
buflen = sizeof(vdev_lists[0]->vp_nvlist);
*config = NULL;
/* Create all of the IO requests */
for (l = 0; l < VDEV_LABELS; l++) {
cmds[l] = BIO_READ;
vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
sizes[l] = size;
errors[l] = 0;
ASSERT(offsets[l] % pp->sectorsize == 0);
}
/* Issue the IO requests */
vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
VDEV_LABELS);
/* Parse the labels */
nlabels = 0;
for (l = 0; l < VDEV_LABELS; l++) {
if (errors[l] != 0)
continue;
buf = vdev_lists[l]->vp_nvlist;
if (nvlist_unpack(buf, buflen, config, 0) != 0)
continue;
if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
&state) != 0 || state > POOL_STATE_L2CACHE) {
nvlist_free(*config);
*config = NULL;
continue;
}
if (state != POOL_STATE_SPARE &&
state != POOL_STATE_L2CACHE &&
(nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
&txg) != 0 || txg == 0)) {
nvlist_free(*config);
*config = NULL;
continue;
}
nlabels++;
}
/* Free the label storage */
for (l = 0; l < VDEV_LABELS; l++)
kmem_free(vdev_lists[l], size);
return (nlabels);
}
static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
nvlist_t **new_configs;
uint64_t i;
if (id < *count)
return;
new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
KM_SLEEP);
for (i = 0; i < *count; i++)
new_configs[i] = (*configs)[i];
if (*configs != NULL)
kmem_free(*configs, *count * sizeof(void *));
*configs = new_configs;
*count = id + 1;
}
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
const char *name, uint64_t* known_pool_guid)
{
nvlist_t *vdev_tree;
uint64_t pool_guid;
uint64_t vdev_guid, known_guid;
uint64_t id, txg, known_txg;
char *pname;
int i;
if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
strcmp(pname, name) != 0)
goto ignore;
if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
goto ignore;
if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
goto ignore;
if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
goto ignore;
if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
goto ignore;
VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
if (*known_pool_guid != 0) {
if (pool_guid != *known_pool_guid)
goto ignore;
} else
*known_pool_guid = pool_guid;
resize_configs(configs, count, id);
if ((*configs)[id] != NULL) {
VERIFY(nvlist_lookup_uint64((*configs)[id],
ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
if (txg <= known_txg)
goto ignore;
nvlist_free((*configs)[id]);
}
(*configs)[id] = cfg;
return;
ignore:
nvlist_free(cfg);
}
int
vdev_geom_read_pool_label(const char *name,
nvlist_t ***configs, uint64_t *count)
{
struct g_class *mp;
struct g_geom *gp;
struct g_provider *pp;
struct g_consumer *zcp;
nvlist_t *vdev_cfg;
uint64_t pool_guid;
int error, nlabels;
DROP_GIANT();
g_topology_lock();
*configs = NULL;
*count = 0;
pool_guid = 0;
LIST_FOREACH(mp, &g_classes, class) {
if (mp == &zfs_vdev_class)
continue;
LIST_FOREACH(gp, &mp->geom, geom) {
if (gp->flags & G_GEOM_WITHER)
continue;
LIST_FOREACH(pp, &gp->provider, provider) {
if (pp->flags & G_PF_WITHER)
continue;
zcp = vdev_geom_attach(pp, NULL, B_TRUE);
if (zcp == NULL)
continue;
g_topology_unlock();
nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
g_topology_lock();
vdev_geom_detach(zcp, B_TRUE);
if (nlabels == 0)
continue;
ZFS_LOG(1, "successfully read vdev config");
process_vdev_config(configs, count,
vdev_cfg, name, &pool_guid);
}
}
}
g_topology_unlock();
PICKUP_GIANT();
return (*count > 0 ? 0 : ENOENT);
}
enum match {
NO_MATCH = 0, /* No matching labels found */
TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid*/
ZERO_MATCH = 1, /* Should never be returned */
ONE_MATCH = 2, /* 1 label matching the vdev_guid */
TWO_MATCH = 3, /* 2 label matching the vdev_guid */
THREE_MATCH = 4, /* 3 label matching the vdev_guid */
FULL_MATCH = 5 /* all labels match the vdev_guid */
};
static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
nvlist_t *config;
uint64_t pool_guid, top_guid, vdev_guid;
struct g_consumer *cp;
int nlabels;
cp = vdev_geom_attach(pp, NULL, B_TRUE);
if (cp == NULL) {
ZFS_LOG(1, "Unable to attach tasting instance to %s.",
pp->name);
return (NO_MATCH);
}
g_topology_unlock();
nlabels = vdev_geom_read_config(cp, &config);
g_topology_lock();
vdev_geom_detach(cp, B_TRUE);
if (nlabels == 0) {
ZFS_LOG(1, "Unable to read config from %s.", pp->name);
return (NO_MATCH);
}
pool_guid = 0;
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
top_guid = 0;
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
vdev_guid = 0;
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
nvlist_free(config);
/*
* Check that the label's pool guid matches the desired guid.
* Inactive spares and L2ARCs do not have any pool guid in the label.
*/
if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
pp->name,
(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
return (NO_MATCH);
}
/*
* Check that the label's vdev guid matches the desired guid.
* The second condition handles possible race on vdev detach, when
* remaining vdev receives GUID of destroyed top level mirror vdev.
*/
if (vdev_guid == vd->vdev_guid) {
ZFS_LOG(1, "guids match for provider %s.", pp->name);
return (ZERO_MATCH + nlabels);
} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
return (TOPGUID_MATCH);
}
ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
return (NO_MATCH);
}
static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
struct g_class *mp;
struct g_geom *gp;
struct g_provider *pp, *best_pp;
struct g_consumer *cp;
enum match match, best_match;
g_topology_assert();
cp = NULL;
best_pp = NULL;
best_match = NO_MATCH;
LIST_FOREACH(mp, &g_classes, class) {
if (mp == &zfs_vdev_class)
continue;
LIST_FOREACH(gp, &mp->geom, geom) {
if (gp->flags & G_GEOM_WITHER)
continue;
LIST_FOREACH(pp, &gp->provider, provider) {
match = vdev_attach_ok(vd, pp);
if (match > best_match) {
best_match = match;
best_pp = pp;
}
if (match == FULL_MATCH)
goto out;
}
}
}
out:
if (best_pp) {
cp = vdev_geom_attach(best_pp, vd, B_TRUE);
if (cp == NULL) {
printf("ZFS WARNING: Unable to attach to %s.\n",
best_pp->name);
}
}
return (cp);
}
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
struct g_consumer *cp;
char *buf;
size_t len;
g_topology_assert();
ZFS_LOG(1, "Searching by guids [%ju:%ju].",
(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
cp = vdev_geom_attach_by_guids(vd);
if (cp != NULL) {
len = strlen(cp->provider->name) + strlen("/dev/") + 1;
buf = kmem_alloc(len, KM_SLEEP);
snprintf(buf, len, "/dev/%s", cp->provider->name);
spa_strfree(vd->vdev_path);
vd->vdev_path = buf;
ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
(uintmax_t)spa_guid(vd->vdev_spa),
(uintmax_t)vd->vdev_guid, cp->provider->name);
} else {
ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
(uintmax_t)spa_guid(vd->vdev_spa),
(uintmax_t)vd->vdev_guid);
}
return (cp);
}
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
struct g_provider *pp;
struct g_consumer *cp;
g_topology_assert();
cp = NULL;
pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
if (pp != NULL) {
ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
cp = vdev_geom_attach(pp, vd, B_FALSE);
}
return (cp);
}
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
struct g_provider *pp;
struct g_consumer *cp;
size_t bufsize;
int error;
/* Set the TLS to indicate downstack that we should not access zvols*/
VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
/*
* We must have a pathname, and it must be absolute.
*/
if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (EINVAL);
}
/*
* Reopen the device if it's not currently open. Otherwise,
* just update the physical size of the device.
*/
if ((cp = vd->vdev_tsd) != NULL) {
ASSERT(vd->vdev_reopening);
goto skip_open;
}
DROP_GIANT();
g_topology_lock();
error = 0;
if (vd->vdev_spa->spa_splitting_newspa ||
(vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
/*
* We are dealing with a vdev that hasn't been previously
* opened (since boot), and we are not loading an
* existing pool configuration. This looks like a
* vdev add operation to a new or existing pool.
* Assume the user knows what he/she is doing and find
* GEOM provider by its name, ignoring GUID mismatches.
*
* XXPOLICY: It would be safer to only allow a device
* that is unlabeled or labeled but missing
* GUID information to be opened in this fashion,
* unless we are doing a split, in which case we
* should allow any guid.
*/
cp = vdev_geom_open_by_path(vd, 0);
} else {
/*
* Try using the recorded path for this device, but only
* accept it if its label data contains the expected GUIDs.
*/
cp = vdev_geom_open_by_path(vd, 1);
if (cp == NULL) {
/*
* The device at vd->vdev_path doesn't have the
* expected GUIDs. The disks might have merely
* moved around so try all other GEOM providers
* to find one with the right GUIDs.
*/
cp = vdev_geom_open_by_guids(vd);
}
}
/* Clear the TLS now that tasting is done */
VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
if (cp == NULL) {
ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
error = ENOENT;
} else {
struct consumer_priv_t *priv;
struct consumer_vdev_elem *elem;
int spamode;
priv = (struct consumer_priv_t*)&cp->private;
if (cp->private == NULL)
SLIST_INIT(priv);
elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
elem->vd = vd;
SLIST_INSERT_HEAD(priv, elem, elems);
spamode = spa_mode(vd->vdev_spa);
if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
!ISP2(cp->provider->sectorsize)) {
ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
cp->provider->name);
vdev_geom_close_locked(vd);
error = EINVAL;
cp = NULL;
} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
int i;
for (i = 0; i < 5; i++) {
error = g_access(cp, 0, 1, 0);
if (error == 0)
break;
g_topology_unlock();
tsleep(vd, 0, "vdev", hz / 2);
g_topology_lock();
}
if (error != 0) {
printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
cp->provider->name, error);
vdev_geom_close_locked(vd);
cp = NULL;
}
}
}
/* Fetch initial physical path information for this device. */
if (cp != NULL) {
vdev_geom_attrchanged(cp, "GEOM::physpath");
/* Set other GEOM characteristics */
vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
vdev_geom_set_rotation_rate(vd, cp);
}
g_topology_unlock();
PICKUP_GIANT();
if (cp == NULL) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
return (error);
}
skip_open:
pp = cp->provider;
/*
* Determine the actual size of the device.
*/
*max_psize = *psize = pp->mediasize;
/*
* Determine the device's minimum transfer size and preferred
* transfer size.
*/
*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
*physical_ashift = 0;
if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
*physical_ashift = highbit(pp->stripesize) - 1;
/*
* Clear the nowritecache settings, so that on a vdev_reopen()
* we will try again.
*/
vd->vdev_nowritecache = B_FALSE;
return (0);
}
static void
vdev_geom_close(vdev_t *vd)
{
struct g_consumer *cp;
cp = vd->vdev_tsd;
DROP_GIANT();
g_topology_lock();
if (!vd->vdev_reopening ||
(cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
(cp->provider != NULL && cp->provider->error != 0))))
vdev_geom_close_locked(vd);
g_topology_unlock();
PICKUP_GIANT();
}
static void
vdev_geom_io_intr(struct bio *bp)
{
vdev_t *vd;
zio_t *zio;
zio = bp->bio_caller1;
vd = zio->io_vd;
zio->io_error = bp->bio_error;
if (zio->io_error == 0 && bp->bio_resid != 0)
zio->io_error = SET_ERROR(EIO);
switch(zio->io_error) {
case ENOTSUP:
/*
* If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
* that future attempts will never succeed. In this case
* we set a persistent flag so that we don't bother with
* requests in the future.
*/
switch(bp->bio_cmd) {
case BIO_FLUSH:
vd->vdev_nowritecache = B_TRUE;
break;
case BIO_DELETE:
vd->vdev_notrim = B_TRUE;
break;
}
break;
case ENXIO:
if (!vd->vdev_remove_wanted) {
/*
* If provider's error is set we assume it is being
* removed.
*/
if (bp->bio_to->error != 0) {
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa,
SPA_ASYNC_REMOVE);
} else if (!vd->vdev_delayed_close) {
vd->vdev_delayed_close = B_TRUE;
}
}
break;
}
/*
* We have to split bio freeing into two parts, because the ABD code
* cannot be called in this context and vdev_op_io_done is not called
* for ZIO_TYPE_IOCTL zio-s.
*/
if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
g_destroy_bio(bp);
zio->io_bio = NULL;
}
zio_delay_interrupt(zio);
}
static void
vdev_geom_io_start(zio_t *zio)
{
vdev_t *vd;
struct g_consumer *cp;
struct bio *bp;
int error;
vd = zio->io_vd;
switch (zio->io_type) {
case ZIO_TYPE_IOCTL:
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return;
} else {
switch (zio->io_cmd) {
case DKIOCFLUSHWRITECACHE:
if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
break;
if (vd->vdev_nowritecache) {
zio->io_error = SET_ERROR(ENOTSUP);
break;
}
goto sendreq;
default:
zio->io_error = SET_ERROR(ENOTSUP);
}
}
zio_execute(zio);
return;
case ZIO_TYPE_FREE:
if (vd->vdev_notrim) {
zio->io_error = SET_ERROR(ENOTSUP);
} else if (!vdev_geom_bio_delete_disable) {
goto sendreq;
}
zio_execute(zio);
return;
}
sendreq:
ASSERT(zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_FREE ||
zio->io_type == ZIO_TYPE_IOCTL);
cp = vd->vdev_tsd;
if (cp == NULL) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return;
}
bp = g_alloc_bio();
bp->bio_caller1 = zio;
switch (zio->io_type) {
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
zio->io_target_timestamp = zio_handle_io_delay(zio);
bp->bio_offset = zio->io_offset;
bp->bio_length = zio->io_size;
if (zio->io_type == ZIO_TYPE_READ) {
bp->bio_cmd = BIO_READ;
bp->bio_data =
abd_borrow_buf(zio->io_abd, zio->io_size);
} else {
bp->bio_cmd = BIO_WRITE;
bp->bio_data =
abd_borrow_buf_copy(zio->io_abd, zio->io_size);
}
break;
case ZIO_TYPE_FREE:
bp->bio_cmd = BIO_DELETE;
bp->bio_data = NULL;
bp->bio_offset = zio->io_offset;
bp->bio_length = zio->io_size;
break;
case ZIO_TYPE_IOCTL:
bp->bio_cmd = BIO_FLUSH;
bp->bio_flags |= BIO_ORDERED;
bp->bio_data = NULL;
bp->bio_offset = cp->provider->mediasize;
bp->bio_length = 0;
break;
}
bp->bio_done = vdev_geom_io_intr;
zio->io_bio = bp;
g_io_request(bp, cp);
}
static void
vdev_geom_io_done(zio_t *zio)
{
struct bio *bp = zio->io_bio;
if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
ASSERT(bp == NULL);
return;
}
if (bp == NULL) {
ASSERT3S(zio->io_error, ==, ENXIO);
return;
}
if (zio->io_type == ZIO_TYPE_READ)
abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
else
abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
g_destroy_bio(bp);
zio->io_bio = NULL;
}
static void
vdev_geom_hold(vdev_t *vd)
{
}
static void
vdev_geom_rele(vdev_t *vd)
{
}
vdev_ops_t vdev_geom_ops = {
vdev_geom_open,
vdev_geom_close,
vdev_default_asize,
vdev_geom_io_start,
vdev_geom_io_done,
NULL,
vdev_geom_hold,
vdev_geom_rele,
+ NULL,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c (nonexistent)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c (revision 332525)
@@ -0,0 +1,1037 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/metaslab.h>
+#include <sys/refcount.h>
+#include <sys/dmu.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+
+/*
+ * An indirect vdev corresponds to a vdev that has been removed. Since
+ * we cannot rewrite block pointers of snapshots, etc., we keep a
+ * mapping from old location on the removed device to the new location
+ * on another device in the pool and use this mapping whenever we need
+ * to access the DVA. Unfortunately, this mapping did not respect
+ * logical block boundaries when it was first created, and so a DVA on
+ * this indirect vdev may be "split" into multiple sections that each
+ * map to a different location. As a consequence, not all DVAs can be
+ * translated to an equivalent new DVA. Instead we must provide a
+ * "vdev_remap" operation that executes a callback on each contiguous
+ * segment of the new location. This function is used in multiple ways:
+ *
+ * - reads and repair writes to this device use the callback to create
+ * a child io for each mapped segment.
+ *
+ * - frees and claims to this device use the callback to free or claim
+ * each mapped segment. (Note that we don't actually need to claim
+ * log blocks on indirect vdevs, because we don't allocate to
+ * removing vdevs. However, zdb uses zio_claim() for its leak
+ * detection.)
+ */
+
+/*
+ * "Big theory statement" for how we mark blocks obsolete.
+ *
+ * When a block on an indirect vdev is freed or remapped, a section of
+ * that vdev's mapping may no longer be referenced (aka "obsolete"). We
+ * keep track of how much of each mapping entry is obsolete. When
+ * an entry becomes completely obsolete, we can remove it, thus reducing
+ * the memory used by the mapping. The complete picture of obsolescence
+ * is given by the following data structures, described below:
+ * - the entry-specific obsolete count
+ * - the vdev-specific obsolete spacemap
+ * - the pool-specific obsolete bpobj
+ *
+ * == On disk data structures used ==
+ *
+ * We track the obsolete space for the pool using several objects. Each
+ * of these objects is created on demand and freed when no longer
+ * needed, and is assumed to be empty if it does not exist.
+ * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
+ *
+ * - Each vic_mapping_object (associated with an indirect vdev) can
+ * have a vimp_counts_object. This is an array of uint32_t's
+ * with the same number of entries as the vic_mapping_object. When
+ * the mapping is condensed, entries from the vic_obsolete_sm_object
+ * (see below) are folded into the counts. Therefore, each
+ * obsolete_counts entry tells us the number of bytes in the
+ * corresponding mapping entry that were not referenced when the
+ * mapping was last condensed.
+ *
+ * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
+ * This is a space map containing an alloc entry for every DVA that
+ * has been obsoleted since the last time this indirect vdev was
+ * condensed. We use this object in order to improve performance
+ * when marking a DVA as obsolete. Instead of modifying an arbitrary
+ * offset of the vimp_counts_object, we only need to append an entry
+ * to the end of this object. When a DVA becomes obsolete, it is
+ * added to the obsolete space map. This happens when the DVA is
+ * freed, remapped and not referenced by a snapshot, or the last
+ * snapshot referencing it is destroyed.
+ *
+ * - Each dataset can have a ds_remap_deadlist object. This is a
+ * deadlist object containing all blocks that were remapped in this
+ * dataset but referenced in a previous snapshot. Blocks can *only*
+ * appear on this list if they were remapped (dsl_dataset_block_remapped);
+ * blocks that were killed in a head dataset are put on the normal
+ * ds_deadlist and marked obsolete when they are freed.
+ *
+ * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
+ * in the pool that need to be marked obsolete. When a snapshot is
+ * destroyed, we move some of the ds_remap_deadlist to the obsolete
+ * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
+ * asynchronously process the obsolete bpobj, moving its entries to
+ * the specific vdevs' obsolete space maps.
+ *
+ * == Summary of how we mark blocks as obsolete ==
+ *
+ * - When freeing a block: if any DVA is on an indirect vdev, append to
+ * vic_obsolete_sm_object.
+ * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
+ * references; otherwise append to vic_obsolete_sm_object).
+ * - When freeing a snapshot: move parts of ds_remap_deadlist to
+ * dp_obsolete_bpobj (same algorithm as ds_deadlist).
+ * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
+ * individual vdev's vic_obsolete_sm_object.
+ */
+
+/*
+ * "Big theory statement" for how we condense indirect vdevs.
+ *
+ * Condensing an indirect vdev's mapping is the process of determining
+ * the precise counts of obsolete space for each mapping entry (by
+ * integrating the obsolete spacemap into the obsolete counts) and
+ * writing out a new mapping that contains only referenced entries.
+ *
+ * We condense a vdev when we expect the mapping to shrink (see
+ * vdev_indirect_should_condense()), but only perform one condense at a
+ * time to limit the memory usage. In addition, we use a separate
+ * open-context thread (spa_condense_indirect_thread) to incrementally
+ * create the new mapping object in a way that minimizes the impact on
+ * the rest of the system.
+ *
+ * == Generating a new mapping ==
+ *
+ * To generate a new mapping, we follow these steps:
+ *
+ * 1. Save the old obsolete space map and create a new mapping object
+ * (see spa_condense_indirect_start_sync()). This initializes the
+ * spa_condensing_indirect_phys with the "previous obsolete space map",
+ * which is now read only. Newly obsolete DVAs will be added to a
+ * new (initially empty) obsolete space map, and will not be
+ * considered as part of this condense operation.
+ *
+ * 2. Construct in memory the precise counts of obsolete space for each
+ * mapping entry, by incorporating the obsolete space map into the
+ * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
+ *
+ * 3. Iterate through each mapping entry, writing to the new mapping any
+ * entries that are not completely obsolete (i.e. which don't have
+ * obsolete count == mapping length). (See
+ * spa_condense_indirect_generate_new_mapping().)
+ *
+ * 4. Destroy the old mapping object and switch over to the new one
+ * (spa_condense_indirect_complete_sync).
+ *
+ * == Restarting from failure ==
+ *
+ * To restart the condense when we import/open the pool, we must start
+ * at the 2nd step above: reconstruct the precise counts in memory,
+ * based on the space map + counts. Then in the 3rd step, we start
+ * iterating where we left off: at vimp_max_offset of the new mapping
+ * object.
+ */
+
+boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
+
+/*
+ * Condense if at least this percent of the bytes in the mapping is
+ * obsolete. With the default of 25%, the amount of space mapped
+ * will be reduced to 1% of its original size after at most 16
+ * condenses. Higher values will condense less often (causing less
+ * i/o); lower values will reduce the mapping size more quickly.
+ */
+int zfs_indirect_condense_obsolete_pct = 25;
+
+/*
+ * Condense if the obsolete space map takes up more than this amount of
+ * space on disk (logically). This limits the amount of disk space
+ * consumed by the obsolete space map; the default of 1GB is small enough
+ * that we typically don't mind "wasting" it.
+ */
+uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
+
+/*
+ * Don't bother condensing if the mapping uses less than this amount of
+ * memory. The default of 128KB is considered a "trivial" amount of
+ * memory and not worth reducing.
+ */
+uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a condense (which might otherwise
+ * complete too quickly). If used to reduce the performance impact of
+ * condensing in production, a maximum value of 1 should be sufficient.
+ */
+int zfs_condense_indirect_commit_entry_delay_ticks = 0;
+
+/*
+ * Mark the given offset and size as being obsolete in the given txg.
+ */
+void
+vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
+ uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ ASSERT3U(spa_syncing_txg(spa), ==, txg);
+ ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
+ ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(size > 0);
+ VERIFY(vdev_indirect_mapping_entry_for_offset(
+ vd->vdev_indirect_mapping, offset) != NULL);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ mutex_enter(&vd->vdev_obsolete_lock);
+ range_tree_add(vd->vdev_obsolete_segments, offset, size);
+ mutex_exit(&vd->vdev_obsolete_lock);
+ vdev_dirty(vd, 0, NULL, txg);
+ }
+}
+
+/*
+ * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
+ * wrapper is provided because the DMU does not know about vdev_t's and
+ * cannot directly call vdev_indirect_mark_obsolete.
+ */
+void
+spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /* The DMU can only remap indirect vdevs. */
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
+}
+
+static spa_condensing_indirect_t *
+spa_condensing_indirect_create(spa_t *spa)
+{
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
+ objset_t *mos = spa->spa_meta_objset;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ list_create(&sci->sci_new_mapping_entries[i],
+ sizeof (vdev_indirect_mapping_entry_t),
+ offsetof(vdev_indirect_mapping_entry_t, vime_node));
+ }
+
+ sci->sci_new_mapping =
+ vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
+
+ return (sci);
+}
+
+static void
+spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
+{
+ for (int i = 0; i < TXG_SIZE; i++)
+ list_destroy(&sci->sci_new_mapping_entries[i]);
+
+ if (sci->sci_new_mapping != NULL)
+ vdev_indirect_mapping_close(sci->sci_new_mapping);
+
+ kmem_free(sci, sizeof (*sci));
+}
+
+boolean_t
+vdev_indirect_should_condense(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
+
+ if (!zfs_condense_indirect_vdevs_enable)
+ return (B_FALSE);
+
+ /*
+ * We can only condense one indirect vdev at a time.
+ */
+ if (spa->spa_condensing_indirect != NULL)
+ return (B_FALSE);
+
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+
+ /*
+ * The mapping object size must not change while we are
+ * condensing, so we can only condense indirect vdevs
+ * (not vdevs that are still in the middle of being removed).
+ */
+ if (vd->vdev_ops != &vdev_indirect_ops)
+ return (B_FALSE);
+
+ /*
+ * If nothing new has been marked obsolete, there is no
+ * point in condensing.
+ */
+ if (vd->vdev_obsolete_sm == NULL) {
+ ASSERT0(vdev_obsolete_sm_object(vd));
+ return (B_FALSE);
+ }
+
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+
+ ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
+ uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
+ uint64_t mapping_size = vdev_indirect_mapping_size(vim);
+ uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
+
+ ASSERT3U(bytes_obsolete, <=, bytes_mapped);
+
+ /*
+ * If a high percentage of the bytes that are mapped have become
+ * obsolete, condense (unless the mapping is already small enough).
+ * This has a good chance of reducing the amount of memory used
+ * by the mapping.
+ */
+ if (bytes_obsolete * 100 / bytes_mapped >=
+ zfs_indirect_condense_obsolete_pct &&
+ mapping_size > zfs_condense_min_mapping_bytes) {
+ zfs_dbgmsg("should condense vdev %llu because obsolete "
+ "spacemap covers %d%% of %lluMB mapping",
+ (u_longlong_t)vd->vdev_id,
+ (int)(bytes_obsolete * 100 / bytes_mapped),
+ (u_longlong_t)bytes_mapped / 1024 / 1024);
+ return (B_TRUE);
+ }
+
+ /*
+ * If the obsolete space map takes up too much space on disk,
+ * condense in order to free up this disk space.
+ */
+ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
+ zfs_dbgmsg("should condense vdev %llu because obsolete sm "
+ "length %lluMB >= max size %lluMB",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)obsolete_sm_size / 1024 / 1024,
+ (u_longlong_t)zfs_condense_max_obsolete_bytes /
+ 1024 / 1024);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * This sync task completes (finishes) a condense, deleting the old
+ * mapping and replacing it with the new one.
+ */
+static void
+spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_condensing_indirect_t *sci = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
+ uint64_t new_count =
+ vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT3P(sci, ==, spa->spa_condensing_indirect);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
+ }
+ ASSERT(vic->vic_mapping_object != 0);
+ ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
+ ASSERT(scip->scip_next_mapping_object != 0);
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+
+ /*
+ * Reset vdev_indirect_mapping to refer to the new object.
+ */
+ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vd->vdev_indirect_mapping = sci->sci_new_mapping;
+ rw_exit(&vd->vdev_indirect_rwlock);
+
+ sci->sci_new_mapping = NULL;
+ vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
+ vic->vic_mapping_object = scip->scip_next_mapping_object;
+ scip->scip_next_mapping_object = 0;
+
+ space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ scip->scip_prev_obsolete_sm_object = 0;
+
+ scip->scip_vdev = 0;
+
+ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, tx));
+ spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
+ spa->spa_condensing_indirect = NULL;
+
+ zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
+ "new mapping object %llu has %llu entries "
+ "(was %llu entries)",
+ vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
+ new_count, old_count);
+
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+/*
+ * This sync task appends entries to the new mapping object.
+ */
+static void
+spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_condensing_indirect_t *sci = arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(sci, ==, spa->spa_condensing_indirect);
+
+ vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
+ &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
+}
+
+/*
+ * Open-context function to add one entry to the new mapping. The new
+ * entry will be remembered and written from syncing context.
+ */
+static void
+spa_condense_indirect_commit_entry(spa_t *spa,
+ vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
+{
+ spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
+
+ ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ /*
+ * If we are the first entry committed this txg, kick off the sync
+ * task to write to the MOS on our behalf.
+ */
+ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx),
+ spa_condense_indirect_commit_sync, sci,
+ 0, ZFS_SPACE_CHECK_NONE, tx);
+ }
+
+ vdev_indirect_mapping_entry_t *vime =
+ kmem_alloc(sizeof (*vime), KM_SLEEP);
+ vime->vime_mapping = *vimep;
+ vime->vime_obsolete_count = count;
+ list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
+
+ dmu_tx_commit(tx);
+}
+
+static void
+spa_condense_indirect_generate_new_mapping(vdev_t *vd,
+ uint32_t *obsolete_counts, uint64_t start_index)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t mapi = start_index;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ uint64_t old_num_entries =
+ vdev_indirect_mapping_num_entries(old_mapping);
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
+
+ zfs_dbgmsg("starting condense of vdev %llu from index %llu",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)mapi);
+
+ while (mapi < old_num_entries && !spa_shutting_down(spa)) {
+ vdev_indirect_mapping_entry_phys_t *entry =
+ &old_mapping->vim_entries[mapi];
+ uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
+ ASSERT3U(obsolete_counts[mapi], <=, entry_size);
+ if (obsolete_counts[mapi] < entry_size) {
+ spa_condense_indirect_commit_entry(spa, entry,
+ obsolete_counts[mapi]);
+
+ /*
+ * This delay may be requested for testing, debugging,
+ * or performance reasons.
+ */
+ delay(zfs_condense_indirect_commit_entry_delay_ticks);
+ }
+
+ mapi++;
+ }
+ if (spa_shutting_down(spa)) {
+ zfs_dbgmsg("pausing condense of vdev %llu at index %llu",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)mapi);
+ }
+}
+
+static void
+spa_condense_indirect_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ uint32_t *counts;
+ uint64_t start_index;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ space_map_t *prev_obsolete_sm = NULL;
+
+ ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
+ ASSERT(scip->scip_next_mapping_object != 0);
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ /*
+ * The list must start out empty in order for the
+ * _commit_sync() sync task to be properly registered
+ * on the first call to _commit_entry(); so it's wise
+ * to double check and ensure we actually are starting
+ * with empty lists.
+ */
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
+ }
+
+ VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
+ scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
+ space_map_update(prev_obsolete_sm);
+ counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
+ if (prev_obsolete_sm != NULL) {
+ vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
+ counts, prev_obsolete_sm);
+ }
+ space_map_close(prev_obsolete_sm);
+
+ /*
+ * Generate new mapping. Determine what index to continue from
+ * based on the max offset that we've already written in the
+ * new mapping.
+ */
+ uint64_t max_offset =
+ vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
+ if (max_offset == 0) {
+ /* We haven't written anything to the new mapping yet. */
+ start_index = 0;
+ } else {
+ /*
+ * Pick up from where we left off. _entry_for_offset()
+ * returns a pointer into the vim_entries array. If
+ * max_offset is greater than any of the mappings
+ * contained in the table NULL will be returned and
+ * that indicates we've exhausted our iteration of the
+ * old_mapping.
+ */
+
+ vdev_indirect_mapping_entry_phys_t *entry =
+ vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
+ max_offset);
+
+ if (entry == NULL) {
+ /*
+ * We've already written the whole new mapping.
+ * This special value will cause us to skip the
+ * generate_new_mapping step and just do the sync
+ * task to complete the condense.
+ */
+ start_index = UINT64_MAX;
+ } else {
+ start_index = entry - old_mapping->vim_entries;
+ ASSERT3U(start_index, <,
+ vdev_indirect_mapping_num_entries(old_mapping));
+ }
+ }
+
+ spa_condense_indirect_generate_new_mapping(vd, counts, start_index);
+
+ vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
+
+ /*
+ * We may have bailed early from generate_new_mapping(), if
+ * the spa is shutting down. In this case, do not complete
+ * the condense.
+ */
+ if (!spa_shutting_down(spa)) {
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ spa_condense_indirect_complete_sync, sci, 0,
+ ZFS_SPACE_CHECK_NONE));
+ }
+
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_condense_thread = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+/*
+ * Sync task to begin the condensing process.
+ */
+void
+spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+
+ ASSERT0(scip->scip_next_mapping_object);
+ ASSERT0(scip->scip_prev_obsolete_sm_object);
+ ASSERT0(scip->scip_vdev);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
+ ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
+
+ uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
+ ASSERT(obsolete_sm_obj != 0);
+
+ scip->scip_vdev = vd->vdev_id;
+ scip->scip_next_mapping_object =
+ vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
+
+ scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
+
+ /*
+ * We don't need to allocate a new space map object, since
+ * vdev_indirect_sync_obsolete will allocate one when needed.
+ */
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
+
+ VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
+ sizeof (*scip) / sizeof (uint64_t), scip, tx));
+
+ ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
+ spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
+
+ zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
+ "posm=%llu nm=%llu",
+ vd->vdev_id, dmu_tx_get_txg(tx),
+ (u_longlong_t)scip->scip_prev_obsolete_sm_object,
+ (u_longlong_t)scip->scip_next_mapping_object);
+
+ ASSERT3P(spa->spa_condense_thread, ==, NULL);
+ spa->spa_condense_thread = thread_create(NULL, 0,
+ spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * Sync to the given vdev's obsolete space map any segments that are no longer
+ * referenced as of the given txg.
+ *
+ * If the obsolete space map doesn't exist yet, create and open it.
+ */
+void
+vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT3U(vic->vic_mapping_object, !=, 0);
+ ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
+ ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
+
+ if (vdev_obsolete_sm_object(vd) == 0) {
+ uint64_t obsolete_sm_object =
+ space_map_alloc(spa->spa_meta_objset, tx);
+
+ ASSERT(vd->vdev_top_zap != 0);
+ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
+ sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
+ ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);
+
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
+ spa->spa_meta_objset, obsolete_sm_object,
+ 0, vd->vdev_asize, 0));
+ space_map_update(vd->vdev_obsolete_sm);
+ }
+
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ space_map_write(vd->vdev_obsolete_sm,
+ vd->vdev_obsolete_segments, SM_ALLOC, tx);
+ space_map_update(vd->vdev_obsolete_sm);
+ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
+}
+
+int
+spa_condense_init(spa_t *spa)
+{
+ int error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
+ sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
+ &spa->spa_condensing_indirect_phys);
+ if (error == 0) {
+ if (spa_writeable(spa)) {
+ spa->spa_condensing_indirect =
+ spa_condensing_indirect_create(spa);
+ }
+ return (0);
+ } else if (error == ENOENT) {
+ return (0);
+ } else {
+ return (error);
+ }
+}
+
+void
+spa_condense_fini(spa_t *spa)
+{
+ if (spa->spa_condensing_indirect != NULL) {
+ spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
+ spa->spa_condensing_indirect = NULL;
+ }
+}
+
+/*
+ * Restart the condense - called when the pool is opened.
+ */
+void
+spa_condense_indirect_restart(spa_t *spa)
+{
+ vdev_t *vd;
+ ASSERT(spa->spa_condensing_indirect != NULL);
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa,
+ spa->spa_condensing_indirect_phys.scip_vdev);
+ ASSERT(vd != NULL);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ ASSERT3P(spa->spa_condense_thread, ==, NULL);
+ spa->spa_condense_thread = thread_create(NULL, 0,
+ spa_condense_indirect_thread, vd, 0, &p0, TS_RUN,
+ minclsyspri);
+}
+
+/*
+ * Gets the obsolete spacemap object from the vdev's ZAP.
+ * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
+ * exist yet.
+ */
+int
+vdev_obsolete_sm_object(vdev_t *vd)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+ if (vd->vdev_top_zap == 0) {
+ return (0);
+ }
+
+ uint64_t sm_obj = 0;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);
+
+ ASSERT(err == 0 || err == ENOENT);
+
+ return (sm_obj);
+}
+
+boolean_t
+vdev_obsolete_counts_are_precise(vdev_t *vd)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+ if (vd->vdev_top_zap == 0) {
+ return (B_FALSE);
+ }
+
+ uint64_t val = 0;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
+
+ ASSERT(err == 0 || err == ENOENT);
+
+ return (val != 0);
+}
+
+/* ARGSUSED */
+static void
+vdev_indirect_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+static void
+vdev_indirect_io_done(zio_t *zio)
+{
+}
+
+/* ARGSUSED */
+static int
+vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ *psize = *max_psize = vd->vdev_asize +
+ VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ *logical_ashift = vd->vdev_ashift;
+ *physical_ashift = vd->vdev_physical_ashift;
+ return (0);
+}
+
+typedef struct remap_segment {
+ vdev_t *rs_vd;
+ uint64_t rs_offset;
+ uint64_t rs_asize;
+ uint64_t rs_split_offset;
+ list_node_t rs_node;
+} remap_segment_t;
+
+remap_segment_t *
+rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
+{
+ remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
+ rs->rs_vd = vd;
+ rs->rs_offset = offset;
+ rs->rs_asize = asize;
+ rs->rs_split_offset = split_offset;
+ return (rs);
+}
+
+/*
+ * Goes through the relevant indirect mappings until it hits a concrete vdev
+ * and issues the callback. On the way to the concrete vdev, if any other
+ * indirect vdevs are encountered, then the callback will also be called on
+ * each of those indirect vdevs. For example, if the segment is mapped to
+ * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
+ * mapped to segment B on concrete vdev 2, then the callback will be called on
+ * both vdev 1 and vdev 2.
+ *
+ * While the callback passed to vdev_indirect_remap() is called on every vdev
+ * the function encounters, certain callbacks only care about concrete vdevs.
+ * These types of callbacks should return immediately and explicitly when they
+ * are called on an indirect vdev.
+ *
+ * Because there is a possibility that a DVA section in the indirect device
+ * has been split into multiple sections in our mapping, we keep track
+ * of the relevant contiguous segments of the new location (remap_segment_t)
+ * in a stack. This way we can call the callback for each of the new sections
+ * created by a single section of the indirect device. Note though, that in
+ * this scenario the callbacks in each split block won't occur in-order in
+ * terms of offset, so callers should not make any assumptions about that.
+ *
+ * For callbacks that don't handle split blocks and immediately return when
+ * they encounter them (as is the case for remap_blkptr_cb), the caller can
+ * assume that its callback will be applied from the first indirect vdev
+ * encountered to the last one and then the concrete vdev, in that order.
+ */
+static void
+vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
+ void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
+{
+ list_t stack;
+ spa_t *spa = vd->vdev_spa;
+
+ list_create(&stack, sizeof (remap_segment_t),
+ offsetof(remap_segment_t, rs_node));
+
+ for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
+ rs != NULL; rs = list_remove_head(&stack)) {
+ vdev_t *v = rs->rs_vd;
+
+ /*
+ * Note: this can be called from open context
+ * (eg. zio_read()), so we need the rwlock to prevent
+ * the mapping from being changed by condensing.
+ */
+ rw_enter(&v->vdev_indirect_rwlock, RW_READER);
+ vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
+ ASSERT3P(vim, !=, NULL);
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+ ASSERT(rs->rs_asize > 0);
+
+ vdev_indirect_mapping_entry_phys_t *mapping =
+ vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
+ ASSERT3P(mapping, !=, NULL);
+
+ while (rs->rs_asize > 0) {
+ /*
+ * Note: the vdev_indirect_mapping can not change
+ * while we are running. It only changes while the
+ * removal is in progress, and then only from syncing
+ * context. While a removal is in progress, this
+ * function is only called for frees, which also only
+ * happen from syncing context.
+ */
+
+ uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
+ uint64_t dst_offset =
+ DVA_GET_OFFSET(&mapping->vimep_dst);
+ uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);
+
+ ASSERT3U(rs->rs_offset, >=,
+ DVA_MAPPING_GET_SRC_OFFSET(mapping));
+ ASSERT3U(rs->rs_offset, <,
+ DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
+ ASSERT3U(dst_vdev, !=, v->vdev_id);
+
+ uint64_t inner_offset = rs->rs_offset -
+ DVA_MAPPING_GET_SRC_OFFSET(mapping);
+ uint64_t inner_size =
+ MIN(rs->rs_asize, size - inner_offset);
+
+ vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
+ ASSERT3P(dst_v, !=, NULL);
+
+ if (dst_v->vdev_ops == &vdev_indirect_ops) {
+ list_insert_head(&stack,
+ rs_alloc(dst_v, dst_offset + inner_offset,
+ inner_size, rs->rs_split_offset));
+
+ }
+
+ if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
+ IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
+ /*
+ * Note: This clause exists only solely for
+ * testing purposes. We use it to ensure that
+ * split blocks work and that the callbacks
+ * using them yield the same result if issued
+ * in reverse order.
+ */
+ uint64_t inner_half = inner_size / 2;
+
+ func(rs->rs_split_offset + inner_half, dst_v,
+ dst_offset + inner_offset + inner_half,
+ inner_half, arg);
+
+ func(rs->rs_split_offset, dst_v,
+ dst_offset + inner_offset,
+ inner_half, arg);
+ } else {
+ func(rs->rs_split_offset, dst_v,
+ dst_offset + inner_offset,
+ inner_size, arg);
+ }
+
+ rs->rs_offset += inner_size;
+ rs->rs_asize -= inner_size;
+ rs->rs_split_offset += inner_size;
+ mapping++;
+ }
+
+ rw_exit(&v->vdev_indirect_rwlock);
+ kmem_free(rs, sizeof (remap_segment_t));
+ }
+ list_destroy(&stack);
+}
+
+static void
+vdev_indirect_child_io_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_private;
+
+ mutex_enter(&pio->io_lock);
+ pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
+ mutex_exit(&pio->io_lock);
+
+ abd_put(zio->io_abd);
+}
+
+static void
+vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ zio_t *zio = arg;
+
+ ASSERT3P(vd, !=, NULL);
+
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ return;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
+ abd_get_offset(zio->io_abd, split_offset),
+ size, zio->io_type, zio->io_priority,
+ 0, vdev_indirect_child_io_done, zio));
+}
+
+static void
+vdev_indirect_io_start(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+ if (zio->io_type != ZIO_TYPE_READ) {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT((zio->io_flags &
+ (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
+ }
+
+ vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
+ vdev_indirect_io_start_cb, zio);
+
+ zio_execute(zio);
+}
+
+vdev_ops_t vdev_indirect_ops = {
+ vdev_indirect_open,
+ vdev_indirect_close,
+ vdev_default_asize,
+ vdev_indirect_io_start,
+ vdev_indirect_io_done,
+ NULL,
+ NULL,
+ NULL,
+ vdev_indirect_remap,
+ VDEV_TYPE_INDIRECT, /* name of this vdev type */
+ B_FALSE /* leaf vdev */
+};
Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c (nonexistent)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c (revision 332525)
@@ -0,0 +1,212 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/vdev_indirect_births.h>
+
+static boolean_t
+vdev_indirect_births_verify(vdev_indirect_births_t *vib)
+{
+ ASSERT(vib != NULL);
+
+ ASSERT(vib->vib_object != 0);
+ ASSERT(vib->vib_objset != NULL);
+ ASSERT(vib->vib_phys != NULL);
+ ASSERT(vib->vib_dbuf != NULL);
+
+ EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL);
+
+ return (B_TRUE);
+}
+
+uint64_t
+vdev_indirect_births_count(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib->vib_phys->vib_count);
+}
+
+uint64_t
+vdev_indirect_births_object(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib->vib_object);
+}
+
+static uint64_t
+vdev_indirect_births_size_impl(vdev_indirect_births_t *vib)
+{
+ return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries));
+}
+
+void
+vdev_indirect_births_close(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ if (vib->vib_phys->vib_count > 0) {
+ uint64_t births_size = vdev_indirect_births_size_impl(vib);
+
+ kmem_free(vib->vib_entries, births_size);
+ vib->vib_entries = NULL;
+ }
+
+ dmu_buf_rele(vib->vib_dbuf, vib);
+
+ vib->vib_objset = NULL;
+ vib->vib_object = 0;
+ vib->vib_dbuf = NULL;
+ vib->vib_phys = NULL;
+
+ kmem_free(vib, sizeof (*vib));
+}
+
+uint64_t
+vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ return (dmu_object_alloc(os,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t),
+ tx));
+}
+
+vdev_indirect_births_t *
+vdev_indirect_births_open(objset_t *os, uint64_t births_object)
+{
+ vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
+
+ vib->vib_objset = os;
+ vib->vib_object = births_object;
+
+ VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf));
+ vib->vib_phys = vib->vib_dbuf->db_data;
+
+ if (vib->vib_phys->vib_count > 0) {
+ uint64_t births_size = vdev_indirect_births_size_impl(vib);
+ vib->vib_entries = kmem_alloc(births_size, KM_SLEEP);
+ VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0,
+ births_size, vib->vib_entries, DMU_READ_PREFETCH));
+ }
+
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib);
+}
+
+void
+vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ VERIFY0(dmu_object_free(os, object, tx));
+}
+
+void
+vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
+ uint64_t max_offset, uint64_t txg, dmu_tx_t *tx)
+{
+ vdev_indirect_birth_entry_phys_t vibe;
+ uint64_t old_size;
+ uint64_t new_size;
+ vdev_indirect_birth_entry_phys_t *new_entries;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ dmu_buf_will_dirty(vib->vib_dbuf, tx);
+
+ vibe.vibe_offset = max_offset;
+ vibe.vibe_phys_birth_txg = txg;
+
+ old_size = vdev_indirect_births_size_impl(vib);
+ dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe),
+ &vibe, tx);
+ vib->vib_phys->vib_count++;
+ new_size = vdev_indirect_births_size_impl(vib);
+
+ new_entries = kmem_alloc(new_size, KM_SLEEP);
+ if (old_size > 0) {
+ bcopy(vib->vib_entries, new_entries, old_size);
+ kmem_free(vib->vib_entries, old_size);
+ }
+ new_entries[vib->vib_phys->vib_count - 1] = vibe;
+ vib->vib_entries = new_entries;
+}
+
+uint64_t
+vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+ ASSERT(vib->vib_phys->vib_count > 0);
+
+ vdev_indirect_birth_entry_phys_t *last =
+ &vib->vib_entries[vib->vib_phys->vib_count - 1];
+ return (last->vibe_phys_birth_txg);
+}
+
+/*
+ * Return the txg in which the given range was copied (i.e. its physical
+ * birth txg). The specified offset+asize must be contiguously mapped
+ * (i.e. not a split block).
+ *
+ * The entries are sorted by increasing phys_birth, and also by increasing
+ * offset. We find the specified offset by binary search. Note that we
+ * can not use bsearch() because looking at each entry independently is
+ * insufficient to find the correct entry. Each entry implicitly relies
+ * on the previous entry: an entry indicates that the offsets from the
+ * end of the previous entry to the end of this entry were written in the
+ * specified txg.
+ */
+uint64_t
+vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset,
+ uint64_t asize)
+{
+ vdev_indirect_birth_entry_phys_t *base;
+ vdev_indirect_birth_entry_phys_t *last;
+
+ ASSERT(vdev_indirect_births_verify(vib));
+ ASSERT(vib->vib_phys->vib_count > 0);
+
+ base = vib->vib_entries;
+ last = base + vib->vib_phys->vib_count - 1;
+
+ ASSERT3U(offset, <, last->vibe_offset);
+
+ while (last >= base) {
+ vdev_indirect_birth_entry_phys_t *p =
+ base + ((last - base) / 2);
+ if (offset >= p->vibe_offset) {
+ base = p + 1;
+ } else if (p == vib->vib_entries ||
+ offset >= (p - 1)->vibe_offset) {
+ ASSERT3U(offset + asize, <=, p->vibe_offset);
+ return (p->vibe_phys_birth_txg);
+ } else {
+ last = p - 1;
+ }
+ }
+ ASSERT(!"offset not found");
+ return (-1);
+}
Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c (nonexistent)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c (revision 332525)
@@ -0,0 +1,594 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/zfeature.h>
+#include <sys/dmu_objset.h>
+
+static boolean_t
+vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vim != NULL);
+
+ ASSERT(vim->vim_object != 0);
+ ASSERT(vim->vim_objset != NULL);
+ ASSERT(vim->vim_phys != NULL);
+ ASSERT(vim->vim_dbuf != NULL);
+
+ EQUIV(vim->vim_phys->vimp_num_entries > 0,
+ vim->vim_entries != NULL);
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ vdev_indirect_mapping_entry_phys_t *last_entry =
+ &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
+ uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry);
+ uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst);
+
+ ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
+ }
+ if (vim->vim_havecounts) {
+ ASSERT(vim->vim_phys->vimp_counts_object != 0);
+ }
+
+ return (B_TRUE);
+}
+
+uint64_t
+vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_num_entries);
+}
+
+uint64_t
+vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_max_offset);
+}
+
+uint64_t
+vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_object);
+}
+
+uint64_t
+vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_bytes_mapped);
+}
+
+/*
+ * The length (in bytes) of the mapping object array in memory and
+ * (logically) on disk.
+ *
+ * Note that unlike most of our accessor functions,
+ * we don't assert that the struct is consistent; therefore it can be
+ * called while there may be concurrent changes, if we don't care about
+ * the value being immediately stale (e.g. from spa_removal_get_stats()).
+ */
+uint64_t
+vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
+{
+ return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
+}
+
+/*
+ * Compare an offset with an indirect mapping entry; there are three
+ * possible scenarios:
+ *
+ * 1. The offset is "less than" the mapping entry; meaning the
+ * offset is less than the source offset of the mapping entry. In
+ * this case, there is no overlap between the offset and the
+ * mapping entry and -1 will be returned.
+ *
+ * 2. The offset is "greater than" the mapping entry; meaning the
+ * offset is greater than the mapping entry's source offset plus
+ * the entry's size. In this case, there is no overlap between
+ * the offset and the mapping entry and 1 will be returned.
+ *
+ * NOTE: If the offset is actually equal to the entry's offset
+ * plus size, this is considered to be "greater" than the entry,
+ * and this case applies (i.e. 1 will be returned). Thus, the
+ * entry's "range" can be considered to be inclusive at its
+ * start, but exclusive at its end: e.g. [src, src + size).
+ *
+ * 3. The last case to consider is if the offset actually falls
+ * within the mapping entry's range. If this is the case, the
+ * offset is considered to be "equal to" the mapping entry and
+ * 0 will be returned.
+ *
+ * NOTE: If the offset is equal to the entry's source offset,
+ * this case applies and 0 will be returned. If the offset is
+ * equal to the entry's source plus its size, this case does
+ * *not* apply (see "NOTE" above for scenario 2), and 1 will be
+ * returned.
+ */
+static int
+dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
+{
+ const uint64_t *key = v_key;
+ const vdev_indirect_mapping_entry_phys_t *array_elem =
+ v_array_elem;
+ uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
+
+ if (*key < src_offset) {
+ return (-1);
+ } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
+ return (0);
+ } else {
+ return (1);
+ }
+}
+
+/*
+ * Returns the mapping entry for the given offset.
+ *
+ * It's possible that the given offset will not be in the mapping table
+ * (i.e. no mapping entries contain this offset), in which case, the
+ * return value value depends on the "next_if_missing" parameter.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_FALSE, then NULL will always be returned. The behavior is intended
+ * to allow consumers to get the entry corresponding to the offset
+ * parameter, iff the offset overlaps with an entry in the table.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_TRUE, then the entry nearest to the given offset will be returned,
+ * such that the entry's source offset is greater than the offset
+ * passed in (i.e. the "next" mapping entry in the table is returned, if
+ * the offset is missing from the table). If there are no entries whose
+ * source offset is greater than the passed in offset, NULL is returned.
+ */
+static vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
+ uint64_t offset, boolean_t next_if_missing)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+ ASSERT(vim->vim_phys->vimp_num_entries > 0);
+
+ vdev_indirect_mapping_entry_phys_t *entry = NULL;
+
+ uint64_t last = vim->vim_phys->vimp_num_entries - 1;
+ uint64_t base = 0;
+
+ /*
+ * We don't define these inside of the while loop because we use
+ * their value in the case that offset isn't in the mapping.
+ */
+ uint64_t mid;
+ int result;
+
+ while (last >= base) {
+ mid = base + ((last - base) >> 1);
+
+ result = dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[mid]);
+
+ if (result == 0) {
+ entry = &vim->vim_entries[mid];
+ break;
+ } else if (result < 0) {
+ last = mid - 1;
+ } else {
+ base = mid + 1;
+ }
+ }
+
+ if (entry == NULL && next_if_missing) {
+ ASSERT3U(base, ==, last + 1);
+ ASSERT(mid == base || mid == last);
+ ASSERT3S(result, !=, 0);
+
+ /*
+ * The offset we're looking for isn't actually contained
+ * in the mapping table, thus we need to return the
+ * closest mapping entry that is greater than the
+ * offset. We reuse the result of the last comparison,
+ * comparing the mapping entry at index "mid" and the
+ * offset. The offset is guaranteed to lie between
+ * indices one less than "mid", and one greater than
+ * "mid"; we just need to determine if offset is greater
+ * than, or less than the mapping entry contained at
+ * index "mid".
+ */
+
+ uint64_t index;
+ if (result < 0)
+ index = mid;
+ else
+ index = mid + 1;
+
+ ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
+
+ if (index == vim->vim_phys->vimp_num_entries) {
+ /*
+ * If "index" is past the end of the entries
+ * array, then not only is the offset not in the
+ * mapping table, but it's actually greater than
+ * all entries in the table. In this case, we
+ * can't return a mapping entry greater than the
+ * offset (since none exist), so we return NULL.
+ */
+
+ ASSERT3S(dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index - 1]), >, 0);
+
+ return (NULL);
+ } else {
+ /*
+ * Just to be safe, we verify the offset falls
+ * in between the mapping entries at index and
+ * one less than index. Since we know the offset
+ * doesn't overlap an entry, and we're supposed
+ * to return the entry just greater than the
+ * offset, both of the following tests must be
+ * true.
+ */
+ ASSERT3S(dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index]), <, 0);
+ IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index - 1]) > 0);
+
+ return (&vim->vim_entries[index]);
+ }
+ } else {
+ return (entry);
+ }
+}
+
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
+ uint64_t offset)
+{
+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
+ B_FALSE));
+}
+
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
+ uint64_t offset)
+{
+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
+ B_TRUE));
+}
+
+
+void
+vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ uint64_t map_size = vdev_indirect_mapping_size(vim);
+ kmem_free(vim->vim_entries, map_size);
+ vim->vim_entries = NULL;
+ }
+
+ dmu_buf_rele(vim->vim_dbuf, vim);
+
+ vim->vim_objset = NULL;
+ vim->vim_object = 0;
+ vim->vim_dbuf = NULL;
+ vim->vim_phys = NULL;
+
+ kmem_free(vim, sizeof (*vim));
+}
+
+uint64_t
+vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ uint64_t object;
+ ASSERT(dmu_tx_is_syncing(tx));
+ uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
+
+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ bonus_size = sizeof (vdev_indirect_mapping_phys_t);
+ }
+
+ object = dmu_object_alloc(os,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, bonus_size,
+ tx);
+
+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ dmu_buf_t *dbuf;
+ vdev_indirect_mapping_phys_t *vimp;
+
+ VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ vimp = dbuf->db_data;
+ vimp->vimp_counts_object = dmu_object_alloc(os,
+ DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ dmu_buf_rele(dbuf, FTAG);
+ }
+
+ return (object);
+}
+
+
+vdev_indirect_mapping_t *
+vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
+{
+ vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(os, mapping_object, &doi));
+
+ vim->vim_objset = os;
+ vim->vim_object = mapping_object;
+
+ VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
+ &vim->vim_dbuf));
+ vim->vim_phys = vim->vim_dbuf->db_data;
+
+ vim->vim_havecounts =
+ (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
+
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ uint64_t map_size = vdev_indirect_mapping_size(vim);
+ vim->vim_entries = kmem_alloc(map_size, KM_SLEEP);
+ VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
+ vim->vim_entries, DMU_READ_PREFETCH));
+ }
+
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim);
+}
+
+void
+vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
+ if (vim->vim_havecounts) {
+ VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
+ tx));
+ spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ }
+ vdev_indirect_mapping_close(vim);
+
+ VERIFY0(dmu_object_free(os, object, tx));
+}
+
+/*
+ * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
+ * mapping object. Also remove the entries from the list and free them.
+ * This also implicitly extends the max_offset of the mapping (to the end
+ * of the last entry).
+ */
+void
+vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
+ list_t *list, dmu_tx_t *tx)
+{
+ vdev_indirect_mapping_entry_phys_t *mapbuf;
+ uint64_t old_size;
+ uint32_t *countbuf = NULL;
+ vdev_indirect_mapping_entry_phys_t *old_entries;
+ uint64_t old_count;
+ uint64_t entries_written = 0;
+
+ ASSERT(vdev_indirect_mapping_verify(vim));
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
+ ASSERT(!list_is_empty(list));
+
+ old_size = vdev_indirect_mapping_size(vim);
+ old_entries = vim->vim_entries;
+ old_count = vim->vim_phys->vimp_num_entries;
+
+ dmu_buf_will_dirty(vim->vim_dbuf, tx);
+
+ mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
+ if (vim->vim_havecounts) {
+ countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
+ ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
+ SPA_FEATURE_OBSOLETE_COUNTS));
+ }
+ while (!list_is_empty(list)) {
+ uint64_t i;
+ /*
+ * Write entries from the list to the
+ * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
+ */
+ for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
+ vdev_indirect_mapping_entry_t *entry =
+ list_remove_head(list);
+ if (entry == NULL)
+ break;
+
+ uint64_t size =
+ DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
+ uint64_t src_offset =
+ DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);
+
+ /*
+ * We shouldn't be adding an entry which is fully
+ * obsolete.
+ */
+ ASSERT3U(entry->vime_obsolete_count, <, size);
+ IMPLY(entry->vime_obsolete_count != 0,
+ vim->vim_havecounts);
+
+ mapbuf[i] = entry->vime_mapping;
+ if (vim->vim_havecounts)
+ countbuf[i] = entry->vime_obsolete_count;
+
+ vim->vim_phys->vimp_bytes_mapped += size;
+ ASSERT3U(src_offset, >=,
+ vim->vim_phys->vimp_max_offset);
+ vim->vim_phys->vimp_max_offset = src_offset + size;
+
+ entries_written++;
+
+ kmem_free(entry, sizeof (*entry));
+ }
+ dmu_write(vim->vim_objset, vim->vim_object,
+ vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
+ i * sizeof (*mapbuf),
+ mapbuf, tx);
+ if (vim->vim_havecounts) {
+ dmu_write(vim->vim_objset,
+ vim->vim_phys->vimp_counts_object,
+ vim->vim_phys->vimp_num_entries *
+ sizeof (*countbuf),
+ i * sizeof (*countbuf), countbuf, tx);
+ }
+ vim->vim_phys->vimp_num_entries += i;
+ }
+ zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
+ if (vim->vim_havecounts)
+ zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
+
+ /*
+ * Update the entry array to reflect the new entries. First, copy
+ * over any old entries then read back the new entries we just wrote.
+ */
+ uint64_t new_size = vdev_indirect_mapping_size(vim);
+ ASSERT3U(new_size, >, old_size);
+ ASSERT3U(new_size - old_size, ==,
+ entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
+ vim->vim_entries = kmem_alloc(new_size, KM_SLEEP);
+ if (old_size > 0) {
+ bcopy(old_entries, vim->vim_entries, old_size);
+ kmem_free(old_entries, old_size);
+ }
+ VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
+ new_size - old_size, &vim->vim_entries[old_count],
+ DMU_READ_PREFETCH));
+
+ zfs_dbgmsg("txg %llu: wrote %llu entries to "
+ "indirect mapping obj %llu; max offset=0x%llx",
+ (u_longlong_t)dmu_tx_get_txg(tx),
+ (u_longlong_t)entries_written,
+ (u_longlong_t)vim->vim_object,
+ (u_longlong_t)vim->vim_phys->vimp_max_offset);
+}
+
+/*
+ * Increment the relevant counts for the specified offset and length.
+ * The counts array must be obtained from
+ * vdev_indirect_mapping_load_obsolete_counts().
+ */
+void
+vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
+ uint64_t offset, uint64_t length, uint32_t *counts)
+{
+ vdev_indirect_mapping_entry_phys_t *mapping;
+ uint64_t index;
+
+ mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);
+
+ ASSERT(length > 0);
+ ASSERT3P(mapping, !=, NULL);
+
+ index = mapping - vim->vim_entries;
+
+ while (length > 0) {
+ ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
+
+ uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
+ uint64_t inner_offset = offset -
+ DVA_MAPPING_GET_SRC_OFFSET(mapping);
+ VERIFY3U(inner_offset, <, size);
+ uint64_t inner_size = MIN(length, size - inner_offset);
+
+ VERIFY3U(counts[index] + inner_size, <=, size);
+ counts[index] += inner_size;
+
+ offset += inner_size;
+ length -= inner_size;
+ mapping++;
+ index++;
+ }
+}
+
+typedef struct load_obsolete_space_map_arg {
+ vdev_indirect_mapping_t *losma_vim;
+ uint32_t *losma_counts;
+} load_obsolete_space_map_arg_t;
+
+static int
+load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ load_obsolete_space_map_arg_t *losma = arg;
+ ASSERT3S(type, ==, SM_ALLOC);
+
+ vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
+ offset, size, losma->losma_counts);
+
+ return (0);
+}
+
+/*
+ * Modify the counts (increment them) based on the spacemap.
+ */
+void
+vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
+ uint32_t *counts, space_map_t *obsolete_space_sm)
+{
+ load_obsolete_space_map_arg_t losma;
+ losma.losma_counts = counts;
+ losma.losma_vim = vim;
+ VERIFY0(space_map_iterate(obsolete_space_sm,
+ load_obsolete_sm_callback, &losma));
+}
+
+/*
+ * Read the obsolete counts from disk, returning them in an array.
+ */
+uint32_t *
+vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ uint64_t counts_size =
+ vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
+ uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP);
+ if (vim->vim_havecounts) {
+ VERIFY0(dmu_read(vim->vim_objset,
+ vim->vim_phys->vimp_counts_object,
+ 0, counts_size,
+ counts, DMU_READ_PREFETCH));
+ } else {
+ bzero(counts, counts_size);
+ }
+ return (counts);
+}
+
+extern void
+vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
+ uint32_t *counts)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
+}
Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c (revision 332525)
@@ -1,1360 +1,1437 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
/*
* Virtual Device Labels
* ---------------------
*
* The vdev label serves several distinct purposes:
*
* 1. Uniquely identify this device as part of a ZFS pool and confirm its
* identity within the pool.
*
* 2. Verify that all the devices given in a configuration are present
* within the pool.
*
* 3. Determine the uberblock for the pool.
*
* 4. In case of an import operation, determine the configuration of the
* toplevel vdev of which it is a part.
*
* 5. If an import operation cannot find all the devices in the pool,
* provide enough information to the administrator to determine which
* devices are missing.
*
* It is important to note that while the kernel is responsible for writing the
* label, it only consumes the information in the first three cases. The
* latter information is only consumed in userland when determining the
* configuration to import a pool.
*
*
* Label Organization
* ------------------
*
* Before describing the contents of the label, it's important to understand how
* the labels are written and updated with respect to the uberblock.
*
* When the pool configuration is altered, either because it was newly created
* or a device was added, we want to update all the labels such that we can deal
* with fatal failure at any point. To this end, each disk has two labels which
* are updated before and after the uberblock is synced. Assuming we have
* labels and an uberblock with the following transaction groups:
*
* L1 UB L2
* +------+ +------+ +------+
* | | | | | |
* | t10 | | t10 | | t10 |
* | | | | | |
* +------+ +------+ +------+
*
* In this stable state, the labels and the uberblock were all updated within
* the same transaction group (10). Each label is mirrored and checksummed, so
* that we can detect when we fail partway through writing the label.
*
* In order to identify which labels are valid, the labels are written in the
* following manner:
*
* 1. For each vdev, update 'L1' to the new label
* 2. Update the uberblock
* 3. For each vdev, update 'L2' to the new label
*
* Given arbitrary failure, we can determine the correct label to use based on
* the transaction group. If we fail after updating L1 but before updating the
* UB, we will notice that L1's transaction group is greater than the uberblock,
* so L2 must be valid. If we fail after writing the uberblock but before
* writing L2, we will notice that L2's transaction group is less than L1, and
* therefore L1 is valid.
*
* Another added complexity is that not every label is updated when the config
* is synced. If we add a single device, we do not want to have to re-write
* every label for every device in the pool. This means that both L1 and L2 may
* be older than the pool uberblock, because the necessary information is stored
* on another vdev.
*
*
* On-disk Format
* --------------
*
* The vdev label consists of two distinct parts, and is wrapped within the
* vdev_label_t structure. The label includes 8k of padding to permit legacy
* VTOC disk labels, but is otherwise ignored.
*
* The first half of the label is a packed nvlist which contains pool wide
* properties, per-vdev properties, and configuration information. It is
* described in more detail below.
*
* The latter half of the label consists of a redundant array of uberblocks.
* These uberblocks are updated whenever a transaction group is committed,
* or when the configuration is updated. When a pool is loaded, we scan each
* vdev for the 'best' uberblock.
*
*
* Configuration Information
* -------------------------
*
* The nvlist describing the pool and vdev contains the following elements:
*
* version ZFS on-disk version
* name Pool name
* state Pool state
* txg Transaction group in which this label was written
* pool_guid Unique identifier for this pool
* vdev_tree An nvlist describing vdev tree.
* features_for_read
* An nvlist of the features necessary for reading the MOS.
*
* Each leaf device label also contains the following:
*
* top_guid Unique ID for top-level vdev in which this is contained
* guid Unique ID for the leaf vdev
*
* The 'vs' configuration follows the format described in 'spa_config.c'.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/trim_map.h>
static boolean_t vdev_trim_on_init = B_TRUE;
SYSCTL_DECL(_vfs_zfs_vdev);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW,
&vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");
/*
* Basic routines to read and write from a vdev label.
* Used throughout the rest of this file.
*/
uint64_t
vdev_label_offset(uint64_t psize, int l, uint64_t offset)
{
ASSERT(offset < sizeof (vdev_label_t));
ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
}
/*
* Returns back the vdev label associated with the passed in offset.
*/
int
vdev_label_number(uint64_t psize, uint64_t offset)
{
int l;
if (offset >= psize - VDEV_LABEL_END_SIZE) {
offset -= psize - VDEV_LABEL_END_SIZE;
offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
}
l = offset / sizeof (vdev_label_t);
return (l < VDEV_LABELS ? l : -1);
}
static void
vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
SCL_STATE_ALL);
ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
zio_nowait(zio_read_phys(zio, vd,
vdev_label_offset(vd->vdev_psize, l, offset),
size, buf, ZIO_CHECKSUM_LABEL, done, private,
ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
}
static void
vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
(spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
(SCL_CONFIG | SCL_STATE) &&
dsl_pool_sync_context(spa_get_dsl(zio->io_spa))));
ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
zio_nowait(zio_write_phys(zio, vd,
vdev_label_offset(vd->vdev_psize, l, offset),
size, buf, ZIO_CHECKSUM_LABEL, done, private,
ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
}
/*
* Generate the nvlist representing this vdev's config.
*/
nvlist_t *
vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vdev_config_flag_t flags)
{
nvlist_t *nv = NULL;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
nv = fnvlist_alloc();
fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
if (vd->vdev_path != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
if (vd->vdev_devid != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
if (vd->vdev_physpath != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
vd->vdev_physpath);
if (vd->vdev_fru != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
if (vd->vdev_nparity != 0) {
ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
VDEV_TYPE_RAIDZ) == 0);
/*
* Make sure someone hasn't managed to sneak a fancy new vdev
* into a crufty old storage pool.
*/
ASSERT(vd->vdev_nparity == 1 ||
(vd->vdev_nparity <= 2 &&
spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
(vd->vdev_nparity <= 3 &&
spa_version(spa) >= SPA_VERSION_RAIDZ3));
/*
* Note that we'll add the nparity tag even on storage pools
* that only support a single parity device -- older software
* will just ignore it.
*/
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
}
if (vd->vdev_wholedisk != -1ULL)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk);
if (vd->vdev_not_present)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
if (vd->vdev_isspare)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
vd == vd->vdev_top) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
vd->vdev_ms_array);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
vd->vdev_ms_shift);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
vd->vdev_asize);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
- if (vd->vdev_removing)
+ if (vd->vdev_removing) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
vd->vdev_removing);
+ }
}
if (vd->vdev_dtl_sm != NULL) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
space_map_object(vd->vdev_dtl_sm));
}
+ if (vic->vic_mapping_object != 0) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
+ vic->vic_mapping_object);
+ }
+
+ if (vic->vic_births_object != 0) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
+ vic->vic_births_object);
+ }
+
+ if (vic->vic_prev_indirect_vdev != UINT64_MAX) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+ vic->vic_prev_indirect_vdev);
+ }
+
if (vd->vdev_crtxg)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
if (flags & VDEV_CONFIG_MOS) {
if (vd->vdev_leaf_zap != 0) {
ASSERT(vd->vdev_ops->vdev_op_leaf);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
vd->vdev_leaf_zap);
}
if (vd->vdev_top_zap != 0) {
ASSERT(vd == vd->vdev_top);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
vd->vdev_top_zap);
}
}
if (getstats) {
vdev_stat_t vs;
- pool_scan_stat_t ps;
vdev_get_stats(vd, &vs);
fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
/* provide either current or previous scan information */
+ pool_scan_stat_t ps;
if (spa_scan_get_stats(spa, &ps) == 0) {
fnvlist_add_uint64_array(nv,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
sizeof (pool_scan_stat_t) / sizeof (uint64_t));
}
+
+ pool_removal_stat_t prs;
+ if (spa_removal_get_stats(spa, &prs) == 0) {
+ fnvlist_add_uint64_array(nv,
+ ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
+ sizeof (prs) / sizeof (uint64_t));
+ }
+
+ /*
+ * Note: this can be called from open context
+ * (spa_get_stats()), so we need the rwlock to prevent
+ * the mapping from being changed by condensing.
+ */
+ rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
+ if (vd->vdev_indirect_mapping != NULL) {
+ ASSERT(vd->vdev_indirect_births != NULL);
+ vdev_indirect_mapping_t *vim =
+ vd->vdev_indirect_mapping;
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
+ vdev_indirect_mapping_size(vim));
+ }
+ rw_exit(&vd->vdev_indirect_rwlock);
+ if (vd->vdev_mg != NULL &&
+ vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) {
+ /*
+ * Compute approximately how much memory would be used
+ * for the indirect mapping if this device were to
+ * be removed.
+ *
+ * Note: If the frag metric is invalid, then not
+ * enough metaslabs have been converted to have
+ * histograms.
+ */
+ uint64_t seg_count = 0;
+
+ /*
+ * There are the same number of allocated segments
+ * as free segments, so we will have at least one
+ * entry per free segment.
+ */
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ seg_count += vd->vdev_mg->mg_histogram[i];
+ }
+
+ /*
+ * The maximum length of a mapping is SPA_MAXBLOCKSIZE,
+ * so we need at least one entry per SPA_MAXBLOCKSIZE
+ * of allocated data.
+ */
+ seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE;
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
+ seg_count *
+ sizeof (vdev_indirect_mapping_entry_phys_t));
+ }
}
if (!vd->vdev_ops->vdev_op_leaf) {
nvlist_t **child;
int c, idx;
ASSERT(!vd->vdev_ishole);
child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
KM_SLEEP);
for (c = 0, idx = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
/*
* If we're generating an nvlist of removing
* vdevs then skip over any device which is
* not being removed.
*/
if ((flags & VDEV_CONFIG_REMOVING) &&
!cvd->vdev_removing)
continue;
child[idx++] = vdev_config_generate(spa, cvd,
getstats, flags);
}
if (idx) {
fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
child, idx);
}
for (c = 0; c < idx; c++)
nvlist_free(child[c]);
kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
} else {
const char *aux = NULL;
if (vd->vdev_offline && !vd->vdev_tmpoffline)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
if (vd->vdev_resilver_txg != 0)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
vd->vdev_resilver_txg);
if (vd->vdev_faulted)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
if (vd->vdev_degraded)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
if (vd->vdev_removed)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
if (vd->vdev_unspare)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
if (vd->vdev_ishole)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
switch (vd->vdev_stat.vs_aux) {
case VDEV_AUX_ERR_EXCEEDED:
aux = "err_exceeded";
break;
case VDEV_AUX_EXTERNAL:
aux = "external";
break;
}
if (aux != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
vd->vdev_orig_guid);
}
}
return (nv);
}
/*
* Generate a view of the top-level vdevs. If we currently have holes
* in the namespace, then generate an array which contains a list of holey
* vdevs. Additionally, add the number of top-level children that currently
* exist.
*/
void
vdev_top_config_generate(spa_t *spa, nvlist_t *config)
{
vdev_t *rvd = spa->spa_root_vdev;
uint64_t *array;
uint_t c, idx;
array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- if (tvd->vdev_ishole)
+ if (tvd->vdev_ishole) {
array[idx++] = c;
+ }
}
if (idx) {
VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
array, idx) == 0);
}
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
rvd->vdev_children) == 0);
kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
}
/*
* Returns the configuration from the label of the given vdev. For vdevs
* which don't have a txg value stored on their label (i.e. spares/cache)
* or have not been completely initialized (txg = 0) just return
* the configuration from the first valid label we find. Otherwise,
* find the most up-to-date label that does not exceed the specified
* 'txg' value.
*/
nvlist_t *
vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
nvlist_t *config = NULL;
vdev_phys_t *vp;
abd_t *vp_abd;
zio_t *zio;
uint64_t best_txg = 0;
int error = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
if (!vdev_readable(vd))
return (NULL);
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
vp = abd_to_buf(vp_abd);
retry:
for (int l = 0; l < VDEV_LABELS; l++) {
nvlist_t *label = NULL;
zio = zio_root(spa, NULL, NULL, flags);
vdev_label_read(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
if (zio_wait(zio) == 0 &&
nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
&label, 0) == 0) {
uint64_t label_txg = 0;
/*
* Auxiliary vdevs won't have txg values in their
* labels and newly added vdevs may not have been
* completely initialized so just return the
* configuration from the first valid label we
* encounter.
*/
error = nvlist_lookup_uint64(label,
ZPOOL_CONFIG_POOL_TXG, &label_txg);
if ((error || label_txg == 0) && !config) {
config = label;
break;
} else if (label_txg <= txg && label_txg > best_txg) {
best_txg = label_txg;
nvlist_free(config);
config = fnvlist_dup(label);
}
}
if (label != NULL) {
nvlist_free(label);
label = NULL;
}
}
if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
flags |= ZIO_FLAG_TRYHARD;
goto retry;
}
abd_free(vp_abd);
return (config);
}
/*
* Determine if a device is in use. The 'spare_guid' parameter will be filled
* in with the device guid if this spare is active elsewhere on the system.
*/
static boolean_t
vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
uint64_t *spare_guid, uint64_t *l2cache_guid)
{
spa_t *spa = vd->vdev_spa;
uint64_t state, pool_guid, device_guid, txg, spare_pool;
uint64_t vdtxg = 0;
nvlist_t *label;
if (spare_guid)
*spare_guid = 0ULL;
if (l2cache_guid)
*l2cache_guid = 0ULL;
/*
* Read the label, if any, and perform some basic sanity checks.
*/
if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
return (B_FALSE);
(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
&vdtxg);
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
&state) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
&device_guid) != 0) {
nvlist_free(label);
return (B_FALSE);
}
if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&pool_guid) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
&txg) != 0)) {
nvlist_free(label);
return (B_FALSE);
}
nvlist_free(label);
/*
* Check to see if this device indeed belongs to the pool it claims to
* be a part of. The only way this is allowed is if the device is a hot
* spare (which we check for later on).
*/
if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
!spa_guid_exists(pool_guid, device_guid) &&
!spa_spare_exists(device_guid, NULL, NULL) &&
!spa_l2cache_exists(device_guid, NULL))
return (B_FALSE);
/*
* If the transaction group is zero, then this an initialized (but
* unused) label. This is only an error if the create transaction
* on-disk is the same as the one we're using now, in which case the
* user has attempted to add the same vdev multiple times in the same
* transaction.
*/
if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
txg == 0 && vdtxg == crtxg)
return (B_TRUE);
/*
* Check to see if this is a spare device. We do an explicit check for
* spa_has_spare() here because it may be on our pending list of spares
* to add. We also check if it is an l2cache device.
*/
if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
spa_has_spare(spa, device_guid)) {
if (spare_guid)
*spare_guid = device_guid;
switch (reason) {
case VDEV_LABEL_CREATE:
case VDEV_LABEL_L2CACHE:
return (B_TRUE);
case VDEV_LABEL_REPLACE:
return (!spa_has_spare(spa, device_guid) ||
spare_pool != 0ULL);
case VDEV_LABEL_SPARE:
return (spa_has_spare(spa, device_guid));
}
}
/*
* Check to see if this is an l2cache device.
*/
if (spa_l2cache_exists(device_guid, NULL))
return (B_TRUE);
/*
* We can't rely on a pool's state if it's been imported
* read-only. Instead we look to see if the pools is marked
* read-only in the namespace and set the state to active.
*/
if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
(spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
spa_mode(spa) == FREAD)
state = POOL_STATE_ACTIVE;
/*
* If the device is marked ACTIVE, then this device is in use by another
* pool on the system.
*/
return (state == POOL_STATE_ACTIVE);
}
/*
* Initialize a vdev label. We check to make sure each leaf device is not in
* use, and writable. We put down an initial label which we will later
* overwrite with a complete label. Note that it's important to do this
* sequentially, not in parallel, so that we catch cases of multiple use of the
* same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
* itself.
*/
int
vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
{
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
vdev_phys_t *vp;
abd_t *vp_abd;
abd_t *pad2;
uberblock_t *ub;
abd_t *ub_abd;
zio_t *zio;
char *buf;
size_t buflen;
int error;
uint64_t spare_guid, l2cache_guid;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
for (int c = 0; c < vd->vdev_children; c++)
if ((error = vdev_label_init(vd->vdev_child[c],
crtxg, reason)) != 0)
return (error);
/* Track the creation time for this vdev */
vd->vdev_crtxg = crtxg;
if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
return (0);
/*
* Dead vdevs cannot be initialized.
*/
if (vdev_is_dead(vd))
return (SET_ERROR(EIO));
/*
* Determine if the vdev is in use.
*/
if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
return (SET_ERROR(EBUSY));
/*
* If this is a request to add or replace a spare or l2cache device
* that is in use elsewhere on the system, then we must update the
* guid (which was initialized to a random value) to reflect the
* actual GUID (which is shared between multiple pools).
*/
if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
spare_guid != 0ULL) {
uint64_t guid_delta = spare_guid - vd->vdev_guid;
vd->vdev_guid += guid_delta;
for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += guid_delta;
/*
* If this is a replacement, then we want to fallthrough to the
* rest of the code. If we're adding a spare, then it's already
* labeled appropriately and we can just return.
*/
if (reason == VDEV_LABEL_SPARE)
return (0);
ASSERT(reason == VDEV_LABEL_REPLACE ||
reason == VDEV_LABEL_SPLIT);
}
if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
l2cache_guid != 0ULL) {
uint64_t guid_delta = l2cache_guid - vd->vdev_guid;
vd->vdev_guid += guid_delta;
for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += guid_delta;
/*
* If this is a replacement, then we want to fallthrough to the
* rest of the code. If we're adding an l2cache, then it's
* already labeled appropriately and we can just return.
*/
if (reason == VDEV_LABEL_L2CACHE)
return (0);
ASSERT(reason == VDEV_LABEL_REPLACE);
}
/*
* TRIM the whole thing, excluding the blank space and boot header
* as specified by ZFS On-Disk Specification (section 1.3), so that
* we start with a clean slate.
* It's just an optimization, so we don't care if it fails.
* Don't TRIM if removing so that we don't interfere with zpool
* disaster recovery.
*/
if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim &&
(reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE ||
reason == VDEV_LABEL_L2CACHE))
zio_wait(zio_trim(NULL, spa, vd, VDEV_SKIP_SIZE,
vd->vdev_psize - VDEV_SKIP_SIZE));
/*
* Initialize its label.
*/
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
abd_zero(vp_abd, sizeof (vdev_phys_t));
vp = abd_to_buf(vp_abd);
/*
* Generate a label describing the pool and our top-level vdev.
* We mark it as being from txg 0 to indicate that it's not
* really part of an active pool just yet. The labels will
* be written again with a meaningful txg by spa_sync().
*/
if (reason == VDEV_LABEL_SPARE ||
(reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
/*
* For inactive hot spares, we generate a special label that
* identifies as a mutually shared hot spare. We write the
* label if we are adding a hot spare, or if we are removing an
* active hot spare (in which case we want to revert the
* labels).
*/
VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
spa_version(spa)) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
POOL_STATE_SPARE) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
} else if (reason == VDEV_LABEL_L2CACHE ||
(reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
/*
* For level 2 ARC devices, add a special label.
*/
VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
spa_version(spa)) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
POOL_STATE_L2CACHE) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
} else {
uint64_t txg = 0ULL;
if (reason == VDEV_LABEL_SPLIT)
txg = spa->spa_uberblock.ub_txg;
label = spa_config_generate(spa, vd, txg, B_FALSE);
/*
* Add our creation time. This allows us to detect multiple
* vdev uses as described above, and automatically expires if we
* fail.
*/
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
crtxg) == 0);
}
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
if (error != 0) {
nvlist_free(label);
abd_free(vp_abd);
/* EFAULT means nvlist_pack ran out of room */
return (error == EFAULT ? ENAMETOOLONG : EINVAL);
}
/*
* Initialize uberblock template.
*/
ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
ub = abd_to_buf(ub_abd);
ub->ub_txg = 0;
/* Initialize the 2nd padding area. */
pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
abd_zero(pad2, VDEV_PAD_SIZE);
/*
* Write everything in parallel.
*/
retry:
zio = zio_root(spa, NULL, NULL, flags);
for (int l = 0; l < VDEV_LABELS; l++) {
vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
/*
* Skip the 1st padding area.
* Zero out the 2nd padding area where it might have
* left over data from previous filesystem format.
*/
vdev_label_write(zio, vd, l, pad2,
offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags);
vdev_label_write(zio, vd, l, ub_abd,
offsetof(vdev_label_t, vl_uberblock),
VDEV_UBERBLOCK_RING, NULL, NULL, flags);
}
error = zio_wait(zio);
if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
flags |= ZIO_FLAG_TRYHARD;
goto retry;
}
nvlist_free(label);
abd_free(pad2);
abd_free(ub_abd);
abd_free(vp_abd);
/*
* If this vdev hasn't been previously identified as a spare, then we
* mark it as such only if a) we are labeling it as a spare, or b) it
* exists as a spare elsewhere in the system. Do the same for
* level 2 ARC devices.
*/
if (error == 0 && !vd->vdev_isspare &&
(reason == VDEV_LABEL_SPARE ||
spa_spare_exists(vd->vdev_guid, NULL, NULL)))
spa_spare_add(vd);
if (error == 0 && !vd->vdev_isl2cache &&
(reason == VDEV_LABEL_L2CACHE ||
spa_l2cache_exists(vd->vdev_guid, NULL)))
spa_l2cache_add(vd);
return (error);
}
int
vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
{
spa_t *spa = vd->vdev_spa;
zio_t *zio;
abd_t *pad2;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
int error;
if (size > VDEV_PAD_SIZE)
return (EINVAL);
if (!vd->vdev_ops->vdev_op_leaf)
return (ENODEV);
if (vdev_is_dead(vd))
return (ENXIO);
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
abd_zero(pad2, VDEV_PAD_SIZE);
abd_copy_from_buf(pad2, buf, size);
retry:
zio = zio_root(spa, NULL, NULL, flags);
vdev_label_write(zio, vd, 0, pad2,
offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags);
error = zio_wait(zio);
if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
flags |= ZIO_FLAG_TRYHARD;
goto retry;
}
abd_free(pad2);
return (error);
}
/*
* ==========================================================================
* uberblock load/sync
* ==========================================================================
*/
/*
* Consider the following situation: txg is safely synced to disk. We've
* written the first uberblock for txg + 1, and then we lose power. When we
* come back up, we fail to see the uberblock for txg + 1 because, say,
* it was on a mirrored device and the replica to which we wrote txg + 1
* is now offline. If we then make some changes and sync txg + 1, and then
* the missing replica comes back, then for a few seconds we'll have two
* conflicting uberblocks on disk with the same txg. The solution is simple:
* among uberblocks with equal txg, choose the one with the latest timestamp.
*/
static int
vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
{
if (ub1->ub_txg < ub2->ub_txg)
return (-1);
if (ub1->ub_txg > ub2->ub_txg)
return (1);
if (ub1->ub_timestamp < ub2->ub_timestamp)
return (-1);
if (ub1->ub_timestamp > ub2->ub_timestamp)
return (1);
return (0);
}
struct ubl_cbdata {
uberblock_t *ubl_ubbest; /* Best uberblock */
vdev_t *ubl_vd; /* vdev associated with the above */
};
static void
vdev_uberblock_load_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
spa_t *spa = zio->io_spa;
zio_t *rio = zio->io_private;
uberblock_t *ub = abd_to_buf(zio->io_abd);
struct ubl_cbdata *cbp = rio->io_private;
ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&rio->io_lock);
if (ub->ub_txg <= spa->spa_load_max_txg &&
vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
/*
* Keep track of the vdev in which this uberblock
* was found. We will use this information later
* to obtain the config nvlist associated with
* this uberblock.
*/
*cbp->ubl_ubbest = *ub;
cbp->ubl_vd = vd;
}
mutex_exit(&rio->io_lock);
}
abd_free(zio->io_abd);
}
static void
vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
struct ubl_cbdata *cbp)
{
for (int c = 0; c < vd->vdev_children; c++)
vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
for (int l = 0; l < VDEV_LABELS; l++) {
for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l,
abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_load_done, zio, flags);
}
}
}
}
/*
* Reads the 'best' uberblock from disk along with its associated
* configuration. First, we read the uberblock array of each label of each
* vdev, keeping track of the uberblock with the highest txg in each array.
* Then, we read the configuration from the same vdev as the best uberblock.
*/
void
vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
{
zio_t *zio;
spa_t *spa = rvd->vdev_spa;
struct ubl_cbdata cb;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
ASSERT(ub);
ASSERT(config);
bzero(ub, sizeof (uberblock_t));
*config = NULL;
cb.ubl_ubbest = ub;
cb.ubl_vd = NULL;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
zio = zio_root(spa, NULL, &cb, flags);
vdev_uberblock_load_impl(zio, rvd, flags, &cb);
(void) zio_wait(zio);
/*
* It's possible that the best uberblock was discovered on a label
* that has a configuration which was written in a future txg.
* Search all labels on this vdev to find the configuration that
* matches the txg for our uberblock.
*/
if (cb.ubl_vd != NULL)
*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
* On success, increment root zio's count of good writes.
* We only get credit for writes to known-visible vdevs; see spa_vdev_add().
*/
static void
vdev_uberblock_sync_done(zio_t *zio)
{
uint64_t *good_writes = zio->io_private;
if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
atomic_inc_64(good_writes);
}
/*
* Write the uberblock to all labels of all leaves of the specified vdev.
*/
static void
vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
{
for (int c = 0; c < vd->vdev_children; c++)
vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
if (!vd->vdev_ops->vdev_op_leaf)
return;
if (!vdev_writeable(vd))
return;
int n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
/* Copy the uberblock_t into the ABD */
abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
for (int l = 0; l < VDEV_LABELS; l++)
vdev_label_write(zio, vd, l, ub_abd,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_sync_done, zio->io_private,
flags | ZIO_FLAG_DONT_PROPAGATE);
abd_free(ub_abd);
}
/* Sync the uberblocks to all vdevs in svd[] */
int
vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
{
spa_t *spa = svd[0]->vdev_spa;
zio_t *zio;
uint64_t good_writes = 0;
zio = zio_root(spa, NULL, &good_writes, flags);
for (int v = 0; v < svdcount; v++)
vdev_uberblock_sync(zio, ub, svd[v], flags);
(void) zio_wait(zio);
/*
* Flush the uberblocks to disk. This ensures that the odd labels
* are no longer needed (because the new uberblocks and the even
* labels are safely on disk), so it is safe to overwrite them.
*/
zio = zio_root(spa, NULL, NULL, flags);
- for (int v = 0; v < svdcount; v++)
- zio_flush(zio, svd[v]);
+ for (int v = 0; v < svdcount; v++) {
+ if (vdev_writeable(svd[v])) {
+ zio_flush(zio, svd[v]);
+ }
+ }
(void) zio_wait(zio);
return (good_writes >= 1 ? 0 : EIO);
}
/*
* On success, increment the count of good writes for our top-level vdev.
*/
static void
vdev_label_sync_done(zio_t *zio)
{
uint64_t *good_writes = zio->io_private;
if (zio->io_error == 0)
atomic_inc_64(good_writes);
}
/*
* If there weren't enough good writes, indicate failure to the parent.
*/
static void
vdev_label_sync_top_done(zio_t *zio)
{
uint64_t *good_writes = zio->io_private;
if (*good_writes == 0)
zio->io_error = SET_ERROR(EIO);
kmem_free(good_writes, sizeof (uint64_t));
}
/*
* We ignore errors for log and cache devices, simply free the private data.
*/
static void
vdev_label_sync_ignore_done(zio_t *zio)
{
kmem_free(zio->io_private, sizeof (uint64_t));
}
/*
* Write all even or odd labels to all leaves of the specified vdev.
*/
static void
vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
{
nvlist_t *label;
vdev_phys_t *vp;
abd_t *vp_abd;
char *buf;
size_t buflen;
for (int c = 0; c < vd->vdev_children; c++)
vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags);
if (!vd->vdev_ops->vdev_op_leaf)
return;
if (!vdev_writeable(vd))
return;
/*
* Generate a label describing the top-level config to which we belong.
*/
label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
abd_zero(vp_abd, sizeof (vdev_phys_t));
vp = abd_to_buf(vp_abd);
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) {
for (; l < VDEV_LABELS; l += 2) {
vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t),
vdev_label_sync_done, zio->io_private,
flags | ZIO_FLAG_DONT_PROPAGATE);
}
}
abd_free(vp_abd);
nvlist_free(label);
}
int
vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
{
list_t *dl = &spa->spa_config_dirty_list;
vdev_t *vd;
zio_t *zio;
int error;
/*
* Write the new labels to disk.
*/
zio = zio_root(spa, NULL, NULL, flags);
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
KM_SLEEP);
ASSERT(!vd->vdev_ishole);
zio_t *vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
good_writes, flags);
vdev_label_sync(vio, vd, l, txg, flags);
zio_nowait(vio);
}
error = zio_wait(zio);
/*
* Flush the new labels to disk.
*/
zio = zio_root(spa, NULL, NULL, flags);
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
zio_flush(zio, vd);
(void) zio_wait(zio);
return (error);
}
/*
* Sync the uberblock and any changes to the vdev configuration.
*
* The order of operations is carefully crafted to ensure that
* if the system panics or loses power at any time, the state on disk
* is still transactionally consistent. The in-line comments below
* describe the failure semantics at each stage.
*
* Moreover, vdev_config_sync() is designed to be idempotent: if it fails
* at any time, you can just call it again, and it will resume its work.
*/
int
vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
{
spa_t *spa = svd[0]->vdev_spa;
uberblock_t *ub = &spa->spa_uberblock;
vdev_t *vd;
zio_t *zio;
int error = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
retry:
/*
* Normally, we don't want to try too hard to write every label and
* uberblock. If there is a flaky disk, we don't want the rest of the
* sync process to block while we retry. But if we can't write a
* single label out, we should retry with ZIO_FLAG_TRYHARD before
* bailing out and declaring the pool faulted.
*/
if (error != 0) {
if ((flags & ZIO_FLAG_TRYHARD) != 0)
return (error);
flags |= ZIO_FLAG_TRYHARD;
}
ASSERT(ub->ub_txg <= txg);
/*
* If this isn't a resync due to I/O errors,
* and nothing changed in this transaction group,
* and the vdev configuration hasn't changed,
* then there's nothing to do.
*/
if (ub->ub_txg < txg &&
uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE &&
list_is_empty(&spa->spa_config_dirty_list))
return (0);
if (txg > spa_freeze_txg(spa))
return (0);
ASSERT(txg <= spa->spa_final_txg);
/*
* Flush the write cache of every disk that's been written to
* in this transaction group. This ensures that all blocks
* written in this txg will be committed to stable storage
* before any uberblock that references them.
*/
zio = zio_root(spa, NULL, NULL, flags);
for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
zio_flush(zio, vd);
(void) zio_wait(zio);
/*
* Sync out the even labels (L0, L2) for every dirty vdev. If the
* system dies in the middle of this process, that's OK: all of the
* even labels that made it to disk will be newer than any uberblock,
* and will therefore be considered invalid. The odd labels (L1, L3),
* which have not yet been touched, will still be valid. We flush
* the new labels to disk to ensure that all even-label updates
* are committed to stable storage before the uberblock update.
*/
if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0)
goto retry;
/*
* Sync the uberblocks to all vdevs in svd[].
* If the system dies in the middle of this step, there are two cases
* to consider, and the on-disk state is consistent either way:
*
* (1) If none of the new uberblocks made it to disk, then the
* previous uberblock will be the newest, and the odd labels
* (which had not yet been touched) will be valid with respect
* to that uberblock.
*
* (2) If one or more new uberblocks made it to disk, then they
* will be the newest, and the even labels (which had all
* been successfully committed) will be valid with respect
* to the new uberblocks.
*/
if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
goto retry;
/*
* Sync out odd labels for every dirty vdev. If the system dies
* in the middle of this process, the even labels and the new
* uberblocks will suffice to open the pool. The next time
* the pool is opened, the first thing we'll do -- before any
* user data is modified -- is mark every vdev dirty so that
* all labels will be brought up to date. We flush the new labels
* to disk to ensure that all odd-label updates are committed to
* stable storage before the next transaction group begins.
*/
if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
goto retry;;
trim_thread_wakeup(spa);
return (0);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c (revision 332525)
@@ -1,708 +1,711 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
/*
* Virtual device vector for mirroring.
*/
typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
int mc_load;
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
} mirror_child_t;
typedef struct mirror_map {
int *mm_preferred;
int mm_preferred_cnt;
int mm_children;
boolean_t mm_resilvering;
boolean_t mm_root;
mirror_child_t mm_child[];
} mirror_map_t;
static int vdev_mirror_shift = 21;
#ifdef _KERNEL
SYSCTL_DECL(_vfs_zfs_vdev);
static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
"ZFS VDEV Mirror");
#endif
/*
* The load configuration settings below are tuned by default for
* the case where all devices are of the same rotational type.
*
* If there is a mixture of rotating and non-rotating media, setting
* non_rotating_seek_inc to 0 may well provide better results as it
* will direct more reads to the non-rotating vdevs which are more
* likely to have a higher performance.
*/
/* Rotating media load calculation configuration. */
static int rotating_inc = 0;
#ifdef _KERNEL
SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN,
&rotating_inc, 0, "Rotating media load increment for non-seeking I/O's");
#endif
static int rotating_seek_inc = 5;
#ifdef _KERNEL
SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN,
&rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's");
#endif
static int rotating_seek_offset = 1 * 1024 * 1024;
#ifdef _KERNEL
SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN,
&rotating_seek_offset, 0, "Offset in bytes from the last I/O which "
"triggers a reduced rotating media seek increment");
#endif
/* Non-rotating media load calculation configuration. */
static int non_rotating_inc = 0;
#ifdef _KERNEL
SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN,
&non_rotating_inc, 0,
"Non-rotating media load increment for non-seeking I/O's");
#endif
static int non_rotating_seek_inc = 1;
#ifdef _KERNEL
SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN,
&non_rotating_seek_inc, 0,
"Non-rotating media load increment for seeking I/O's");
#endif
static inline size_t
vdev_mirror_map_size(int children)
{
return (offsetof(mirror_map_t, mm_child[children]) +
sizeof(int) * children);
}
static inline mirror_map_t *
vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
{
mirror_map_t *mm;
mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
mm->mm_children = children;
mm->mm_resilvering = resilvering;
mm->mm_root = root;
mm->mm_preferred = (int *)((uintptr_t)mm +
offsetof(mirror_map_t, mm_child[children]));
return mm;
}
static void
vdev_mirror_map_free(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}
static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
vdev_mirror_map_free,
zio_vsd_default_cksum_report
};
static int
vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
{
uint64_t lastoffset;
int load;
/* All DVAs have equal weight at the root. */
if (mm->mm_root)
return (INT_MAX);
/*
* We don't return INT_MAX if the device is resilvering i.e.
* vdev_resilver_txg != 0 as when tested performance was slightly
* worse overall when resilvering with compared to without.
*/
/* Standard load based on pending queue length. */
load = vdev_queue_length(vd);
lastoffset = vdev_queue_lastoffset(vd);
if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) {
/* Non-rotating media. */
if (lastoffset == zio_offset)
return (load + non_rotating_inc);
/*
* Apply a seek penalty even for non-rotating devices as
* sequential I/O'a can be aggregated into fewer operations
* on the device, thus avoiding unnecessary per-command
* overhead and boosting performance.
*/
return (load + non_rotating_seek_inc);
}
/* Rotating media I/O's which directly follow the last I/O. */
if (lastoffset == zio_offset)
return (load + rotating_inc);
/*
* Apply half the seek increment to I/O's within seek offset
* of the last I/O queued to this vdev as they should incure less
* of a seek increment.
*/
if (ABS(lastoffset - zio_offset) < rotating_seek_offset)
return (load + (rotating_seek_inc / 2));
/* Apply the full seek increment to all other I/O's. */
return (load + rotating_seek_inc);
}
static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
mirror_map_t *mm = NULL;
mirror_child_t *mc;
vdev_t *vd = zio->io_vd;
int c;
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
B_TRUE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
}
} else {
/*
* If we are resilvering, then we should handle scrub reads
* differently; we shouldn't issue them to the resilvering
* device because it might not have those blocks.
*
* We are resilvering iff:
* 1) We are a replacing vdev (ie our name is "replacing-1" or
* "spare-1" or something like that), and
* 2) The pool is currently being resilvered.
*
* We cannot simply check vd->vdev_resilver_txg, because it's
* not set in this path.
*
* Nor can we just check our vdev_ops; there are cases (such as
* when a user types "zpool replace pool odev spare_dev" and
* spare_dev is in the spare list, or when a spare device is
* automatically used to replace a DEGRADED device) when
* resilvering is complete but both the original vdev and the
* spare vdev remain in the pool. That behavior is intentional.
* It helps implement the policy that a spare should be
* automatically removed from the pool after the user replaces
* the device that originally failed.
*
* If a spa load is in progress, then spa_dsl_pool may be
* uninitialized. But we shouldn't be resilvering during a spa
* load anyway.
*/
boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops) &&
spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
B_FALSE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
mc->mc_offset = zio->io_offset;
}
}
zio->io_vsd = mm;
zio->io_vsd_ops = &vdev_mirror_vsd_ops;
return (mm);
}
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
int numerrors = 0;
int lasterror = 0;
if (vd->vdev_children == 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (SET_ERROR(EINVAL));
}
vdev_open_children(vd);
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_open_error) {
lasterror = cvd->vdev_open_error;
numerrors++;
continue;
}
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
*physical_ashift = MAX(*physical_ashift,
cvd->vdev_physical_ashift);
}
if (numerrors == vd->vdev_children) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
return (lasterror);
}
return (0);
}
static void
vdev_mirror_close(vdev_t *vd)
{
for (int c = 0; c < vd->vdev_children; c++)
vdev_close(vd->vdev_child[c]);
}
static void
vdev_mirror_child_done(zio_t *zio)
{
mirror_child_t *mc = zio->io_private;
mc->mc_error = zio->io_error;
mc->mc_tried = 1;
mc->mc_skipped = 0;
}
static void
vdev_mirror_scrub_done(zio_t *zio)
{
mirror_child_t *mc = zio->io_private;
if (zio->io_error == 0) {
zio_t *pio;
zio_link_t *zl = NULL;
mutex_enter(&zio->io_lock);
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size);
abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
mutex_exit(&pio->io_lock);
}
mutex_exit(&zio->io_lock);
}
abd_free(zio->io_abd);
mc->mc_error = zio->io_error;
mc->mc_tried = 1;
mc->mc_skipped = 0;
}
/*
* Check the other, lower-index DVAs to see if they're on the same
* vdev as the child we picked. If they are, use them since they
* are likely to have been allocated from the primary metaslab in
* use at the time, and hence are more likely to have locality with
* single-copy data.
*/
static int
vdev_mirror_dva_select(zio_t *zio, int p)
{
dva_t *dva = zio->io_bp->blk_dva;
mirror_map_t *mm = zio->io_vsd;
int preferred;
int c;
preferred = mm->mm_preferred[p];
for (p-- ; p >= 0; p--) {
c = mm->mm_preferred[p];
if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
preferred = c;
}
return (preferred);
}
static int
vdev_mirror_preferred_child_randomize(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
int p;
if (mm->mm_root) {
p = spa_get_random(mm->mm_preferred_cnt);
return (vdev_mirror_dva_select(zio, p));
}
/*
* To ensure we don't always favour the first matching vdev,
* which could lead to wear leveling issues on SSD's, we
* use the I/O offset as a pseudo random seed into the vdevs
* which have the lowest load.
*/
p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
return (mm->mm_preferred[p]);
}
/*
* Try to find a vdev whose DTL doesn't contain the block we want to read
* prefering vdevs based on determined load.
*
* If we can't, try the read on any vdev we haven't already tried.
*/
static int
vdev_mirror_child_select(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
uint64_t txg = zio->io_txg;
int c, lowest_load;
ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
lowest_load = INT_MAX;
mm->mm_preferred_cnt = 0;
for (c = 0; c < mm->mm_children; c++) {
mirror_child_t *mc;
mc = &mm->mm_child[c];
if (mc->mc_tried || mc->mc_skipped)
continue;
if (!vdev_readable(mc->mc_vd)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
continue;
}
if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
mc->mc_error = SET_ERROR(ESTALE);
mc->mc_skipped = 1;
mc->mc_speculative = 1;
continue;
}
mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
if (mc->mc_load > lowest_load)
continue;
if (mc->mc_load < lowest_load) {
lowest_load = mc->mc_load;
mm->mm_preferred_cnt = 0;
}
mm->mm_preferred[mm->mm_preferred_cnt] = c;
mm->mm_preferred_cnt++;
}
if (mm->mm_preferred_cnt == 1) {
vdev_queue_register_lastoffset(
mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
return (mm->mm_preferred[0]);
}
if (mm->mm_preferred_cnt > 1) {
int c = vdev_mirror_preferred_child_randomize(zio);
vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
return (c);
}
/*
* Every device is either missing or has this txg in its DTL.
* Look for any child we haven't already tried before giving up.
*/
for (c = 0; c < mm->mm_children; c++) {
if (!mm->mm_child[c].mc_tried) {
vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
zio);
return (c);
}
}
/*
* Every child failed. There's no place left to look.
*/
return (-1);
}
static void
vdev_mirror_io_start(zio_t *zio)
{
mirror_map_t *mm;
mirror_child_t *mc;
int c, children;
mm = vdev_mirror_map_init(zio);
if (zio->io_type == ZIO_TYPE_READ) {
if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
mm->mm_children > 1) {
/*
* For scrubbing reads we need to allocate a read
* buffer for each child and issue reads to all
* children. If any child succeeds, it will copy its
* data into zio->io_data in vdev_mirror_scrub_done.
*/
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
abd_alloc_sametype(zio->io_abd,
zio->io_size), zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
}
zio_execute(zio);
return;
}
/*
* For normal reads just pick one child.
*/
c = vdev_mirror_child_select(zio);
children = (c >= 0);
} else {
ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_FREE);
/*
* Writes and frees go to all children.
*/
c = 0;
children = mm->mm_children;
}
while (children--) {
mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_child_done, mc));
c++;
}
zio_execute(zio);
}
static int
vdev_mirror_worst_error(mirror_map_t *mm)
{
int error[2] = { 0, 0 };
for (int c = 0; c < mm->mm_children; c++) {
mirror_child_t *mc = &mm->mm_child[c];
int s = mc->mc_speculative;
error[s] = zio_worst_error(error[s], mc->mc_error);
}
return (error[0] ? error[0] : error[1]);
}
static void
vdev_mirror_io_done(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
mirror_child_t *mc;
int c;
int good_copies = 0;
int unexpected_errors = 0;
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
if (mc->mc_error) {
if (!mc->mc_skipped)
unexpected_errors++;
} else if (mc->mc_tried) {
good_copies++;
}
}
if (zio->io_type == ZIO_TYPE_WRITE) {
/*
* XXX -- for now, treat partial writes as success.
*
* Now that we support write reallocation, it would be better
* to treat partial failure as real failure unless there are
* no non-degraded top-level vdevs left, and not update DTLs
* if we intend to reallocate.
*/
/* XXPOLICY */
if (good_copies != mm->mm_children) {
/*
* Always require at least one good copy.
*
* For ditto blocks (io_vd == NULL), require
* all copies to be good.
*
* XXX -- for replacing vdevs, there's no great answer.
* If the old device is really dead, we may not even
* be able to access it -- so we only want to
* require good writes to the new device. But if
* the new device turns out to be flaky, we want
* to be able to detach it -- which requires all
* writes to the old device to have succeeded.
*/
if (good_copies == 0 || zio->io_vd == NULL)
zio->io_error = vdev_mirror_worst_error(mm);
}
return;
} else if (zio->io_type == ZIO_TYPE_FREE) {
return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
/*
* If we don't have a good copy yet, keep trying other children.
*/
/* XXPOLICY */
if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
ASSERT(c >= 0 && c < mm->mm_children);
mc = &mm->mm_child[c];
zio_vdev_io_redone(zio);
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
ZIO_TYPE_READ, zio->io_priority, 0,
vdev_mirror_child_done, mc));
return;
}
/* XXPOLICY */
if (good_copies == 0) {
zio->io_error = vdev_mirror_worst_error(mm);
ASSERT(zio->io_error != 0);
}
if (good_copies && spa_writeable(zio->io_spa) &&
(unexpected_errors ||
(zio->io_flags & ZIO_FLAG_RESILVER) ||
((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
/*
* Use the good data we have in hand to repair damaged children.
*/
for (c = 0; c < mm->mm_children; c++) {
/*
* Don't rewrite known good children.
* Not only is it unnecessary, it could
* actually be harmful: if the system lost
* power while rewriting the only good copy,
* there would be no good copies left!
*/
mc = &mm->mm_child[c];
if (mc->mc_error == 0) {
if (mc->mc_tried)
continue;
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1))
continue;
mc->mc_error = SET_ERROR(ESTALE);
}
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
zio->io_abd, zio->io_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
}
}
static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
if (faulted == vd->vdev_children)
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_NO_REPLICAS);
else if (degraded + faulted != 0)
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
vdev_ops_t vdev_mirror_ops = {
vdev_mirror_open,
vdev_mirror_close,
vdev_default_asize,
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
NULL,
NULL,
+ NULL,
VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_replacing_ops = {
vdev_mirror_open,
vdev_mirror_close,
vdev_default_asize,
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
NULL,
NULL,
+ NULL,
VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_spare_ops = {
vdev_mirror_open,
vdev_mirror_close,
vdev_default_asize,
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
NULL,
NULL,
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c (revision 332525)
@@ -1,107 +1,109 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
/*
* The 'missing' vdev is a special vdev type used only during import. It
* signifies a placeholder in the root vdev for some vdev that we know is
* missing. We pass it down to the kernel to allow the rest of the
* configuration to parsed and an attempt made to open all available devices.
* Because its GUID is always 0, we know that the guid sum will mismatch and we
* won't be able to open the pool anyway.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
/* ARGSUSED */
static int
vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
/*
* Really this should just fail. But then the root vdev will be in the
* faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
* VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
* will fail the GUID sum check before ever trying to open the pool.
*/
*psize = 0;
*max_psize = 0;
*logical_ashift = 0;
*physical_ashift = 0;
return (0);
}
/* ARGSUSED */
static void
vdev_missing_close(vdev_t *vd)
{
}
/* ARGSUSED */
static void
vdev_missing_io_start(zio_t *zio)
{
zio->io_error = SET_ERROR(ENOTSUP);
zio_execute(zio);
}
/* ARGSUSED */
static void
vdev_missing_io_done(zio_t *zio)
{
}
vdev_ops_t vdev_missing_ops = {
vdev_missing_open,
vdev_missing_close,
vdev_default_asize,
vdev_missing_io_start,
vdev_missing_io_done,
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
vdev_ops_t vdev_hole_ops = {
vdev_missing_open,
vdev_missing_close,
vdev_default_asize,
vdev_missing_io_start,
vdev_missing_io_done,
+ NULL,
NULL,
NULL,
NULL,
VDEV_TYPE_HOLE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 332525)
@@ -1,952 +1,962 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/dsl_pool.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
/*
* ZFS I/O Scheduler
* ---------------
*
* ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
* I/O scheduler determines when and in what order those operations are
* issued. The I/O scheduler divides operations into six I/O classes
* prioritized in the following order: sync read, sync write, async read,
* async write, scrub/resilver and trim. Each queue defines the minimum and
* maximum number of concurrent operations that may be issued to the device.
* In addition, the device has an aggregate maximum. Note that the sum of the
* per-queue minimums must not exceed the aggregate maximum, and if the
* aggregate maximum is equal to or greater than the sum of the per-queue
* maximums, the per-queue minimum has no effect.
*
* For many physical devices, throughput increases with the number of
* concurrent operations, but latency typically suffers. Further, physical
* devices typically have a limit at which more concurrent operations have no
* effect on throughput or can actually cause it to decrease.
*
* The scheduler selects the next operation to issue by first looking for an
* I/O class whose minimum has not been satisfied. Once all are satisfied and
* the aggregate maximum has not been hit, the scheduler looks for classes
* whose maximum has not been satisfied. Iteration through the I/O classes is
* done in the order specified above. No further operations are issued if the
* aggregate maximum number of concurrent operations has been hit or if there
* are no operations queued for an I/O class that has not hit its maximum.
* Every time an I/O is queued or an operation completes, the I/O scheduler
* looks for new operations to issue.
*
* All I/O classes have a fixed maximum number of outstanding operations
* except for the async write class. Asynchronous writes represent the data
* that is committed to stable storage during the syncing stage for
* transaction groups (see txg.c). Transaction groups enter the syncing state
* periodically so the number of queued async writes will quickly burst up and
* then bleed down to zero. Rather than servicing them as quickly as possible,
* the I/O scheduler changes the maximum number of active async write I/Os
* according to the amount of dirty data in the pool (see dsl_pool.c). Since
* both throughput and latency typically increase with the number of
* concurrent operations issued to physical devices, reducing the burstiness
* in the number of concurrent operations also stabilizes the response time of
* operations from other -- and in particular synchronous -- queues. In broad
* strokes, the I/O scheduler will issue more concurrent operations from the
* async write queue as there's more dirty data in the pool.
*
* Async Writes
*
* The number of concurrent operations issued for the async write I/O class
* follows a piece-wise linear function defined by a few adjustable points.
*
* | o---------| <-- zfs_vdev_async_write_max_active
* ^ | /^ |
* | | / | |
* active | / | |
* I/O | / | |
* count | / | |
* | / | |
* |------------o | | <-- zfs_vdev_async_write_min_active
* 0|____________^______|_________|
* 0% | | 100% of zfs_dirty_data_max
* | |
* | `-- zfs_vdev_async_write_active_max_dirty_percent
* `--------- zfs_vdev_async_write_active_min_dirty_percent
*
* Until the amount of dirty data exceeds a minimum percentage of the dirty
* data allowed in the pool, the I/O scheduler will limit the number of
* concurrent operations to the minimum. As that threshold is crossed, the
* number of concurrent operations issued increases linearly to the maximum at
* the specified maximum percentage of the dirty data allowed in the pool.
*
* Ideally, the amount of dirty data on a busy pool will stay in the sloped
* part of the function between zfs_vdev_async_write_active_min_dirty_percent
* and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
* maximum percentage, this indicates that the rate of incoming data is
* greater than the rate that the backend storage can handle. In this case, we
* must further throttle incoming writes (see dmu_tx_delay() for details).
*/
/*
* The maximum number of I/Os active to each device. Ideally, this will be >=
* the sum of each queue's max_active. It must be at least the sum of each
* queue's min_active.
*/
uint32_t zfs_vdev_max_active = 1000;
/*
* Per-queue limits on the number of I/Os active to each device. If the
* sum of the queue's max_active is < zfs_vdev_max_active, then the
* min_active comes into play. We will send min_active from each queue,
* and then select from queues in the order defined by zio_priority_t.
*
* In general, smaller max_active's will lead to lower latency of synchronous
* operations. Larger max_active's may lead to higher overall throughput,
* depending on underlying storage.
*
* The ratio of the queues' max_actives determines the balance of performance
* between reads, writes, and scrubs. E.g., increasing
* zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
* more quickly, but reads and writes to have higher latency and lower
* throughput.
*/
uint32_t zfs_vdev_sync_read_min_active = 10;
uint32_t zfs_vdev_sync_read_max_active = 10;
uint32_t zfs_vdev_sync_write_min_active = 10;
uint32_t zfs_vdev_sync_write_max_active = 10;
uint32_t zfs_vdev_async_read_min_active = 1;
uint32_t zfs_vdev_async_read_max_active = 3;
uint32_t zfs_vdev_async_write_min_active = 1;
uint32_t zfs_vdev_async_write_max_active = 10;
uint32_t zfs_vdev_scrub_min_active = 1;
uint32_t zfs_vdev_scrub_max_active = 2;
uint32_t zfs_vdev_trim_min_active = 1;
/*
* TRIM max active is large in comparison to the other values due to the fact
* that TRIM IOs are coalesced at the device layer. This value is set such
* that a typical SSD can process the queued IOs in a single request.
*/
uint32_t zfs_vdev_trim_max_active = 64;
+uint32_t zfs_vdev_removal_min_active = 1;
+uint32_t zfs_vdev_removal_max_active = 2;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
* dirty data, use zfs_vdev_async_write_min_active. When it has more than
* zfs_vdev_async_write_active_max_dirty_percent, use
* zfs_vdev_async_write_max_active. The value is linearly interpolated
* between min and max.
*/
int zfs_vdev_async_write_active_min_dirty_percent = 30;
int zfs_vdev_async_write_active_max_dirty_percent = 60;
/*
* To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
* For read I/Os, we also aggregate across small adjacency gaps; for writes
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
/*
* Define the queue depth percentage for each top-level. This percentage is
* used in conjunction with zfs_vdev_async_max_active to determine how many
* allocations a specific top-level vdev should handle. Once the queue depth
* reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100
* then allocator will stop allocating blocks on that top-level device.
* The default kernel setting is 1000% which will yield 100 allocations per
* device. For userland testing, the default setting is 300% which equates
* to 30 allocations per device.
*/
#ifdef _KERNEL
int zfs_vdev_queue_depth_pct = 1000;
#else
int zfs_vdev_queue_depth_pct = 300;
#endif
#ifdef __FreeBSD__
#ifdef _KERNEL
SYSCTL_DECL(_vfs_zfs_vdev);
static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
sysctl_zfs_async_write_active_min_dirty_percent, "I",
"Percentage of async write dirty data below which "
"async_write_min_active is used.");
static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
sysctl_zfs_async_write_active_max_dirty_percent, "I",
"Percentage of async write dirty data above which "
"async_write_max_active is used.");
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
&zfs_vdev_max_active, 0,
"The maximum number of I/Os of all types active for each device.");
#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
&zfs_vdev_ ## name ## _min_active, 0, \
"Initial number of I/O requests of type " #name \
" active for each device");
#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\
&zfs_vdev_ ## name ## _max_active, 0, \
"Maximum number of I/O requests of type " #name \
" active for each device");
ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
ZFS_VDEV_QUEUE_KNOB_MIN(trim);
ZFS_VDEV_QUEUE_KNOB_MAX(trim);
#undef ZFS_VDEV_QUEUE_KNOB
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
&zfs_vdev_aggregation_limit, 0,
"I/O requests are aggregated up to this size");
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
&zfs_vdev_read_gap_limit, 0,
"Acceptable gap between two reads being aggregated");
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
&zfs_vdev_write_gap_limit, 0,
"Acceptable gap between two writes being aggregated");
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
&zfs_vdev_queue_depth_pct, 0,
"Queue depth percentage for each top-level");
static int
sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
{
int val, err;
val = zfs_vdev_async_write_active_min_dirty_percent;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val < 0 || val > 100 ||
val >= zfs_vdev_async_write_active_max_dirty_percent)
return (EINVAL);
zfs_vdev_async_write_active_min_dirty_percent = val;
return (0);
}
static int
sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
{
int val, err;
val = zfs_vdev_async_write_active_max_dirty_percent;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
if (val < 0 || val > 100 ||
val <= zfs_vdev_async_write_active_min_dirty_percent)
return (EINVAL);
zfs_vdev_async_write_active_max_dirty_percent = val;
return (0);
}
#endif
#endif
int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;
if (z1->io_offset < z2->io_offset)
return (-1);
if (z1->io_offset > z2->io_offset)
return (1);
if (z1 < z2)
return (-1);
if (z1 > z2)
return (1);
return (0);
}
static inline avl_tree_t *
vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
{
return (&vq->vq_class[p].vqc_queued_tree);
}
static inline avl_tree_t *
vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
{
if (t == ZIO_TYPE_READ)
return (&vq->vq_read_offset_tree);
else if (t == ZIO_TYPE_WRITE)
return (&vq->vq_write_offset_tree);
else
return (NULL);
}
int
vdev_queue_timestamp_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;
if (z1->io_timestamp < z2->io_timestamp)
return (-1);
if (z1->io_timestamp > z2->io_timestamp)
return (1);
if (z1->io_offset < z2->io_offset)
return (-1);
if (z1->io_offset > z2->io_offset)
return (1);
if (z1 < z2)
return (-1);
if (z1 > z2)
return (1);
return (0);
}
void
vdev_queue_init(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
vq->vq_vdev = vd;
avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
int (*compfn) (const void *, const void *);
/*
* The synchronous i/o queues are dispatched in FIFO rather
* than LBA order. This provides more consistent latency for
* these i/os.
*/
if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
compfn = vdev_queue_timestamp_compare;
else
compfn = vdev_queue_offset_compare;
avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
vq->vq_lastoffset = 0;
}
void
vdev_queue_fini(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
avl_destroy(vdev_queue_class_tree(vq, p));
avl_destroy(&vq->vq_active_tree);
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
mutex_destroy(&vq->vq_lock);
}
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
avl_tree_t *qtt;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
qtt = vdev_queue_type_tree(vq, zio->io_type);
if (qtt)
avl_add(qtt, zio);
#ifdef illumos
mutex_enter(&spa->spa_iokstat_lock);
spa->spa_queue_stats[zio->io_priority].spa_queued++;
if (spa->spa_iokstat != NULL)
kstat_waitq_enter(spa->spa_iokstat->ks_data);
mutex_exit(&spa->spa_iokstat_lock);
#endif
}
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
avl_tree_t *qtt;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
qtt = vdev_queue_type_tree(vq, zio->io_type);
if (qtt)
avl_remove(qtt, zio);
#ifdef illumos
mutex_enter(&spa->spa_iokstat_lock);
ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
spa->spa_queue_stats[zio->io_priority].spa_queued--;
if (spa->spa_iokstat != NULL)
kstat_waitq_exit(spa->spa_iokstat->ks_data);
mutex_exit(&spa->spa_iokstat_lock);
#endif
}
static void
vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_class[zio->io_priority].vqc_active++;
avl_add(&vq->vq_active_tree, zio);
#ifdef illumos
mutex_enter(&spa->spa_iokstat_lock);
spa->spa_queue_stats[zio->io_priority].spa_active++;
if (spa->spa_iokstat != NULL)
kstat_runq_enter(spa->spa_iokstat->ks_data);
mutex_exit(&spa->spa_iokstat_lock);
#endif
}
static void
vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_class[zio->io_priority].vqc_active--;
avl_remove(&vq->vq_active_tree, zio);
#ifdef illumos
mutex_enter(&spa->spa_iokstat_lock);
ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
spa->spa_queue_stats[zio->io_priority].spa_active--;
if (spa->spa_iokstat != NULL) {
kstat_io_t *ksio = spa->spa_iokstat->ks_data;
kstat_runq_exit(spa->spa_iokstat->ks_data);
if (zio->io_type == ZIO_TYPE_READ) {
ksio->reads++;
ksio->nread += zio->io_size;
} else if (zio->io_type == ZIO_TYPE_WRITE) {
ksio->writes++;
ksio->nwritten += zio->io_size;
}
}
mutex_exit(&spa->spa_iokstat_lock);
#endif
}
static void
vdev_queue_agg_io_done(zio_t *aio)
{
if (aio->io_type == ZIO_TYPE_READ) {
zio_t *pio;
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
abd_copy_off(pio->io_abd, aio->io_abd,
0, pio->io_offset - aio->io_offset, pio->io_size);
}
}
abd_free(aio->io_abd);
}
static int
vdev_queue_class_min_active(zio_priority_t p)
{
switch (p) {
case ZIO_PRIORITY_SYNC_READ:
return (zfs_vdev_sync_read_min_active);
case ZIO_PRIORITY_SYNC_WRITE:
return (zfs_vdev_sync_write_min_active);
case ZIO_PRIORITY_ASYNC_READ:
return (zfs_vdev_async_read_min_active);
case ZIO_PRIORITY_ASYNC_WRITE:
return (zfs_vdev_async_write_min_active);
case ZIO_PRIORITY_SCRUB:
return (zfs_vdev_scrub_min_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_min_active);
+ case ZIO_PRIORITY_REMOVAL:
+ return (zfs_vdev_removal_min_active);
default:
panic("invalid priority %u", p);
return (0);
}
}
static __noinline int
vdev_queue_max_async_writes(spa_t *spa)
{
int writes;
uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
uint64_t min_bytes = zfs_dirty_data_max *
zfs_vdev_async_write_active_min_dirty_percent / 100;
uint64_t max_bytes = zfs_dirty_data_max *
zfs_vdev_async_write_active_max_dirty_percent / 100;
/*
* Sync tasks correspond to interactive user actions. To reduce the
* execution time of those actions we push data out as fast as possible.
*/
if (spa_has_pending_synctask(spa)) {
return (zfs_vdev_async_write_max_active);
}
if (dirty < min_bytes)
return (zfs_vdev_async_write_min_active);
if (dirty > max_bytes)
return (zfs_vdev_async_write_max_active);
/*
* linear interpolation:
* slope = (max_writes - min_writes) / (max_bytes - min_bytes)
* move right by min_bytes
* move up by min_writes
*/
writes = (dirty - min_bytes) *
(zfs_vdev_async_write_max_active -
zfs_vdev_async_write_min_active) /
(max_bytes - min_bytes) +
zfs_vdev_async_write_min_active;
ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
return (writes);
}
static int
vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
{
switch (p) {
case ZIO_PRIORITY_SYNC_READ:
return (zfs_vdev_sync_read_max_active);
case ZIO_PRIORITY_SYNC_WRITE:
return (zfs_vdev_sync_write_max_active);
case ZIO_PRIORITY_ASYNC_READ:
return (zfs_vdev_async_read_max_active);
case ZIO_PRIORITY_ASYNC_WRITE:
return (vdev_queue_max_async_writes(spa));
case ZIO_PRIORITY_SCRUB:
return (zfs_vdev_scrub_max_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_max_active);
+ case ZIO_PRIORITY_REMOVAL:
+ return (zfs_vdev_removal_max_active);
default:
panic("invalid priority %u", p);
return (0);
}
}
/*
* Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if
* there is no eligible class.
*/
static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
spa_t *spa = vq->vq_vdev->vdev_spa;
zio_priority_t p;
ASSERT(MUTEX_HELD(&vq->vq_lock));
if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
return (ZIO_PRIORITY_NUM_QUEUEABLE);
/* find a queue that has not reached its minimum # outstanding i/os */
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
vdev_queue_class_min_active(p))
return (p);
}
/*
* If we haven't found a queue, look for one that hasn't reached its
* maximum # outstanding i/os.
*/
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
vdev_queue_class_max_active(spa, p))
return (p);
}
/* No eligible queued i/os */
return (ZIO_PRIORITY_NUM_QUEUEABLE);
}
/*
* Compute the range spanned by two i/os, which is the endpoint of the last
* (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
* Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
* thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
*/
#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
static zio_t *
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
zio_t *first, *last, *aio, *dio, *mandatory, *nio;
uint64_t maxgap = 0;
uint64_t size;
boolean_t stretch;
avl_tree_t *t;
enum zio_flag flags;
ASSERT(MUTEX_HELD(&vq->vq_lock));
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
return (NULL);
first = last = zio;
if (zio->io_type == ZIO_TYPE_READ)
maxgap = zfs_vdev_read_gap_limit;
/*
* We can aggregate I/Os that are sufficiently adjacent and of
* the same flavor, as expressed by the AGG_INHERIT flags.
* The latter requirement is necessary so that certain
* attributes of the I/O, such as whether it's a normal I/O
* or a scrub/resilver, can be preserved in the aggregate.
* We can include optional I/Os, but don't allow them
* to begin a range as they add no benefit in that situation.
*/
/*
* We keep track of the last non-optional I/O.
*/
mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
/*
* Walk backwards through sufficiently contiguous I/Os
* recording the last non-optional I/O.
*/
flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
t = vdev_queue_type_tree(vq, zio->io_type);
while (t != NULL && (dio = AVL_PREV(t, first)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
- IO_GAP(dio, first) <= maxgap) {
+ IO_GAP(dio, first) <= maxgap &&
+ dio->io_type == zio->io_type) {
first = dio;
if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
mandatory = first;
}
/*
* Skip any initial optional I/Os.
*/
while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
first = AVL_NEXT(t, first);
ASSERT(first != NULL);
}
/*
* Walk forward through sufficiently contiguous I/Os.
* The aggregation limit does not apply to optional i/os, so that
* we can issue contiguous writes even if they are larger than the
* aggregation limit.
*/
while ((dio = AVL_NEXT(t, last)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
(IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit ||
(dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
- IO_GAP(last, dio) <= maxgap) {
+ IO_GAP(last, dio) <= maxgap &&
+ dio->io_type == zio->io_type) {
last = dio;
if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
mandatory = last;
}
/*
* Now that we've established the range of the I/O aggregation
* we must decide what to do with trailing optional I/Os.
* For reads, there's nothing to do. While we are unable to
* aggregate further, it's possible that a trailing optional
* I/O would allow the underlying device to aggregate with
* subsequent I/Os. We must therefore determine if the next
* non-optional I/O is close enough to make aggregation
* worthwhile.
*/
stretch = B_FALSE;
if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
zio_t *nio = last;
while ((dio = AVL_NEXT(t, nio)) != NULL &&
IO_GAP(nio, dio) == 0 &&
IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
nio = dio;
if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
stretch = B_TRUE;
break;
}
}
}
if (stretch) {
/*
* We are going to include an optional io in our aggregated
* span, thus closing the write gap. Only mandatory i/os can
* start aggregated spans, so make sure that the next i/o
* after our span is mandatory.
*/
dio = AVL_NEXT(t, last);
dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
} else {
/* do not include the optional i/o */
while (last != mandatory && last != first) {
ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
last = AVL_PREV(t, last);
ASSERT(last != NULL);
}
}
if (first == last)
return (NULL);
size = IO_SPAN(first, last);
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
abd_alloc_for_io(size, B_TRUE), size, first->io_type,
zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp;
nio = first;
do {
dio = nio;
nio = AVL_NEXT(t, dio);
ASSERT3U(dio->io_type, ==, aio->io_type);
if (dio->io_flags & ZIO_FLAG_NODATA) {
ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
abd_zero_off(aio->io_abd,
dio->io_offset - aio->io_offset, dio->io_size);
} else if (dio->io_type == ZIO_TYPE_WRITE) {
abd_copy_off(aio->io_abd, dio->io_abd,
dio->io_offset - aio->io_offset, 0, dio->io_size);
}
zio_add_child(dio, aio);
vdev_queue_io_remove(vq, dio);
zio_vdev_io_bypass(dio);
zio_execute(dio);
} while (dio != last);
return (aio);
}
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq)
{
zio_t *zio, *aio;
zio_priority_t p;
avl_index_t idx;
avl_tree_t *tree;
zio_t search;
again:
ASSERT(MUTEX_HELD(&vq->vq_lock));
p = vdev_queue_class_to_issue(vq);
if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
/* No eligible queued i/os */
return (NULL);
}
/*
* For LBA-ordered queues (async / scrub), issue the i/o which follows
* the most recently issued i/o in LBA (offset) order.
*
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
*/
tree = vdev_queue_class_tree(vq, p);
search.io_timestamp = 0;
search.io_offset = vq->vq_last_offset + 1;
VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
zio = avl_nearest(tree, idx, AVL_AFTER);
if (zio == NULL)
zio = avl_first(tree);
ASSERT3U(zio->io_priority, ==, p);
aio = vdev_queue_aggregate(vq, zio);
if (aio != NULL)
zio = aio;
else
vdev_queue_io_remove(vq, zio);
/*
* If the I/O is or was optional and therefore has no data, we need to
* simply discard it. We need to drop the vdev queue's lock to avoid a
* deadlock that we could encounter since this I/O will complete
* immediately.
*/
if (zio->io_flags & ZIO_FLAG_NODATA) {
mutex_exit(&vq->vq_lock);
zio_vdev_io_bypass(zio);
zio_execute(zio);
mutex_enter(&vq->vq_lock);
goto again;
}
vdev_queue_pending_add(vq, zio);
vq->vq_last_offset = zio->io_offset;
return (zio);
}
zio_t *
vdev_queue_io(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
return (zio);
/*
* Children i/os inherent their parent's priority, which might
* not match the child's i/o type. Fix it up here.
*/
if (zio->io_type == ZIO_TYPE_READ) {
if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
- zio->io_priority != ZIO_PRIORITY_SCRUB)
+ zio->io_priority != ZIO_PRIORITY_SCRUB &&
+ zio->io_priority != ZIO_PRIORITY_REMOVAL)
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
} else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
- zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
+ zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
+ zio->io_priority != ZIO_PRIORITY_REMOVAL)
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
} else {
ASSERT(zio->io_type == ZIO_TYPE_FREE);
zio->io_priority = ZIO_PRIORITY_TRIM;
}
zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
mutex_enter(&vq->vq_lock);
zio->io_timestamp = gethrtime();
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq);
mutex_exit(&vq->vq_lock);
if (nio == NULL)
return (NULL);
if (nio->io_done == vdev_queue_agg_io_done) {
zio_nowait(nio);
return (NULL);
}
return (nio);
}
void
vdev_queue_io_done(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);
vq->vq_io_complete_ts = gethrtime();
while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
mutex_exit(&vq->vq_lock);
if (nio->io_done == vdev_queue_agg_io_done) {
zio_nowait(nio);
} else {
zio_vdev_io_reissue(nio);
zio_execute(nio);
}
mutex_enter(&vq->vq_lock);
}
mutex_exit(&vq->vq_lock);
}
/*
* As these three methods are only used for load calculations we're not concerned
* if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex
* use here, instead we prefer to keep it lock free for performance.
*/
int
vdev_queue_length(vdev_t *vd)
{
return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
}
uint64_t
vdev_queue_lastoffset(vdev_t *vd)
{
return (vd->vdev_queue.vq_lastoffset);
}
void
vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
{
vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c (revision 332525)
@@ -1,2598 +1,2599 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#ifdef illumos
#include <sys/vdev_disk.h>
#endif
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/bio.h>
/*
* Virtual device vector for RAID-Z.
*
* This vdev supports single, double, and triple parity. For single parity,
* we use a simple XOR of all the data columns. For double or triple parity,
* we use a special case of Reed-Solomon coding. This extends the
* technique described in "The mathematics of RAID-6" by H. Peter Anvin by
* drawing on the system described in "A Tutorial on Reed-Solomon Coding for
* Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
* former is also based. The latter is designed to provide higher performance
* for writes.
*
* Note that the Plank paper claimed to support arbitrary N+M, but was then
* amended six years later identifying a critical flaw that invalidates its
* claims. Nevertheless, the technique can be adapted to work for up to
* triple parity. For additional parity, the amendment "Note: Correction to
* the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
* is viable, but the additional complexity means that write performance will
* suffer.
*
* All of the methods above operate on a Galois field, defined over the
* integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
* can be expressed with a single byte. Briefly, the operations on the
* field are defined as follows:
*
* o addition (+) is represented by a bitwise XOR
* o subtraction (-) is therefore identical to addition: A + B = A - B
* o multiplication of A by 2 is defined by the following bitwise expression:
*
* (A * 2)_7 = A_6
* (A * 2)_6 = A_5
* (A * 2)_5 = A_4
* (A * 2)_4 = A_3 + A_7
* (A * 2)_3 = A_2 + A_7
* (A * 2)_2 = A_1 + A_7
* (A * 2)_1 = A_0
* (A * 2)_0 = A_7
*
* In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
* As an aside, this multiplication is derived from the error correcting
* primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
*
* Observe that any number in the field (except for 0) can be expressed as a
* power of 2 -- a generator for the field. We store a table of the powers of
* 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
* be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
* than field addition). The inverse of a field element A (A^-1) is therefore
* A ^ (255 - 1) = A^254.
*
* The up-to-three parity columns, P, Q, R over several data columns,
* D_0, ... D_n-1, can be expressed by field operations:
*
* P = D_0 + D_1 + ... + D_n-2 + D_n-1
* Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
* = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
* R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
* = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
*
* We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
* XOR operation, and 2 and 4 can be computed quickly and generate linearly-
* independent coefficients. (There are no additional coefficients that have
* this property which is why the uncorrected Plank method breaks down.)
*
* See the reconstruction code below for how P, Q and R can used individually
* or in concert to recover missing data columns.
*/
typedef struct raidz_col {
uint64_t rc_devidx; /* child device index for I/O */
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
abd_t *rc_abd; /* I/O data */
void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
} raidz_col_t;
typedef struct raidz_map {
uint64_t rm_cols; /* Regular column count */
uint64_t rm_scols; /* Count including skipped columns */
uint64_t rm_bigcols; /* Number of oversized columns */
uint64_t rm_asize; /* Actual total I/O size */
uint64_t rm_missingdata; /* Count of missing data devices */
uint64_t rm_missingparity; /* Count of missing parity devices */
uint64_t rm_firstdatacol; /* First data column/parity count */
uint64_t rm_nskip; /* Skipped sectors for padding */
uint64_t rm_skipstart; /* Column index of padding start */
abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
uintptr_t rm_reports; /* # of referencing checksum reports */
uint8_t rm_freed; /* map no longer has referencing ZIO */
uint8_t rm_ecksuminjected; /* checksum error was injected */
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
} raidz_map_t;
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
#define VDEV_RAIDZ_R 2
#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
* We provide a mechanism to perform the field multiplication operation on a
* 64-bit value all at once rather than a byte at a time. This works by
* creating a mask from the top bit in each byte and using that to
* conditionally apply the XOR of 0x1d.
*/
#define VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
(mask) = (x) & 0x8080808080808080ULL; \
(mask) = ((mask) << 1) - ((mask) >> 7); \
(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
((mask) & 0x1d1d1d1d1d1d1d1d); \
}
#define VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
VDEV_RAIDZ_64MUL_2((x), mask); \
VDEV_RAIDZ_64MUL_2((x), mask); \
}
#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
/*
* Force reconstruction to use the general purpose method.
*/
int vdev_raidz_default_to_general;
/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
static void vdev_raidz_generate_parity(raidz_map_t *rm);
/*
* Multiply a given number by 2 raised to the given power.
*/
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
if (a == 0)
return (0);
ASSERT(exp >= 0);
ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
exp += vdev_raidz_log2[a];
if (exp > 255)
exp -= 255;
return (vdev_raidz_pow2[exp]);
}
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
int c;
size_t size;
for (c = 0; c < rm->rm_firstdatacol; c++) {
if (rm->rm_col[c].rc_abd != NULL)
abd_free(rm->rm_col[c].rc_abd);
if (rm->rm_col[c].rc_gdata != NULL)
zio_buf_free(rm->rm_col[c].rc_gdata,
rm->rm_col[c].rc_size);
}
size = 0;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
if (rm->rm_col[c].rc_abd != NULL)
abd_put(rm->rm_col[c].rc_abd);
size += rm->rm_col[c].rc_size;
}
if (rm->rm_abd_copy != NULL)
abd_free(rm->rm_abd_copy);
kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
raidz_map_t *rm = zio->io_vsd;
ASSERT0(rm->rm_freed);
rm->rm_freed = 1;
if (rm->rm_reports == 0)
vdev_raidz_map_free(rm);
}
/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
raidz_map_t *rm = arg;
ASSERT3U(rm->rm_reports, >, 0);
if (--rm->rm_reports == 0 && rm->rm_freed != 0)
vdev_raidz_map_free(rm);
}
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
raidz_map_t *rm = zcr->zcr_cbdata;
size_t c = zcr->zcr_cbinfo;
size_t x;
const char *good = NULL;
char *bad;
if (good_data == NULL) {
zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
return;
}
if (c < rm->rm_firstdatacol) {
/*
* The first time through, calculate the parity blocks for
* the good data (this relies on the fact that the good
* data never changes for a given logical ZIO)
*/
if (rm->rm_col[0].rc_gdata == NULL) {
abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
char *buf;
int offset;
/*
* Set up the rm_col[]s to generate the parity for
* good_data, first saving the parity bufs and
* replacing them with buffers to hold the result.
*/
for (x = 0; x < rm->rm_firstdatacol; x++) {
bad_parity[x] = rm->rm_col[x].rc_abd;
rm->rm_col[x].rc_gdata =
zio_buf_alloc(rm->rm_col[x].rc_size);
rm->rm_col[x].rc_abd =
abd_get_from_buf(rm->rm_col[x].rc_gdata,
rm->rm_col[x].rc_size);
}
/* fill in the data columns from good_data */
buf = (char *)good_data;
for (; x < rm->rm_cols; x++) {
abd_put(rm->rm_col[x].rc_abd);
rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
rm->rm_col[x].rc_size);
buf += rm->rm_col[x].rc_size;
}
/*
* Construct the parity from the good data.
*/
vdev_raidz_generate_parity(rm);
/* restore everything back to its original state */
for (x = 0; x < rm->rm_firstdatacol; x++) {
abd_put(rm->rm_col[x].rc_abd);
rm->rm_col[x].rc_abd = bad_parity[x];
}
offset = 0;
for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
abd_put(rm->rm_col[x].rc_abd);
rm->rm_col[x].rc_abd = abd_get_offset(
rm->rm_abd_copy, offset);
offset += rm->rm_col[x].rc_size;
}
}
ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
good = rm->rm_col[c].rc_gdata;
} else {
/* adjust good_data to point at the start of our column */
good = good_data;
for (x = rm->rm_firstdatacol; x < c; x++)
good += rm->rm_col[x].rc_size;
}
bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
/* we drop the ereport if it ends up that the data was good */
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}
/*
* Invoked indirectly by zfs_ereport_start_checksum(), called
* below when our read operation fails completely. The main point
* is to keep a copy of everything we read from disk, so that at
* vdev_raidz_cksum_finish() time we can compare it with the good data.
*/
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
size_t c = (size_t)(uintptr_t)arg;
size_t offset;
raidz_map_t *rm = zio->io_vsd;
size_t size;
/* set up the report and bump the refcount */
zcr->zcr_cbdata = rm;
zcr->zcr_cbinfo = c;
zcr->zcr_finish = vdev_raidz_cksum_finish;
zcr->zcr_free = vdev_raidz_cksum_free;
rm->rm_reports++;
ASSERT3U(rm->rm_reports, >, 0);
if (rm->rm_abd_copy != NULL)
return;
/*
* It's the first time we're called for this raidz_map_t, so we need
* to copy the data aside; there's no guarantee that our zio's buffer
* won't be re-used for something else.
*
* Our parity data is already in separate buffers, so there's no need
* to copy them.
*/
size = 0;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
size += rm->rm_col[c].rc_size;
rm->rm_abd_copy =
abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
raidz_col_t *col = &rm->rm_col[c];
abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
abd_copy(tmp, col->rc_abd, col->rc_size);
abd_put(col->rc_abd);
col->rc_abd = tmp;
offset += col->rc_size;
}
ASSERT3U(offset, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
vdev_raidz_map_free_vsd,
vdev_raidz_cksum_report
};
/*
* Divides the IO evenly across all child vdevs; usually, dcols is
* the number of children in the target vdev.
*/
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
raidz_map_t *rm;
/* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = offset >> unit_shift;
/* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = size >> unit_shift;
/* The first column for this stripe. */
uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
uint64_t off = 0;
/*
* "Quotient": The number of data sectors for this stripe on all but
* the "big column" child vdevs that also contain "remainder" data.
*/
q = s / (dcols - nparity);
/*
* "Remainder": The number of partial stripe data sectors in this I/O.
* This will add a sector to some, but not all, child vdevs.
*/
r = s - q * (dcols - nparity);
/* The number of "big columns" - those which contain remainder data. */
bc = (r == 0 ? 0 : r + nparity);
/*
* The total number of data and parity sectors associated with
* this I/O.
*/
tot = s + nparity * (q + (r == 0 ? 0 : 1));
/* acols: The columns that will be accessed. */
/* scols: The columns that will be accessed or skipped. */
if (q == 0) {
/* Our I/O request doesn't span all child vdevs. */
acols = bc;
scols = MIN(dcols, roundup(bc, nparity + 1));
} else {
acols = dcols;
scols = dcols;
}
ASSERT3U(acols, <=, scols);
rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
rm->rm_cols = acols;
rm->rm_scols = scols;
rm->rm_bigcols = bc;
rm->rm_skipstart = bc;
rm->rm_missingdata = 0;
rm->rm_missingparity = 0;
rm->rm_firstdatacol = nparity;
rm->rm_abd_copy = NULL;
rm->rm_reports = 0;
rm->rm_freed = 0;
rm->rm_ecksuminjected = 0;
asize = 0;
for (c = 0; c < scols; c++) {
col = f + c;
coff = o;
if (col >= dcols) {
col -= dcols;
coff += 1ULL << unit_shift;
}
rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff;
rm->rm_col[c].rc_abd = NULL;
rm->rm_col[c].rc_gdata = NULL;
rm->rm_col[c].rc_error = 0;
rm->rm_col[c].rc_tried = 0;
rm->rm_col[c].rc_skipped = 0;
if (c >= acols)
rm->rm_col[c].rc_size = 0;
else if (c < bc)
rm->rm_col[c].rc_size = (q + 1) << unit_shift;
else
rm->rm_col[c].rc_size = q << unit_shift;
asize += rm->rm_col[c].rc_size;
}
ASSERT3U(asize, ==, tot << unit_shift);
rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
ASSERT3U(rm->rm_nskip, <=, nparity);
if (!dofree) {
for (c = 0; c < rm->rm_firstdatacol; c++) {
rm->rm_col[c].rc_abd =
abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
}
rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
off = rm->rm_col[c].rc_size;
for (c = c + 1; c < acols; c++) {
rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
off += rm->rm_col[c].rc_size;
}
}
/*
* If all data stored spans all columns, there's a danger that parity
* will always be on the same device and, since parity isn't read
* during normal operation, that that device's I/O bandwidth won't be
* used effectively. We therefore switch the parity every 1MB.
*
* ... at least that was, ostensibly, the theory. As a practical
* matter unless we juggle the parity between all devices evenly, we
* won't see any benefit. Further, occasional writes that aren't a
* multiple of the LCM of the number of children and the minimum
* stripe width are sufficient to avoid pessimal behavior.
* Unfortunately, this decision created an implicit on-disk format
* requirement that we need to support for all eternity, but only
* for single-parity RAID-Z.
*
* If we intend to skip a sector in the zeroth column for padding
* we must make sure to note this swap. We will never intend to
* skip the first column since at least one data and one parity
* column must appear in each row.
*/
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
devidx = rm->rm_col[0].rc_devidx;
o = rm->rm_col[0].rc_offset;
rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
rm->rm_col[1].rc_devidx = devidx;
rm->rm_col[1].rc_offset = o;
if (rm->rm_skipstart == 0)
rm->rm_skipstart = 1;
}
return (rm);
}
struct pqr_struct {
uint64_t *p;
uint64_t *q;
uint64_t *r;
};
static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
struct pqr_struct *pqr = private;
const uint64_t *src = buf;
int i, cnt = size / sizeof (src[0]);
ASSERT(pqr->p && !pqr->q && !pqr->r);
for (i = 0; i < cnt; i++, src++, pqr->p++)
*pqr->p ^= *src;
return (0);
}
static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
struct pqr_struct *pqr = private;
const uint64_t *src = buf;
uint64_t mask;
int i, cnt = size / sizeof (src[0]);
ASSERT(pqr->p && pqr->q && !pqr->r);
for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
*pqr->p ^= *src;
VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
*pqr->q ^= *src;
}
return (0);
}
static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
struct pqr_struct *pqr = private;
const uint64_t *src = buf;
uint64_t mask;
int i, cnt = size / sizeof (src[0]);
ASSERT(pqr->p && pqr->q && pqr->r);
for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
*pqr->p ^= *src;
VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
*pqr->q ^= *src;
VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
*pqr->r ^= *src;
}
return (0);
}
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
uint64_t *p;
int c;
abd_t *src;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
src = rm->rm_col[c].rc_abd;
p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
if (c == rm->rm_firstdatacol) {
abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, NULL, NULL };
(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
vdev_raidz_p_func, &pqr);
}
}
}
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
uint64_t *p, *q, pcnt, ccnt, mask, i;
int c;
abd_t *src;
pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
src = rm->rm_col[c].rc_abd;
p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
if (c == rm->rm_firstdatacol) {
abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
(void) memcpy(q, p, rm->rm_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, q, NULL };
(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
vdev_raidz_pq_func, &pqr);
}
if (c == rm->rm_firstdatacol) {
for (i = ccnt; i < pcnt; i++) {
p[i] = 0;
q[i] = 0;
}
} else {
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
for (i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(q[i], mask);
}
}
}
}
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
int c;
abd_t *src;
pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_R].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
src = rm->rm_col[c].rc_abd;
p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
if (c == rm->rm_firstdatacol) {
abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
(void) memcpy(q, p, rm->rm_col[c].rc_size);
(void) memcpy(r, p, rm->rm_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, q, r };
(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
vdev_raidz_pqr_func, &pqr);
}
if (c == rm->rm_firstdatacol) {
for (i = ccnt; i < pcnt; i++) {
p[i] = 0;
q[i] = 0;
r[i] = 0;
}
} else {
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
for (i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(q[i], mask);
VDEV_RAIDZ_64MUL_4(r[i], mask);
}
}
}
}
/*
* Generate RAID parity in the first virtual columns according to the number of
* parity columns available.
*/
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
switch (rm->rm_firstdatacol) {
case 1:
vdev_raidz_generate_parity_p(rm);
break;
case 2:
vdev_raidz_generate_parity_pq(rm);
break;
case 3:
vdev_raidz_generate_parity_pqr(rm);
break;
default:
cmn_err(CE_PANIC, "invalid RAID-Z configuration");
}
}
/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
uint64_t *dst = dbuf;
uint64_t *src = sbuf;
int cnt = size / sizeof (src[0]);
for (int i = 0; i < cnt; i++) {
dst[i] ^= src[i];
}
return (0);
}
/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
void *private)
{
uint64_t *dst = dbuf;
uint64_t *src = sbuf;
uint64_t mask;
int cnt = size / sizeof (dst[0]);
for (int i = 0; i < cnt; i++, dst++, src++) {
VDEV_RAIDZ_64MUL_2(*dst, mask);
*dst ^= *src;
}
return (0);
}
/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
uint64_t *dst = buf;
uint64_t mask;
int cnt = size / sizeof (dst[0]);
for (int i = 0; i < cnt; i++, dst++) {
/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
VDEV_RAIDZ_64MUL_2(*dst, mask);
}
return (0);
}
struct reconst_q_struct {
uint64_t *q;
int exp;
};
static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
struct reconst_q_struct *rq = private;
uint64_t *dst = buf;
int cnt = size / sizeof (dst[0]);
for (int i = 0; i < cnt; i++, dst++, rq->q++) {
*dst ^= *rq->q;
int j;
uint8_t *b;
for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
*b = vdev_raidz_exp2(*b, rq->exp);
}
}
return (0);
}
struct reconst_pq_struct {
uint8_t *p;
uint8_t *q;
uint8_t *pxy;
uint8_t *qxy;
int aexp;
int bexp;
};
static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
struct reconst_pq_struct *rpq = private;
uint8_t *xd = xbuf;
uint8_t *yd = ybuf;
for (int i = 0; i < size;
i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
*yd = *rpq->p ^ *rpq->pxy ^ *xd;
}
return (0);
}
static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
struct reconst_pq_struct *rpq = private;
uint8_t *xd = xbuf;
for (int i = 0; i < size;
i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
/* same operation as vdev_raidz_reconst_pq_func() on xd */
*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
}
return (0);
}
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
int x = tgts[0];
int c;
abd_t *dst, *src;
ASSERT(ntgts == 1);
ASSERT(x >= rm->rm_firstdatacol);
ASSERT(x < rm->rm_cols);
ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
ASSERT(rm->rm_col[x].rc_size > 0);
src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
dst = rm->rm_col[x].rc_abd;
abd_copy(dst, src, rm->rm_col[x].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
uint64_t size = MIN(rm->rm_col[x].rc_size,
rm->rm_col[c].rc_size);
src = rm->rm_col[c].rc_abd;
dst = rm->rm_col[x].rc_abd;
if (c == x)
continue;
(void) abd_iterate_func2(dst, src, 0, 0, size,
vdev_raidz_reconst_p_func, NULL);
}
return (1 << VDEV_RAIDZ_P);
}
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
int x = tgts[0];
int c, exp;
abd_t *dst, *src;
ASSERT(ntgts == 1);
ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
rm->rm_col[c].rc_size);
src = rm->rm_col[c].rc_abd;
dst = rm->rm_col[x].rc_abd;
if (c == rm->rm_firstdatacol) {
abd_copy(dst, src, size);
if (rm->rm_col[x].rc_size > size)
abd_zero_off(dst, size,
rm->rm_col[x].rc_size - size);
} else {
ASSERT3U(size, <=, rm->rm_col[x].rc_size);
(void) abd_iterate_func2(dst, src, 0, 0, size,
vdev_raidz_reconst_q_pre_func, NULL);
(void) abd_iterate_func(dst,
size, rm->rm_col[x].rc_size - size,
vdev_raidz_reconst_q_pre_tail_func, NULL);
}
}
src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
dst = rm->rm_col[x].rc_abd;
exp = 255 - (rm->rm_cols - 1 - x);
struct reconst_q_struct rq = { abd_to_buf(src), exp };
(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
vdev_raidz_reconst_q_post_func, &rq);
return (1 << VDEV_RAIDZ_Q);
}
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
abd_t *pdata, *qdata;
uint64_t xsize, ysize;
int x = tgts[0];
int y = tgts[1];
abd_t *xd, *yd;
ASSERT(ntgts == 2);
ASSERT(x < y);
ASSERT(x >= rm->rm_firstdatacol);
ASSERT(y < rm->rm_cols);
ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
/*
* Move the parity data aside -- we're going to compute parity as
* though columns x and y were full of zeros -- Pxy and Qxy. We want to
* reuse the parity generation mechanism without trashing the actual
* parity so we make those columns appear to be full of zeros by
* setting their lengths to zero.
*/
pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
xsize = rm->rm_col[x].rc_size;
ysize = rm->rm_col[y].rc_size;
rm->rm_col[VDEV_RAIDZ_P].rc_abd =
abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
rm->rm_col[x].rc_size = 0;
rm->rm_col[y].rc_size = 0;
vdev_raidz_generate_parity_pq(rm);
rm->rm_col[x].rc_size = xsize;
rm->rm_col[y].rc_size = ysize;
p = abd_to_buf(pdata);
q = abd_to_buf(qdata);
pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
xd = rm->rm_col[x].rc_abd;
yd = rm->rm_col[y].rc_abd;
/*
* We now have:
* Pxy = P + D_x + D_y
* Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
*
* We can then solve for D_x:
* D_x = A * (P + Pxy) + B * (Q + Qxy)
* where
* A = 2^(x - y) * (2^(x - y) + 1)^-1
* B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
*
* With D_x in hand, we can easily solve for D_y:
* D_y = P + Pxy + D_x
*/
a = vdev_raidz_pow2[255 + x - y];
b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
tmp = 255 - vdev_raidz_log2[a ^ 1];
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
ASSERT3U(xsize, >=, ysize);
struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
vdev_raidz_reconst_pq_func, &rpq);
(void) abd_iterate_func(xd, ysize, xsize - ysize,
vdev_raidz_reconst_pq_tail_func, &rpq);
abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
/*
* Restore the saved parity data.
*/
rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
/* BEGIN CSTYLED */
/*
* In the general case of reconstruction, we must solve the system of linear
* equations defined by the coeffecients used to generate parity as well as
* the contents of the data and parity disks. This can be expressed with
* vectors for the original data (D) and the actual data (d) and parity (p)
* and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
*
* __ __ __ __
* | | __ __ | p_0 |
* | V | | D_0 | | p_m-1 |
* | | x | : | = | d_0 |
* | I | | D_n-1 | | : |
* | | ~~ ~~ | d_n-1 |
* ~~ ~~ ~~ ~~
*
* I is simply a square identity matrix of size n, and V is a vandermonde
* matrix defined by the coeffecients we chose for the various parity columns
* (1, 2, 4). Note that these values were chosen both for simplicity, speedy
* computation as well as linear separability.
*
* __ __ __ __
* | 1 .. 1 1 1 | | p_0 |
* | 2^n-1 .. 4 2 1 | __ __ | : |
* | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
* | 1 .. 0 0 0 | | D_1 | | d_0 |
* | 0 .. 0 0 0 | x | D_2 | = | d_1 |
* | : : : : | | : | | d_2 |
* | 0 .. 1 0 0 | | D_n-1 | | : |
* | 0 .. 0 1 0 | ~~ ~~ | : |
* | 0 .. 0 0 1 | | d_n-1 |
* ~~ ~~ ~~ ~~
*
* Note that I, V, d, and p are known. To compute D, we must invert the
* matrix and use the known data and parity values to reconstruct the unknown
* data values. We begin by removing the rows in V|I and d|p that correspond
* to failed or missing columns; we then make V|I square (n x n) and d|p
* sized n by removing rows corresponding to unused parity from the bottom up
* to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
* using Gauss-Jordan elimination. In the example below we use m=3 parity
* columns, n=8 data columns, with errors in d_1, d_2, and p_1:
* __ __
* | 1 1 1 1 1 1 1 1 |
* | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
* | 19 205 116 29 64 16 4 1 | / /
* | 1 0 0 0 0 0 0 0 | / /
* | 0 1 0 0 0 0 0 0 | <--' /
* (V|I) = | 0 0 1 0 0 0 0 0 | <---'
* | 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 |
* | 0 0 0 0 0 0 0 1 |
* ~~ ~~
* __ __
* | 1 1 1 1 1 1 1 1 |
* | 19 205 116 29 64 16 4 1 |
* | 1 0 0 0 0 0 0 0 |
* (V|I)' = | 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 |
* | 0 0 0 0 0 0 0 1 |
* ~~ ~~
*
* Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
* have carefully chosen the seed values 1, 2, and 4 to ensure that this
* matrix is not singular.
* __ __
* | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
* | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
* ~~ ~~
* __ __
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
* | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
* | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
* ~~ ~~
* __ __
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
* | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
* | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
* ~~ ~~
* __ __
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
* | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
* | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
* ~~ ~~
* __ __
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
* | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
* | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
* ~~ ~~
* __ __
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
* | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
* | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
* ~~ ~~
* __ __
* | 0 0 1 0 0 0 0 0 |
* | 167 100 5 41 159 169 217 208 |
* | 166 100 4 40 158 168 216 209 |
* (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 |
* | 0 0 0 0 0 0 0 1 |
* ~~ ~~
*
* We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
* of the missing data.
*
* As is apparent from the example above, the only non-trivial rows in the
* inverse matrix correspond to the data disks that we're trying to
* reconstruct. Indeed, those are the only rows we need as the others would
* only be useful for reconstructing data known or assumed to be valid. For
* that reason, we only build the coefficients in the rows that correspond to
* targeted columns.
*/
/* END CSTYLED */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
uint8_t **rows)
{
int i, j;
int pow;
ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
/*
* Fill in the missing rows of interest.
*/
for (i = 0; i < nmap; i++) {
ASSERT3S(0, <=, map[i]);
ASSERT3S(map[i], <=, 2);
pow = map[i] * n;
if (pow > 255)
pow -= 255;
ASSERT(pow <= 255);
for (j = 0; j < n; j++) {
pow -= map[i];
if (pow < 0)
pow += 255;
rows[i][j] = vdev_raidz_pow2[pow];
}
}
}
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
int i, j, ii, jj;
uint8_t log;
/*
* Assert that the first nmissing entries from the array of used
* columns correspond to parity columns and that subsequent entries
* correspond to data columns.
*/
for (i = 0; i < nmissing; i++) {
ASSERT3S(used[i], <, rm->rm_firstdatacol);
}
for (; i < n; i++) {
ASSERT3S(used[i], >=, rm->rm_firstdatacol);
}
/*
* First initialize the storage where we'll compute the inverse rows.
*/
for (i = 0; i < nmissing; i++) {
for (j = 0; j < n; j++) {
invrows[i][j] = (i == j) ? 1 : 0;
}
}
/*
* Subtract all trivial rows from the rows of consequence.
*/
for (i = 0; i < nmissing; i++) {
for (j = nmissing; j < n; j++) {
ASSERT3U(used[j], >=, rm->rm_firstdatacol);
jj = used[j] - rm->rm_firstdatacol;
ASSERT3S(jj, <, n);
invrows[i][j] = rows[i][jj];
rows[i][jj] = 0;
}
}
/*
* For each of the rows of interest, we must normalize it and subtract
* a multiple of it from the other rows.
*/
for (i = 0; i < nmissing; i++) {
for (j = 0; j < missing[i]; j++) {
ASSERT0(rows[i][j]);
}
ASSERT3U(rows[i][missing[i]], !=, 0);
/*
* Compute the inverse of the first element and multiply each
* element in the row by that value.
*/
log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
for (j = 0; j < n; j++) {
rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
}
for (ii = 0; ii < nmissing; ii++) {
if (i == ii)
continue;
ASSERT3U(rows[ii][missing[i]], !=, 0);
log = vdev_raidz_log2[rows[ii][missing[i]]];
for (j = 0; j < n; j++) {
rows[ii][j] ^=
vdev_raidz_exp2(rows[i][j], log);
invrows[ii][j] ^=
vdev_raidz_exp2(invrows[i][j], log);
}
}
}
/*
* Verify that the data that is left in the rows are properly part of
* an identity matrix.
*/
for (i = 0; i < nmissing; i++) {
for (j = 0; j < n; j++) {
if (j == missing[i]) {
ASSERT3U(rows[i][j], ==, 1);
} else {
ASSERT0(rows[i][j]);
}
}
}
}
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
int *missing, uint8_t **invrows, const uint8_t *used)
{
int i, j, x, cc, c;
uint8_t *src;
uint64_t ccount;
uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
uint8_t log = 0;
uint8_t val;
int ll;
uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
uint8_t *p, *pp;
size_t psize;
psize = sizeof (invlog[0][0]) * n * nmissing;
p = kmem_alloc(psize, KM_SLEEP);
for (pp = p, i = 0; i < nmissing; i++) {
invlog[i] = pp;
pp += n;
}
for (i = 0; i < nmissing; i++) {
for (j = 0; j < n; j++) {
ASSERT3U(invrows[i][j], !=, 0);
invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
}
}
for (i = 0; i < n; i++) {
c = used[i];
ASSERT3U(c, <, rm->rm_cols);
src = abd_to_buf(rm->rm_col[c].rc_abd);
ccount = rm->rm_col[c].rc_size;
for (j = 0; j < nmissing; j++) {
cc = missing[j] + rm->rm_firstdatacol;
ASSERT3U(cc, >=, rm->rm_firstdatacol);
ASSERT3U(cc, <, rm->rm_cols);
ASSERT3U(cc, !=, c);
dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
dcount[j] = rm->rm_col[cc].rc_size;
}
ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
for (x = 0; x < ccount; x++, src++) {
if (*src != 0)
log = vdev_raidz_log2[*src];
for (cc = 0; cc < nmissing; cc++) {
if (x >= dcount[cc])
continue;
if (*src == 0) {
val = 0;
} else {
if ((ll = log + invlog[cc][i]) >= 255)
ll -= 255;
val = vdev_raidz_pow2[ll];
}
if (i == 0)
dst[cc][x] = val;
else
dst[cc][x] ^= val;
}
}
}
kmem_free(p, psize);
}
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
int n, i, c, t, tt;
int nmissing_rows;
int missing_rows[VDEV_RAIDZ_MAXPARITY];
int parity_map[VDEV_RAIDZ_MAXPARITY];
uint8_t *p, *pp;
size_t psize;
uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
uint8_t *used;
abd_t **bufs = NULL;
int code = 0;
/*
* Matrix reconstruction can't use scatter ABDs yet, so we allocate
* temporary linear ABDs.
*/
if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
raidz_col_t *col = &rm->rm_col[c];
bufs[c] = col->rc_abd;
col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
abd_copy(col->rc_abd, bufs[c], col->rc_size);
}
}
n = rm->rm_cols - rm->rm_firstdatacol;
/*
* Figure out which data columns are missing.
*/
nmissing_rows = 0;
for (t = 0; t < ntgts; t++) {
if (tgts[t] >= rm->rm_firstdatacol) {
missing_rows[nmissing_rows++] =
tgts[t] - rm->rm_firstdatacol;
}
}
/*
* Figure out which parity columns to use to help generate the missing
* data columns.
*/
for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
ASSERT(tt < ntgts);
ASSERT(c < rm->rm_firstdatacol);
/*
* Skip any targeted parity columns.
*/
if (c == tgts[tt]) {
tt++;
continue;
}
code |= 1 << c;
parity_map[i] = c;
i++;
}
ASSERT(code != 0);
ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
nmissing_rows * n + sizeof (used[0]) * n;
p = kmem_alloc(psize, KM_SLEEP);
for (pp = p, i = 0; i < nmissing_rows; i++) {
rows[i] = pp;
pp += n;
invrows[i] = pp;
pp += n;
}
used = pp;
for (i = 0; i < nmissing_rows; i++) {
used[i] = parity_map[i];
}
for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
if (tt < nmissing_rows &&
c == missing_rows[tt] + rm->rm_firstdatacol) {
tt++;
continue;
}
ASSERT3S(i, <, n);
used[i] = c;
i++;
}
/*
* Initialize the interesting rows of the matrix.
*/
vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
/*
* Invert the matrix.
*/
vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
invrows, used);
/*
* Reconstruct the missing data using the generated matrix.
*/
vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
invrows, used);
kmem_free(p, psize);
/*
* copy back from temporary linear abds and free them
*/
if (bufs) {
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
raidz_col_t *col = &rm->rm_col[c];
abd_copy(bufs[c], col->rc_abd, col->rc_size);
abd_free(col->rc_abd);
col->rc_abd = bufs[c];
}
kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
}
return (code);
}
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
int ntgts;
int i, c;
int code;
int nbadparity, nbaddata;
int parity_valid[VDEV_RAIDZ_MAXPARITY];
/*
* The tgts list must already be sorted.
*/
for (i = 1; i < nt; i++) {
ASSERT(t[i] > t[i - 1]);
}
nbadparity = rm->rm_firstdatacol;
nbaddata = rm->rm_cols - nbadparity;
ntgts = 0;
for (i = 0, c = 0; c < rm->rm_cols; c++) {
if (c < rm->rm_firstdatacol)
parity_valid[c] = B_FALSE;
if (i < nt && c == t[i]) {
tgts[ntgts++] = c;
i++;
} else if (rm->rm_col[c].rc_error != 0) {
tgts[ntgts++] = c;
} else if (c >= rm->rm_firstdatacol) {
nbaddata--;
} else {
parity_valid[c] = B_TRUE;
nbadparity--;
}
}
ASSERT(ntgts >= nt);
ASSERT(nbaddata >= 0);
ASSERT(nbaddata + nbadparity == ntgts);
dt = &tgts[nbadparity];
/*
* See if we can use any of our optimized reconstruction routines.
*/
if (!vdev_raidz_default_to_general) {
switch (nbaddata) {
case 1:
if (parity_valid[VDEV_RAIDZ_P])
return (vdev_raidz_reconstruct_p(rm, dt, 1));
ASSERT(rm->rm_firstdatacol > 1);
if (parity_valid[VDEV_RAIDZ_Q])
return (vdev_raidz_reconstruct_q(rm, dt, 1));
ASSERT(rm->rm_firstdatacol > 2);
break;
case 2:
ASSERT(rm->rm_firstdatacol > 1);
if (parity_valid[VDEV_RAIDZ_P] &&
parity_valid[VDEV_RAIDZ_Q])
return (vdev_raidz_reconstruct_pq(rm, dt, 2));
ASSERT(rm->rm_firstdatacol > 2);
break;
}
}
code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
ASSERT(code > 0);
return (code);
}
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
vdev_t *cvd;
uint64_t nparity = vd->vdev_nparity;
int c;
int lasterror = 0;
int numerrors = 0;
ASSERT(nparity > 0);
if (nparity > VDEV_RAIDZ_MAXPARITY ||
vd->vdev_children < nparity + 1) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (SET_ERROR(EINVAL));
}
vdev_open_children(vd);
for (c = 0; c < vd->vdev_children; c++) {
cvd = vd->vdev_child[c];
if (cvd->vdev_open_error != 0) {
lasterror = cvd->vdev_open_error;
numerrors++;
continue;
}
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
*physical_ashift = MAX(*physical_ashift,
cvd->vdev_physical_ashift);
}
*asize *= vd->vdev_children;
*max_asize *= vd->vdev_children;
if (numerrors > nparity) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
return (lasterror);
}
return (0);
}
static void
vdev_raidz_close(vdev_t *vd)
{
int c;
for (c = 0; c < vd->vdev_children; c++)
vdev_close(vd->vdev_child[c]);
}
#ifdef illumos
/*
* Handle a read or write I/O to a RAID-Z dump device.
*
* The dump device is in a unique situation compared to other ZFS datasets:
* writing to this device should be as simple and fast as possible. In
* addition, durability matters much less since the dump will be extracted
* once the machine reboots. For that reason, this function eschews parity for
* performance and simplicity. The dump device uses the checksum setting
* ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
* dataset.
*
* Blocks of size 128 KB have been preallocated for this volume. I/Os less than
* 128 KB will not fill an entire block; in addition, they may not be properly
* aligned. In that case, this function uses the preallocated 128 KB block and
* omits reading or writing any "empty" portions of that block, as opposed to
* allocating a fresh appropriately-sized block.
*
* Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
*
* vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
*
* If this were a standard RAID-Z dataset, a block of at least 40 KB would be
* allocated which spans all five child vdevs. 8 KB of data would be written to
* each of four vdevs, with the fifth containing the parity bits.
*
* parity data data data data
* | PP | XX | XX | XX | XX |
* ^ ^ ^ ^ ^
* | | | | |
* 8 KB parity ------8 KB data blocks------
*
* However, when writing to the dump device, the behavior is different:
*
* vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
*
* Unlike the normal RAID-Z case in which the block is allocated based on the
* I/O size, reads and writes here always use a 128 KB logical I/O size. If the
* I/O size is less than 128 KB, only the actual portions of data are written.
* In this example the data is written to the third data vdev since that vdev
* contains the offset [64 KB, 96 KB).
*
* parity data data data data
* | | | | XX | |
* ^
* |
* 32 KB data block
*
* As a result, an individual I/O may not span all child vdevs; moreover, a
* small I/O may only operate on a single child vdev.
*
* Note that since there are no parity bits calculated or written, this format
* remains the same no matter how many parity bits are used in a normal RAID-Z
* stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
* would look like:
*
* parity parity parity data data data data
* | | | | | | XX | |
* ^
* |
* 32 KB data block
*/
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
vdev_t *tvd = vd->vdev_top;
vdev_t *cvd;
raidz_map_t *rm;
raidz_col_t *rc;
int c, err = 0;
uint64_t start, end, colstart, colend;
uint64_t coloffset, colsize, colskip;
int flags = doread ? BIO_READ : BIO_WRITE;
#ifdef _KERNEL
/*
* Don't write past the end of the block
*/
VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
start = offset;
end = start + size;
/*
* Allocate a RAID-Z map for this block. Note that this block starts
* from the "original" offset, this is, the offset of the extent which
* contains the requisite offset of the data being read or written.
*
* Even if this I/O operation doesn't span the full block size, let's
* treat the on-disk format as if the only blocks are the complete 128
* KB size.
*/
abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
SPA_OLD_MAXBLOCKSIZE);
rm = vdev_raidz_map_alloc(abd,
SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
vd->vdev_children, vd->vdev_nparity);
coloffset = origoffset;
for (c = rm->rm_firstdatacol; c < rm->rm_cols;
c++, coloffset += rc->rc_size) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
/*
* Find the start and end of this column in the RAID-Z map,
* keeping in mind that the stated size and offset of the
* operation may not fill the entire column for this vdev.
*
* If any portion of the data spans this column, issue the
* appropriate operation to the vdev.
*/
if (coloffset + rc->rc_size <= start)
continue;
if (coloffset >= end)
continue;
colstart = MAX(coloffset, start);
colend = MIN(end, coloffset + rc->rc_size);
colsize = colend - colstart;
colskip = colstart - coloffset;
VERIFY3U(colsize, <=, rc->rc_size);
VERIFY3U(colskip, <=, rc->rc_size);
/*
* Note that the child vdev will have a vdev label at the start
* of its range of offsets, hence the need for
* VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
* example of why this calculation is needed.
*/
if ((err = vdev_disk_physio(cvd,
((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
flags, isdump)) != 0)
break;
}
vdev_raidz_map_free(rm);
abd_put(abd);
#endif /* KERNEL */
return (err);
}
#endif
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
uint64_t asize;
uint64_t ashift = vd->vdev_top->vdev_ashift;
uint64_t cols = vd->vdev_children;
uint64_t nparity = vd->vdev_nparity;
asize = ((psize - 1) >> ashift) + 1;
asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
asize = roundup(asize, nparity + 1) << ashift;
return (asize);
}
static void
vdev_raidz_child_done(zio_t *zio)
{
raidz_col_t *rc = zio->io_private;
rc->rc_error = zio->io_error;
rc->rc_tried = 1;
rc->rc_skipped = 0;
}
/*
* Start an IO operation on a RAIDZ VDev
*
* Outline:
* - For write operations:
* 1. Generate the parity data
* 2. Create child zio write operations to each column's vdev, for both
* data and parity.
* 3. If the column skips any sectors for padding, create optional dummy
* write zio children for those areas to improve aggregation continuity.
* - For read operations:
* 1. Create child zio read operations to each data column's vdev to read
* the range of data required for zio.
* 2. If this is a scrub or resilver operation, or if any of the data
* vdevs have had errors, then create zio read operations to the parity
* columns' VDevs as well.
*/
static void
vdev_raidz_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
vdev_t *cvd;
raidz_map_t *rm;
raidz_col_t *rc;
int c, i;
rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
zio->io_type == ZIO_TYPE_FREE,
tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
zio->io_vsd = rm;
zio->io_vsd_ops = &vdev_raidz_vsd_ops;
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_FREE) {
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
zio_execute(zio);
return;
}
if (zio->io_type == ZIO_TYPE_WRITE) {
vdev_raidz_generate_parity(rm);
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
/*
* Generate optional I/Os for any skipped sectors to improve
* aggregation contiguity.
*/
for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
ASSERT(c <= rm->rm_scols);
if (c == rm->rm_scols)
c = 0;
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset + rc->rc_size, NULL,
1 << tvd->vdev_ashift,
zio->io_type, zio->io_priority,
ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
}
zio_execute(zio);
return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
/*
* Iterate over the columns in reverse order so that we hit the parity
* last -- any errors along the way will force us to read the parity.
*/
for (c = rm->rm_cols - 1; c >= 0; c--) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
if (!vdev_readable(cvd)) {
if (c >= rm->rm_firstdatacol)
rm->rm_missingdata++;
else
rm->rm_missingparity++;
rc->rc_error = SET_ERROR(ENXIO);
rc->rc_tried = 1; /* don't even try */
rc->rc_skipped = 1;
continue;
}
if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
if (c >= rm->rm_firstdatacol)
rm->rm_missingdata++;
else
rm->rm_missingparity++;
rc->rc_error = SET_ERROR(ESTALE);
rc->rc_skipped = 1;
continue;
}
if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
}
zio_execute(zio);
}
/*
* Report a checksum error for a child of a RAID-Z device.
*/
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
void *buf;
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
zio_bad_cksum_t zbc;
raidz_map_t *rm = zio->io_vsd;
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
zfs_ereport_post_checksum(zio->io_spa, vd, zio,
rc->rc_offset, rc->rc_size, buf, bad_data,
&zbc);
abd_return_buf(rc->rc_abd, buf, rc->rc_size);
}
}
/*
* We keep track of whether or not there were any injected errors, so that
* any ereports we generate can note it.
*/
static int
raidz_checksum_verify(zio_t *zio)
{
zio_bad_cksum_t zbc;
raidz_map_t *rm = zio->io_vsd;
int ret = zio_checksum_error(zio, &zbc);
if (ret != 0 && zbc.zbc_injected != 0)
rm->rm_ecksuminjected = 1;
return (ret);
}
/*
* Generate the parity from the data columns. If we tried and were able to
* read the parity without error, verify that the generated parity matches the
* data we read. If it doesn't, we fire off a checksum error. Return the
* number such failures.
*/
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
void *orig[VDEV_RAIDZ_MAXPARITY];
int c, ret = 0;
raidz_col_t *rc;
blkptr_t *bp = zio->io_bp;
enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
if (checksum == ZIO_CHECKSUM_NOPARITY)
return (ret);
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
orig[c] = zio_buf_alloc(rc->rc_size);
abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
}
vdev_raidz_generate_parity(rm);
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
ret++;
}
zio_buf_free(orig[c], rc->rc_size);
}
return (ret);
}
/*
* Keep statistics on all the ways that we used parity to correct data.
*/
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
int error = 0;
for (int c = 0; c < rm->rm_cols; c++)
error = zio_worst_error(error, rm->rm_col[c].rc_error);
return (error);
}
/*
* Iterate over all combinations of bad data and attempt a reconstruction.
* Note that the algorithm below is non-optimal because it doesn't take into
* account how reconstruction is actually performed. For example, with
* triple-parity RAID-Z the reconstruction procedure is the same if column 4
* is targeted as invalid as if columns 1 and 4 are targeted since in both
* cases we'd only use parity information in column 0.
*/
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
raidz_map_t *rm = zio->io_vsd;
raidz_col_t *rc;
void *orig[VDEV_RAIDZ_MAXPARITY];
int tstore[VDEV_RAIDZ_MAXPARITY + 2];
int *tgts = &tstore[1];
int current, next, i, c, n;
int code, ret = 0;
ASSERT(total_errors < rm->rm_firstdatacol);
/*
* This simplifies one edge condition.
*/
tgts[-1] = -1;
for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
/*
* Initialize the targets array by finding the first n columns
* that contain no error.
*
* If there were no data errors, we need to ensure that we're
* always explicitly attempting to reconstruct at least one
* data column. To do this, we simply push the highest target
* up into the data columns.
*/
for (c = 0, i = 0; i < n; i++) {
if (i == n - 1 && data_errors == 0 &&
c < rm->rm_firstdatacol) {
c = rm->rm_firstdatacol;
}
while (rm->rm_col[c].rc_error != 0) {
c++;
ASSERT3S(c, <, rm->rm_cols);
}
tgts[i] = c++;
}
/*
* Setting tgts[n] simplifies the other edge condition.
*/
tgts[n] = rm->rm_cols;
/*
* These buffers were allocated in previous iterations.
*/
for (i = 0; i < n - 1; i++) {
ASSERT(orig[i] != NULL);
}
orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
current = 0;
next = tgts[current];
while (current != n) {
tgts[current] = next;
current = 0;
/*
* Save off the original data that we're going to
* attempt to reconstruct.
*/
for (i = 0; i < n; i++) {
ASSERT(orig[i] != NULL);
c = tgts[i];
ASSERT3S(c, >=, 0);
ASSERT3S(c, <, rm->rm_cols);
rc = &rm->rm_col[c];
abd_copy_to_buf(orig[i], rc->rc_abd,
rc->rc_size);
}
/*
* Attempt a reconstruction and exit the outer loop on
* success.
*/
code = vdev_raidz_reconstruct(rm, tgts, n);
if (raidz_checksum_verify(zio) == 0) {
atomic_inc_64(&raidz_corrected[code]);
for (i = 0; i < n; i++) {
c = tgts[i];
rc = &rm->rm_col[c];
ASSERT(rc->rc_error == 0);
if (rc->rc_tried)
raidz_checksum_error(zio, rc,
orig[i]);
rc->rc_error = SET_ERROR(ECKSUM);
}
ret = code;
goto done;
}
/*
* Restore the original data.
*/
for (i = 0; i < n; i++) {
c = tgts[i];
rc = &rm->rm_col[c];
abd_copy_from_buf(rc->rc_abd, orig[i],
rc->rc_size);
}
do {
/*
* Find the next valid column after the current
* position..
*/
for (next = tgts[current] + 1;
next < rm->rm_cols &&
rm->rm_col[next].rc_error != 0; next++)
continue;
ASSERT(next <= tgts[current + 1]);
/*
* If that spot is available, we're done here.
*/
if (next != tgts[current + 1])
break;
/*
* Otherwise, find the next valid column after
* the previous position.
*/
for (c = tgts[current - 1] + 1;
rm->rm_col[c].rc_error != 0; c++)
continue;
tgts[current] = c;
current++;
} while (current != n);
}
}
n--;
done:
for (i = 0; i < n; i++) {
zio_buf_free(orig[i], rm->rm_col[0].rc_size);
}
return (ret);
}
/*
* Complete an IO operation on a RAIDZ VDev
*
* Outline:
* - For write operations:
* 1. Check for errors on the child IOs.
* 2. Return, setting an error code if too few child VDevs were written
* to reconstruct the data later. Note that partial writes are
* considered successful if they can be reconstructed at all.
* - For read operations:
* 1. Check for errors on the child IOs.
* 2. If data errors occurred:
* a. Try to reassemble the data from the parity available.
* b. If we haven't yet read the parity drives, read them now.
* c. If all parity drives have been read but the data still doesn't
* reassemble with a correct checksum, then try combinatorial
* reconstruction.
* d. If that doesn't work, return an error.
* 3. If there were unexpected errors or this is a resilver operation,
* rewrite the vdevs that had errors.
*/
static void
vdev_raidz_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_t *cvd;
raidz_map_t *rm = zio->io_vsd;
raidz_col_t *rc;
int unexpected_errors = 0;
int parity_errors = 0;
int parity_untried = 0;
int data_errors = 0;
int total_errors = 0;
int n, c;
int tgts[VDEV_RAIDZ_MAXPARITY];
int code;
ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
if (rc->rc_error) {
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
if (c < rm->rm_firstdatacol)
parity_errors++;
else
data_errors++;
if (!rc->rc_skipped)
unexpected_errors++;
total_errors++;
} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
parity_untried++;
}
}
if (zio->io_type == ZIO_TYPE_WRITE) {
/*
* XXX -- for now, treat partial writes as a success.
* (If we couldn't write enough columns to reconstruct
* the data, the I/O failed. Otherwise, good enough.)
*
* Now that we support write reallocation, it would be better
* to treat partial failure as real failure unless there are
* no non-degraded top-level vdevs left, and not update DTLs
* if we intend to reallocate.
*/
/* XXPOLICY */
if (total_errors > rm->rm_firstdatacol)
zio->io_error = vdev_raidz_worst_error(rm);
return;
} else if (zio->io_type == ZIO_TYPE_FREE) {
return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
/*
* There are three potential phases for a read:
* 1. produce valid data from the columns read
* 2. read all disks and try again
* 3. perform combinatorial reconstruction
*
* Each phase is progressively both more expensive and less likely to
* occur. If we encounter more errors than we can repair or all phases
* fail, we have no choice but to return an error.
*/
/*
* If the number of errors we saw was correctable -- less than or equal
* to the number of parity disks read -- attempt to produce data that
* has a valid checksum. Naturally, this case applies in the absence of
* any errors.
*/
if (total_errors <= rm->rm_firstdatacol - parity_untried) {
if (data_errors == 0) {
if (raidz_checksum_verify(zio) == 0) {
/*
* If we read parity information (unnecessarily
* as it happens since no reconstruction was
* needed) regenerate and verify the parity.
* We also regenerate parity when resilvering
* so we can write it out to the failed device
* later.
*/
if (parity_errors + parity_untried <
rm->rm_firstdatacol ||
(zio->io_flags & ZIO_FLAG_RESILVER)) {
n = raidz_parity_verify(zio, rm);
unexpected_errors += n;
ASSERT(parity_errors + n <=
rm->rm_firstdatacol);
}
goto done;
}
} else {
/*
* We either attempt to read all the parity columns or
* none of them. If we didn't try to read parity, we
* wouldn't be here in the correctable case. There must
* also have been fewer parity errors than parity
* columns or, again, we wouldn't be in this code path.
*/
ASSERT(parity_untried == 0);
ASSERT(parity_errors < rm->rm_firstdatacol);
/*
* Identify the data columns that reported an error.
*/
n = 0;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
if (rc->rc_error != 0) {
ASSERT(n < VDEV_RAIDZ_MAXPARITY);
tgts[n++] = c;
}
}
ASSERT(rm->rm_firstdatacol >= n);
code = vdev_raidz_reconstruct(rm, tgts, n);
if (raidz_checksum_verify(zio) == 0) {
atomic_inc_64(&raidz_corrected[code]);
/*
* If we read more parity disks than were used
* for reconstruction, confirm that the other
* parity disks produced correct data. This
* routine is suboptimal in that it regenerates
* the parity that we already used in addition
* to the parity that we're attempting to
* verify, but this should be a relatively
* uncommon case, and can be optimized if it
* becomes a problem. Note that we regenerate
* parity when resilvering so we can write it
* out to failed devices later.
*/
if (parity_errors < rm->rm_firstdatacol - n ||
(zio->io_flags & ZIO_FLAG_RESILVER)) {
n = raidz_parity_verify(zio, rm);
unexpected_errors += n;
ASSERT(parity_errors + n <=
rm->rm_firstdatacol);
}
goto done;
}
}
}
/*
* This isn't a typical situation -- either we got a read error or
* a child silently returned bad data. Read every block so we can
* try again with as much data and parity as we can track down. If
* we've already been through once before, all children will be marked
* as tried so we'll proceed to combinatorial reconstruction.
*/
unexpected_errors = 1;
rm->rm_missingdata = 0;
rm->rm_missingparity = 0;
for (c = 0; c < rm->rm_cols; c++) {
if (rm->rm_col[c].rc_tried)
continue;
zio_vdev_io_redone(zio);
do {
rc = &rm->rm_col[c];
if (rc->rc_tried)
continue;
zio_nowait(zio_vdev_child_io(zio, NULL,
vd->vdev_child[rc->rc_devidx],
rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
} while (++c < rm->rm_cols);
return;
}
/*
* At this point we've attempted to reconstruct the data given the
* errors we detected, and we've attempted to read all columns. There
* must, therefore, be one or more additional problems -- silent errors
* resulting in invalid data rather than explicit I/O errors resulting
* in absent data. We check if there is enough additional data to
* possibly reconstruct the data and then perform combinatorial
* reconstruction over all possible combinations. If that fails,
* we're cooked.
*/
if (total_errors > rm->rm_firstdatacol) {
zio->io_error = vdev_raidz_worst_error(rm);
} else if (total_errors < rm->rm_firstdatacol &&
(code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
/*
* If we didn't use all the available parity for the
* combinatorial reconstruction, verify that the remaining
* parity is correct.
*/
if (code != (1 << rm->rm_firstdatacol) - 1)
(void) raidz_parity_verify(zio, rm);
} else {
/*
* We're here because either:
*
* total_errors == rm_first_datacol, or
* vdev_raidz_combrec() failed
*
* In either case, there is enough bad data to prevent
* reconstruction.
*
* Start checksum ereports for all children which haven't
* failed, and the IO wasn't speculative.
*/
zio->io_error = SET_ERROR(ECKSUM);
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
if (rc->rc_error == 0) {
zio_bad_cksum_t zbc;
zbc.zbc_has_cksum = 0;
zbc.zbc_injected =
rm->rm_ecksuminjected;
zfs_ereport_start_checksum(
zio->io_spa,
vd->vdev_child[rc->rc_devidx],
zio, rc->rc_offset, rc->rc_size,
(void *)(uintptr_t)c, &zbc);
}
}
}
}
done:
zio_checksum_verified(zio);
if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
(unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
/*
* Use the good data we have in hand to repair damaged children.
*/
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
if (rc->rc_error == 0)
continue;
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
}
}
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
if (faulted > vd->vdev_nparity)
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_NO_REPLICAS);
else if (degraded + faulted != 0)
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
vdev_ops_t vdev_raidz_ops = {
vdev_raidz_open,
vdev_raidz_close,
vdev_raidz_asize,
vdev_raidz_io_start,
vdev_raidz_io_done,
vdev_raidz_state_change,
NULL,
NULL,
+ NULL,
VDEV_TYPE_RAIDZ, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c (nonexistent)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c (revision 332525)
@@ -0,0 +1,1919 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dir.h>
+#include <sys/arc.h>
+#include <sys/zfeature.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/abd.h>
+
+/*
+ * This file contains the necessary logic to remove vdevs from a
+ * storage pool. Currently, the only devices that can be removed
+ * are log, cache, and spare devices; and top level vdevs from a pool
+ * w/o raidz. (Note that members of a mirror can also be removed
+ * by the detach operation.)
+ *
+ * Log vdevs are removed by evacuating them and then turning the vdev
+ * into a hole vdev while holding spa config locks.
+ *
+ * Top level vdevs are removed and converted into an indirect vdev via
+ * a multi-step process:
+ *
+ * - Disable allocations from this device (spa_vdev_remove_top).
+ *
+ * - From a new thread (spa_vdev_remove_thread), copy data from
+ * the removing vdev to a different vdev. The copy happens in open
+ * context (spa_vdev_copy_impl) and issues a sync task
+ * (vdev_mapping_sync) so the sync thread can update the partial
+ * indirect mappings in core and on disk.
+ *
+ * - If a free happens during a removal, it is freed from the
+ * removing vdev, and if it has already been copied, from the new
+ * location as well (free_from_removing_vdev).
+ *
+ * - After the removal is completed, the copy thread converts the vdev
+ * into an indirect vdev (vdev_remove_complete) before instructing
+ * the sync thread to destroy the space maps and finish the removal
+ * (spa_finish_removal).
+ */
+
+typedef struct vdev_copy_arg {
+ metaslab_t *vca_msp;
+ uint64_t vca_outstanding_bytes;
+ kcondvar_t vca_cv;
+ kmutex_t vca_lock;
+} vdev_copy_arg_t;
+
+typedef struct vdev_copy_seg_arg {
+ vdev_copy_arg_t *vcsa_copy_arg;
+ uint64_t vcsa_txg;
+ dva_t *vcsa_dest_dva;
+ blkptr_t *vcsa_dest_bp;
+} vdev_copy_seg_arg_t;
+
+/*
+ * The maximum amount of allowed data we're allowed to copy from a device
+ * at a time when removing it.
+ */
+int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
+
+/*
+ * The largest contiguous segment that we will attempt to allocate when
+ * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
+ * there is a performance problem with attempting to allocate large blocks,
+ * consider decreasing this.
+ *
+ * Note: we will issue I/Os of up to this size. The mpt driver does not
+ * respond well to I/Os larger than 1MB, so we set this to 1MB. (When
+ * mpt processes an I/O larger than 1MB, it needs to do an allocation of
+ * 2 physically contiguous pages; if this allocation fails, mpt will drop
+ * the I/O and hang the device.)
+ */
+int zfs_remove_max_segment = 1024 * 1024;
+
+#define VDEV_REMOVAL_ZAP_OBJS "lzap"
+
+static void spa_vdev_remove_thread(void *arg);
+
+static void
+spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
+{
+ VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_REMOVING, sizeof (uint64_t),
+ sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
+ &spa->spa_removing_phys, tx));
+}
+
+static nvlist_t *
+spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
+{
+ for (int i = 0; i < count; i++) {
+ uint64_t guid =
+ fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID);
+
+ if (guid == target_guid)
+ return (nvpp[i]);
+ }
+
+ return (NULL);
+}
+
+static void
+spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
+ nvlist_t *dev_to_remove)
+{
+ nvlist_t **newdev = NULL;
+
+ if (count > 1)
+ newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
+
+ for (int i = 0, j = 0; i < count; i++) {
+ if (dev[i] == dev_to_remove)
+ continue;
+ VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
+ }
+
+ VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
+ VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
+
+ for (int i = 0; i < count - 1; i++)
+ nvlist_free(newdev[i]);
+
+ if (count > 1)
+ kmem_free(newdev, (count - 1) * sizeof (void *));
+}
+
+static spa_vdev_removal_t *
+spa_vdev_removal_create(vdev_t *vd)
+{
+ spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
+ mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
+ svr->svr_allocd_segs = range_tree_create(NULL, NULL);
+ svr->svr_vdev = vd;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ svr->svr_frees[i] = range_tree_create(NULL, NULL);
+ list_create(&svr->svr_new_segments[i],
+ sizeof (vdev_indirect_mapping_entry_t),
+ offsetof(vdev_indirect_mapping_entry_t, vime_node));
+ }
+
+ return (svr);
+}
+
+void
+spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
+{
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(svr->svr_bytes_done[i]);
+ ASSERT0(svr->svr_max_offset_to_sync[i]);
+ range_tree_destroy(svr->svr_frees[i]);
+ list_destroy(&svr->svr_new_segments[i]);
+ }
+
+ range_tree_destroy(svr->svr_allocd_segs);
+ mutex_destroy(&svr->svr_lock);
+ cv_destroy(&svr->svr_cv);
+ kmem_free(svr, sizeof (*svr));
+}
+
+/*
+ * This is called as a synctask in the txg in which we will mark this vdev
+ * as removing (in the config stored in the MOS).
+ *
+ * It begins the evacuation of a toplevel vdev by:
+ * - initializing the spa_removing_phys which tracks this removal
+ * - computing the amount of space to remove for accounting purposes
+ * - dirtying all dbufs in the spa_config_object
+ * - creating the spa_vdev_removal
+ * - starting the spa_vdev_remove_thread
+ */
+static void
+vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *vd = arg;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
+ spa_vdev_removal_t *svr = NULL;
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
+ svr = spa_vdev_removal_create(vd);
+
+ ASSERT(vd->vdev_removing);
+ ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
+
+ spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ /*
+ * By activating the OBSOLETE_COUNTS feature, we prevent
+ * the pool from being downgraded and ensure that the
+ * refcounts are precise.
+ */
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ uint64_t one = 1;
+ VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
+ &one, tx));
+ ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
+ }
+
+ vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
+ vd->vdev_indirect_mapping =
+ vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
+ vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
+ vd->vdev_indirect_births =
+ vdev_indirect_births_open(mos, vic->vic_births_object);
+ spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
+ spa->spa_removing_phys.sr_start_time = gethrestime_sec();
+ spa->spa_removing_phys.sr_end_time = 0;
+ spa->spa_removing_phys.sr_state = DSS_SCANNING;
+ spa->spa_removing_phys.sr_to_copy = 0;
+ spa->spa_removing_phys.sr_copied = 0;
+
+ /*
+ * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
+ * there may be space in the defer tree, which is free, but still
+ * counted in vs_alloc.
+ */
+ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+ metaslab_t *ms = vd->vdev_ms[i];
+ if (ms->ms_sm == NULL)
+ continue;
+
+ /*
+ * Sync tasks happen before metaslab_sync(), therefore
+ * smp_alloc and sm_alloc must be the same.
+ */
+ ASSERT3U(space_map_allocated(ms->ms_sm), ==,
+ ms->ms_sm->sm_phys->smp_alloc);
+
+ spa->spa_removing_phys.sr_to_copy +=
+ space_map_allocated(ms->ms_sm);
+
+ /*
+ * Space which we are freeing this txg does not need to
+ * be copied.
+ */
+ spa->spa_removing_phys.sr_to_copy -=
+ range_tree_space(ms->ms_freeingtree);
+
+ ASSERT0(range_tree_space(ms->ms_freedtree));
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT0(range_tree_space(ms->ms_alloctree[t]));
+ }
+
+ /*
+ * Sync tasks are called before metaslab_sync(), so there should
+ * be no already-synced metaslabs in the TXG_CLEAN list.
+ */
+ ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
+
+ spa_sync_removing_state(spa, tx);
+
+ /*
+ * All blocks that we need to read the most recent mapping must be
+ * stored on concrete vdevs. Therefore, we must dirty anything that
+ * is read before spa_remove_init(). Specifically, the
+ * spa_config_object. (Note that although we already modified the
+ * spa_config_object in spa_sync_removing_state, that may not have
+ * modified all blocks of the object.)
+ */
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
+ for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
+ dmu_buf_t *dbuf;
+ VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
+ offset, FTAG, &dbuf, 0));
+ dmu_buf_will_dirty(dbuf, tx);
+ offset += dbuf->db_size;
+ dmu_buf_rele(dbuf, FTAG);
+ }
+
+ /*
+ * Now that we've allocated the im_object, dirty the vdev to ensure
+ * that the object gets written to the config on disk.
+ */
+ vdev_config_dirty(vd);
+
+ zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu "
+ "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx),
+ vic->vic_mapping_object);
+
+ spa_history_log_internal(spa, "vdev remove started", tx,
+ "%s vdev %llu %s", spa_name(spa), vd->vdev_id,
+ (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+ /*
+ * Setting spa_vdev_removal causes subsequent frees to call
+ * free_from_removing_vdev(). Note that we don't need any locking
+ * because we are the sync thread, and metaslab_free_impl() is only
+ * called from syncing context (potentially from a zio taskq thread,
+ * but in any case only when there are outstanding free i/os, which
+ * there are not).
+ */
+ ASSERT3P(spa->spa_vdev_removal, ==, NULL);
+ spa->spa_vdev_removal = svr;
+ svr->svr_thread = thread_create(NULL, 0,
+ spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * When we are opening a pool, we must read the mapping for each
+ * indirect vdev in order from most recently removed to least
+ * recently removed. We do this because the blocks for the mapping
+ * of older indirect vdevs may be stored on more recently removed vdevs.
+ * In order to read each indirect mapping object, we must have
+ * initialized all more recently removed vdevs.
+ */
+int
+spa_remove_init(spa_t *spa)
+{
+ int error;
+
+ error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_REMOVING, sizeof (uint64_t),
+ sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
+ &spa->spa_removing_phys);
+
+ if (error == ENOENT) {
+ spa->spa_removing_phys.sr_state = DSS_NONE;
+ spa->spa_removing_phys.sr_removing_vdev = -1;
+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+ return (0);
+ } else if (error != 0) {
+ return (error);
+ }
+
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
+ /*
+ * We are currently removing a vdev. Create and
+ * initialize a spa_vdev_removal_t from the bonus
+ * buffer of the removing vdevs vdev_im_object, and
+ * initialize its partial mapping.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa,
+ spa->spa_removing_phys.sr_removing_vdev);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ if (vd == NULL)
+ return (EINVAL);
+
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
+ ASSERT(svr->svr_vdev->vdev_removing);
+
+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
+ spa->spa_meta_objset, vic->vic_mapping_object);
+ vd->vdev_indirect_births = vdev_indirect_births_open(
+ spa->spa_meta_objset, vic->vic_births_object);
+
+ spa->spa_vdev_removal = svr;
+ }
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ uint64_t indirect_vdev_id =
+ spa->spa_removing_phys.sr_prev_indirect_vdev;
+ while (indirect_vdev_id != UINT64_MAX) {
+ vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
+ spa->spa_meta_objset, vic->vic_mapping_object);
+ vd->vdev_indirect_births = vdev_indirect_births_open(
+ spa->spa_meta_objset, vic->vic_births_object);
+
+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ /*
+ * Now that we've loaded all the indirect mappings, we can allow
+ * reads from other blocks (e.g. via predictive prefetch).
+ */
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+ return (0);
+}
+
+void
+spa_restart_removal(spa_t *spa)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ if (svr == NULL)
+ return;
+
+ /*
+ * In general when this function is called there is no
+ * removal thread running. The only scenario where this
+ * is not true is during spa_import() where this function
+ * is called twice [once from spa_import_impl() and
+ * spa_async_resume()]. Thus, in the scenario where we
+ * import a pool that has an ongoing removal we don't
+ * want to spawn a second thread.
+ */
+ if (svr->svr_thread != NULL)
+ return;
+
+ if (!spa_writeable(spa))
+ return;
+
+ vdev_t *vd = svr->svr_vdev;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT3P(vd, !=, NULL);
+ ASSERT(vd->vdev_removing);
+
+ zfs_dbgmsg("restarting removal of %llu at count=%llu",
+ vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
+ 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * Process freeing from a device which is in the middle of being removed.
+ * We must handle this carefully so that we attempt to copy freed data,
+ * and we correctly free already-copied data.
+ */
+void
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
+ uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t max_offset_yet = 0;
+
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
+ vdev_indirect_mapping_object(vim));
+ ASSERT3P(vd, ==, svr->svr_vdev);
+ ASSERT3U(spa_syncing_txg(spa), ==, txg);
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Remove the segment from the removing vdev's spacemap. This
+ * ensures that we will not attempt to copy this space (if the
+ * removal thread has not yet visited it), and also ensures
+ * that we know what is actually allocated on the new vdevs
+ * (needed if we cancel the removal).
+ *
+ * Note: we must do the metaslab_free_concrete() with the svr_lock
+ * held, so that the remove_thread can not load this metaslab and then
+ * visit this offset between the time that we metaslab_free_concrete()
+ * and when we check to see if it has been visited.
+ */
+ metaslab_free_concrete(vd, offset, size, txg);
+
+ uint64_t synced_size = 0;
+ uint64_t synced_offset = 0;
+ uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
+ if (offset < max_offset_synced) {
+ /*
+ * The mapping for this offset is already on disk.
+ * Free from the new location.
+ *
+ * Note that we use svr_max_synced_offset because it is
+ * updated atomically with respect to the in-core mapping.
+ * By contrast, vim_max_offset is not.
+ *
+ * This block may be split between a synced entry and an
+ * in-flight or unvisited entry. Only process the synced
+ * portion of it here.
+ */
+ synced_size = MIN(size, max_offset_synced - offset);
+ synced_offset = offset;
+
+ ASSERT3U(max_offset_yet, <=, max_offset_synced);
+ max_offset_yet = max_offset_synced;
+
+ DTRACE_PROBE3(remove__free__synced,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, synced_size);
+
+ size -= synced_size;
+ offset += synced_size;
+ }
+
+ /*
+ * Look at all in-flight txgs starting from the currently syncing one
+ * and see if a section of this free is being copied. By starting from
+ * this txg and iterating forward, we might find that this region
+ * was copied in two different txgs and handle it appropriately.
+ */
+ for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
+ int txgoff = (txg + i) & TXG_MASK;
+ if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
+ /*
+ * The mapping for this offset is in flight, and
+ * will be synced in txg+i.
+ */
+ uint64_t inflight_size = MIN(size,
+ svr->svr_max_offset_to_sync[txgoff] - offset);
+
+ DTRACE_PROBE4(remove__free__inflight,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, inflight_size,
+ uint64_t, txg + i);
+
+ /*
+ * We copy data in order of increasing offset.
+ * Therefore the max_offset_to_sync[] must increase
+ * (or be zero, indicating that nothing is being
+ * copied in that txg).
+ */
+ if (svr->svr_max_offset_to_sync[txgoff] != 0) {
+ ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
+ >=, max_offset_yet);
+ max_offset_yet =
+ svr->svr_max_offset_to_sync[txgoff];
+ }
+
+ /*
+ * We've already committed to copying this segment:
+ * we have allocated space elsewhere in the pool for
+ * it and have an IO outstanding to copy the data. We
+ * cannot free the space before the copy has
+ * completed, or else the copy IO might overwrite any
+ * new data. To free that space, we record the
+ * segment in the appropriate svr_frees tree and free
+ * the mapped space later, in the txg where we have
+ * completed the copy and synced the mapping (see
+ * vdev_mapping_sync).
+ */
+ range_tree_add(svr->svr_frees[txgoff],
+ offset, inflight_size);
+ size -= inflight_size;
+ offset += inflight_size;
+
+ /*
+ * This space is already accounted for as being
+ * done, because it is being copied in txg+i.
+ * However, if i!=0, then it is being copied in
+ * a future txg. If we crash after this txg
+ * syncs but before txg+i syncs, then the space
+ * will be free. Therefore we must account
+ * for the space being done in *this* txg
+ * (when it is freed) rather than the future txg
+ * (when it will be copied).
+ */
+ ASSERT3U(svr->svr_bytes_done[txgoff], >=,
+ inflight_size);
+ svr->svr_bytes_done[txgoff] -= inflight_size;
+ svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
+ }
+ }
+ ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
+
+ if (size > 0) {
+ /*
+ * The copy thread has not yet visited this offset. Ensure
+ * that it doesn't.
+ */
+
+ DTRACE_PROBE3(remove__free__unvisited,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, size);
+
+ if (svr->svr_allocd_segs != NULL)
+ range_tree_clear(svr->svr_allocd_segs, offset, size);
+
+ /*
+ * Since we now do not need to copy this data, for
+ * accounting purposes we have done our job and can count
+ * it as completed.
+ */
+ svr->svr_bytes_done[txg & TXG_MASK] += size;
+ }
+ mutex_exit(&svr->svr_lock);
+
+ /*
+ * Now that we have dropped svr_lock, process the synced portion
+ * of this free.
+ */
+ if (synced_size > 0) {
+ vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
+ txg);
+ /*
+ * Note: this can only be called from syncing context,
+ * and the vdev_indirect_mapping is only changed from the
+ * sync thread, so we don't need svr_lock while doing
+ * metaslab_free_impl_cb.
+ */
+ vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
+ metaslab_free_impl_cb, &txg);
+ }
+}
+
+/*
+ * Stop an active removal and update the spa_removing phys.
+ */
+static void
+spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
+
+ /* Ensure the removal thread has completed before we free the svr. */
+ spa_vdev_remove_suspend(spa);
+
+ ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
+
+ if (state == DSS_FINISHED) {
+ spa_removing_phys_t *srp = &spa->spa_removing_phys;
+ vdev_t *vd = svr->svr_vdev;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
+ vdev_t *pvd = vdev_lookup_top(spa,
+ srp->sr_prev_indirect_vdev);
+ ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
+ }
+
+ vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
+ srp->sr_prev_indirect_vdev = vd->vdev_id;
+ }
+ spa->spa_removing_phys.sr_state = state;
+ spa->spa_removing_phys.sr_end_time = gethrestime_sec();
+
+ spa->spa_vdev_removal = NULL;
+ spa_vdev_removal_destroy(svr);
+
+ spa_sync_removing_state(spa, tx);
+
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+static void
+free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
+{
+ vdev_t *vd = arg;
+ vdev_indirect_mark_obsolete(vd, offset, size,
+ vd->vdev_spa->spa_syncing_txg);
+ vdev_indirect_ops.vdev_op_remap(vd, offset, size,
+ metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
+}
+
+/*
+ * On behalf of the removal thread, syncs an incremental bit more of
+ * the indirect mapping to disk and updates the in-memory mapping.
+ * Called as a sync task in every txg that the removal thread makes progress.
+ */
+static void
+vdev_mapping_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = svr->svr_vdev;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT(vic->vic_mapping_object != 0);
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+
+ vdev_indirect_mapping_add_entries(vim,
+ &svr->svr_new_segments[txg & TXG_MASK], tx);
+ vdev_indirect_births_add_entry(vd->vdev_indirect_births,
+ vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx);
+
+ /*
+ * Free the copied data for anything that was freed while the
+ * mapping entries were in flight.
+ */
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_frees[txg & TXG_MASK],
+ free_mapped_segment_cb, vd);
+ ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=,
+ vdev_indirect_mapping_max_offset(vim));
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0;
+ mutex_exit(&svr->svr_lock);
+
+ spa_sync_removing_state(spa, tx);
+}
+
+static void
+spa_vdev_copy_segment_write_done(zio_t *zio)
+{
+ vdev_copy_seg_arg_t *vcsa = zio->io_private;
+ vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg;
+ spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
+ abd_free(zio->io_abd);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes -= zio->io_size;
+ cv_signal(&vca->vca_cv);
+ mutex_exit(&vca->vca_lock);
+
+ ASSERT0(zio->io_error);
+ kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t));
+ kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t));
+}
+
+static void
+spa_vdev_copy_segment_read_done(zio_t *zio)
+{
+ vdev_copy_seg_arg_t *vcsa = zio->io_private;
+ dva_t *dest_dva = vcsa->vcsa_dest_dva;
+ uint64_t txg = vcsa->vcsa_txg;
+ spa_t *spa = zio->io_spa;
+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva));
+ blkptr_t *bp = NULL;
+ dva_t *dva = NULL;
+ uint64_t size = zio->io_size;
+
+ ASSERT3P(dest_vd, !=, NULL);
+ ASSERT0(zio->io_error);
+
+ vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ bp = vcsa->vcsa_dest_bp;
+ dva = bp->blk_dva;
+
+ BP_ZERO(bp);
+
+ /* initialize with dest_dva */
+ bcopy(dest_dva, dva, sizeof (dva_t));
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa,
+ txg, bp, zio->io_abd, size,
+ spa_vdev_copy_segment_write_done, vcsa,
+ ZIO_PRIORITY_REMOVAL, 0, NULL));
+}
+
+static int
+spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
+ vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_indirect_mapping_entry_t *entry;
+ vdev_copy_seg_arg_t *private;
+ dva_t dst = { 0 };
+ blkptr_t blk, *bp = &blk;
+ dva_t *dva = bp->blk_dva;
+
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ int error = metaslab_alloc_dva(spa, mg->mg_class, size,
+ &dst, 0, NULL, txg, 0, zal);
+ if (error != 0)
+ return (error);
+
+ /*
+ * We can't have any padding of the allocated size, otherwise we will
+ * misunderstand what's allocated, and the size of the mapping.
+ * The caller ensures this will be true by passing in a size that is
+ * aligned to the worst (highest) ashift in the pool.
+ */
+ ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes += size;
+ mutex_exit(&vca->vca_lock);
+
+ entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
+ DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
+ entry->vime_mapping.vimep_dst = dst;
+
+ private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
+ private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+ private->vcsa_txg = txg;
+ private->vcsa_copy_arg = vca;
+
+ /*
+ * This lock is eventually released by the donefunc for the
+ * zio_write_phys that finishes copying the data.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ /*
+ * Do logical I/O, letting the redundancy vdevs (like mirror)
+ * handle their own I/O instead of duplicating that code here.
+ */
+ BP_ZERO(bp);
+
+ DVA_SET_VDEV(&dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[0], start);
+ DVA_SET_GANG(&dva[0], 0);
+ DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));
+
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
+ bp, abd_alloc_for_io(size, B_FALSE), size,
+ spa_vdev_copy_segment_read_done, private,
+ ZIO_PRIORITY_REMOVAL, 0, NULL));
+
+ list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
+ ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
+ vdev_dirty(vd, 0, NULL, txg);
+
+ return (0);
+}
+
+/*
+ * Complete the removal of a toplevel vdev. This is called as a
+ * synctask in the same txg that we will sync out the new config (to the
+ * MOS object) which indicates that this vdev is indirect.
+ */
+static void
+vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = arg;
+ vdev_t *vd = svr->svr_vdev;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(svr->svr_bytes_done[i]);
+ }
+
+ ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
+ spa->spa_removing_phys.sr_to_copy);
+
+ vdev_destroy_spacemaps(vd, tx);
+
+ /* destroy leaf zaps, if any */
+ ASSERT3P(svr->svr_zaplist, !=, NULL);
+ for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
+ vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
+ }
+ fnvlist_free(svr->svr_zaplist);
+
+ spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
+ /* vd->vdev_path is not available here */
+ spa_history_log_internal(spa, "vdev remove completed", tx,
+ "%s vdev %llu", spa_name(spa), vd->vdev_id);
+}
+
+static void
+vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
+{
+ ivd->vdev_indirect_config = vd->vdev_indirect_config;
+
+ ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
+ ASSERT(vd->vdev_indirect_mapping != NULL);
+ ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
+ vd->vdev_indirect_mapping = NULL;
+
+ ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
+ ASSERT(vd->vdev_indirect_births != NULL);
+ ivd->vdev_indirect_births = vd->vdev_indirect_births;
+ vd->vdev_indirect_births = NULL;
+
+ ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
+ ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
+
+ if (vd->vdev_obsolete_sm != NULL) {
+ ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
+
+ /*
+ * We cannot use space_map_{open,close} because we hold all
+ * the config locks as writer.
+ */
+ ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
+ ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
+ vd->vdev_obsolete_sm = NULL;
+ }
+}
+
+static void
+vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
+{
+ ASSERT3P(zlist, !=, NULL);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
+
+ if (vd->vdev_leaf_zap != 0) {
+ char zkey[32];
+ (void) snprintf(zkey, sizeof (zkey), "%s-%ju",
+ VDEV_REMOVAL_ZAP_OBJS, (uintmax_t)vd->vdev_leaf_zap);
+ fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
+ }
+
+ for (uint64_t id = 0; id < vd->vdev_children; id++) {
+ vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
+ }
+}
+
+static void
+vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
+{
+ vdev_t *ivd;
+ dmu_tx_t *tx;
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ /*
+ * First, build a list of leaf zaps to be destroyed.
+ * This is passed to the sync context thread,
+ * which does the actual unlinking.
+ */
+ svr->svr_zaplist = fnvlist_alloc();
+ vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
+
+ ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+
+ vd->vdev_leaf_zap = 0;
+
+ vdev_remove_child(ivd, vd);
+ vdev_compact_children(ivd);
+
+ vdev_indirect_state_transfer(ivd, vd);
+
+ svr->svr_vdev = ivd;
+
+ ASSERT(!ivd->vdev_removing);
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
+ 0, ZFS_SPACE_CHECK_NONE, tx);
+ dmu_tx_commit(tx);
+
+ /*
+ * Indicate that this thread has exited.
+ * After this, we can not use svr.
+ */
+ mutex_enter(&svr->svr_lock);
+ svr->svr_thread = NULL;
+ cv_broadcast(&svr->svr_cv);
+ mutex_exit(&svr->svr_lock);
+}
+
+/*
+ * Complete the removal of a toplevel vdev. This is called in open
+ * context by the removal thread after we have copied all vdev's data.
+ */
+static void
+vdev_remove_complete(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t txg;
+
+ /*
+ * Wait for any deferred frees to be synced before we call
+ * vdev_metaslab_fini()
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ txg = spa_vdev_enter(spa);
+ zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
+ vd->vdev_id, txg);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd->vdev_mg != NULL) {
+ vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ vd->vdev_mg = NULL;
+ }
+ ASSERT0(vd->vdev_stat.vs_space);
+ ASSERT0(vd->vdev_stat.vs_dspace);
+
+ vdev_remove_replace_with_indirect(vd, txg);
+
+ /*
+ * We now release the locks, allowing spa_sync to run and finish the
+ * removal via vdev_remove_complete_sync in syncing context.
+ */
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ /*
+ * Top ZAP should have been transferred to the indirect vdev in
+ * vdev_remove_replace_with_indirect.
+ */
+ ASSERT0(vd->vdev_top_zap);
+
+ /*
+ * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
+ */
+ ASSERT0(vd->vdev_leaf_zap);
+
+ txg = spa_vdev_enter(spa);
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+ /*
+ * Request to update the config and the config cachefile.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+}
+
+/*
+ * Evacuates a segment of size at most max_alloc from the vdev
+ * via repeated calls to spa_vdev_copy_segment. If an allocation
+ * fails, the pool is probably too fragmented to handle such a
+ * large size, so decrease max_alloc so that the caller will not try
+ * this size again this txg.
+ */
+static void
+spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+ uint64_t *max_alloc, dmu_tx_t *tx)
+{
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ mutex_enter(&svr->svr_lock);
+
+ range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
+ if (rs == NULL) {
+ mutex_exit(&svr->svr_lock);
+ return;
+ }
+ uint64_t offset = rs->rs_start;
+ uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
+
+ range_tree_remove(svr->svr_allocd_segs, offset, length);
+
+ if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
+ svr, 0, ZFS_SPACE_CHECK_NONE, tx);
+ }
+
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
+
+ /*
+ * Note: this is the amount of *allocated* space
+ * that we are taking care of each txg.
+ */
+ svr->svr_bytes_done[txg & TXG_MASK] += length;
+
+ mutex_exit(&svr->svr_lock);
+
+ zio_alloc_list_t zal;
+ metaslab_trace_init(&zal);
+ uint64_t thismax = *max_alloc;
+ while (length > 0) {
+ uint64_t mylen = MIN(length, thismax);
+
+ int error = spa_vdev_copy_segment(svr->svr_vdev,
+ offset, mylen, txg, vca, &zal);
+
+ if (error == ENOSPC) {
+ /*
+ * Cut our segment in half, and don't try this
+ * segment size again this txg. Note that the
+ * allocation size must be aligned to the highest
+ * ashift in the pool, so that the allocation will
+ * not be padded out to a multiple of the ashift,
+ * which could cause us to think that this mapping
+ * is larger than we intended.
+ */
+ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
+ thismax = P2ROUNDUP(mylen / 2,
+ 1 << spa->spa_max_ashift);
+ ASSERT3U(thismax, <, mylen);
+ /*
+ * The minimum-size allocation can not fail.
+ */
+ ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
+ *max_alloc = mylen - (1 << spa->spa_max_ashift);
+ } else {
+ ASSERT0(error);
+ length -= mylen;
+ offset += mylen;
+
+ /*
+ * We've performed an allocation, so reset the
+ * alloc trace list.
+ */
+ metaslab_trace_fini(&zal);
+ metaslab_trace_init(&zal);
+ }
+ }
+ metaslab_trace_fini(&zal);
+}
+
+/*
+ * The removal thread operates in open context. It iterates over all
+ * allocated space in the vdev, by loading each metaslab's spacemap.
+ * For each contiguous segment of allocated space (capping the segment
+ * size at SPA_MAXBLOCKSIZE), we:
+ * - Allocate space for it on another vdev.
+ * - Create a new mapping from the old location to the new location
+ * (as a record in svr_new_segments).
+ * - Initiate a logical read zio to get the data off the removing disk.
+ * - In the read zio's done callback, initiate a logical write zio to
+ * write it to the new vdev.
+ * Note that all of this will take effect when a particular TXG syncs.
+ * The sync thread ensures that all the phys reads and writes for the syncing
+ * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
+ * (see vdev_mapping_sync()).
+ */
+static void
+spa_vdev_remove_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_copy_arg_t vca;
+ uint64_t max_alloc = zfs_remove_max_segment;
+ uint64_t last_txg = 0;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_removing);
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT3P(svr->svr_vdev, ==, vd);
+ ASSERT(vim != NULL);
+
+ mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
+ vca.vca_outstanding_bytes = 0;
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Start from vim_max_offset so we pick up where we left off
+ * if we are restarting the removal after opening the pool.
+ */
+ uint64_t msi;
+ for (msi = start_offset >> vd->vdev_ms_shift;
+ msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+ ASSERT3U(msi, <=, vd->vdev_ms_count);
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Assert nothing in flight -- ms_*tree is empty.
+ */
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+ }
+
+ /*
+ * If the metaslab has ever been allocated from (ms_sm!=NULL),
+ * read the allocated segments from the space map object
+ * into svr_allocd_segs. Since we do this while holding
+ * svr_lock and ms_sync_lock, concurrent frees (which
+ * would have modified the space map) will wait for us
+ * to finish loading the spacemap, and then take the
+ * appropriate action (see free_from_removing_vdev()).
+ */
+ if (msp->ms_sm != NULL) {
+ space_map_t *sm = NULL;
+
+ /*
+ * We have to open a new space map here, because
+ * ms_sm's sm_length and sm_alloc may not reflect
+ * what's in the object contents, if we are in between
+ * metaslab_sync() and metaslab_sync_done().
+ */
+ VERIFY0(space_map_open(&sm,
+ spa->spa_dsl_pool->dp_meta_objset,
+ msp->ms_sm->sm_object, msp->ms_sm->sm_start,
+ msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
+ space_map_update(sm);
+ VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
+ SM_ALLOC));
+ space_map_close(sm);
+
+ range_tree_walk(msp->ms_freeingtree,
+ range_tree_remove, svr->svr_allocd_segs);
+
+ /*
+ * When we are resuming from a paused removal (i.e.
+ * when importing a pool with a removal in progress),
+ * discard any state that we have already processed.
+ */
+ range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+
+ vca.vca_msp = msp;
+ zfs_dbgmsg("copying %llu segments for metaslab %llu",
+ avl_numnodes(&svr->svr_allocd_segs->rt_root),
+ msp->ms_id);
+
+ while (!svr->svr_thread_exit &&
+ range_tree_space(svr->svr_allocd_segs) != 0) {
+
+ mutex_exit(&svr->svr_lock);
+
+ mutex_enter(&vca.vca_lock);
+ while (vca.vca_outstanding_bytes >
+ zfs_remove_max_copy_bytes) {
+ cv_wait(&vca.vca_cv, &vca.vca_lock);
+ }
+ mutex_exit(&vca.vca_lock);
+
+ dmu_tx_t *tx =
+ dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ if (txg != last_txg)
+ max_alloc = zfs_remove_max_segment;
+ last_txg = txg;
+
+ spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
+
+ dmu_tx_commit(tx);
+ mutex_enter(&svr->svr_lock);
+ }
+ }
+
+ mutex_exit(&svr->svr_lock);
+ /*
+ * Wait for all copies to finish before cleaning up the vca.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ ASSERT0(vca.vca_outstanding_bytes);
+
+ mutex_destroy(&vca.vca_lock);
+ cv_destroy(&vca.vca_cv);
+
+ if (svr->svr_thread_exit) {
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
+ svr->svr_thread = NULL;
+ cv_broadcast(&svr->svr_cv);
+ mutex_exit(&svr->svr_lock);
+ } else {
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+ vdev_remove_complete(vd);
+ }
+ thread_exit();
+}
+
+void
+spa_vdev_remove_suspend(spa_t *spa)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ if (svr == NULL)
+ return;
+
+ mutex_enter(&svr->svr_lock);
+ svr->svr_thread_exit = B_TRUE;
+ while (svr->svr_thread != NULL)
+ cv_wait(&svr->svr_cv, &svr->svr_lock);
+ svr->svr_thread_exit = B_FALSE;
+ mutex_exit(&svr->svr_lock);
+}
+
+/* ARGSUSED */
+static int
+spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ESRCH);
+ return (0);
+}
+
+/*
+ * Cancel a removal by freeing all entries from the partial mapping
+ * and marking the vdev as no longer being removing.
+ */
+/* ARGSUSED */
+static void
+spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_t *vd = svr->svr_vdev;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ objset_t *mos = spa->spa_meta_objset;
+
+ ASSERT3P(svr->svr_thread, ==, NULL);
+
+ spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
+ if (vdev_obsolete_counts_are_precise(vd)) {
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
+ }
+
+ if (vdev_obsolete_sm_object(vd) != 0) {
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ space_map_free(vd->vdev_obsolete_sm, tx);
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ }
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_is_empty(&svr->svr_new_segments[i]));
+ ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
+ vdev_indirect_mapping_max_offset(vim));
+ }
+
+ for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+
+ if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
+ break;
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Assert nothing in flight -- ms_*tree is empty.
+ */
+ for (int i = 0; i < TXG_SIZE; i++)
+ ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+ for (int i = 0; i < TXG_DEFER_SIZE; i++)
+ ASSERT0(range_tree_space(msp->ms_defertree[i]));
+ ASSERT0(range_tree_space(msp->ms_freedtree));
+
+ if (msp->ms_sm != NULL) {
+ /*
+ * Assert that the in-core spacemap has the same
+ * length as the on-disk one, so we can use the
+ * existing in-core spacemap to load it from disk.
+ */
+ ASSERT3U(msp->ms_sm->sm_alloc, ==,
+ msp->ms_sm->sm_phys->smp_alloc);
+ ASSERT3U(msp->ms_sm->sm_length, ==,
+ msp->ms_sm->sm_phys->smp_objsize);
+
+ mutex_enter(&svr->svr_lock);
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
+ range_tree_walk(msp->ms_freeingtree,
+ range_tree_remove, svr->svr_allocd_segs);
+
+ /*
+ * Clear everything past what has been synced,
+ * because we have not allocated mappings for it yet.
+ */
+ uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
+ range_tree_clear(svr->svr_allocd_segs, syncd,
+ msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);
+
+ mutex_exit(&svr->svr_lock);
+ }
+ mutex_exit(&msp->ms_lock);
+
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_allocd_segs,
+ free_mapped_segment_cb, vd);
+ mutex_exit(&svr->svr_lock);
+ }
+
+ /*
+ * Note: this must happen after we invoke free_mapped_segment_cb,
+ * because it adds to the obsolete_segments.
+ */
+ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
+
+ ASSERT3U(vic->vic_mapping_object, ==,
+ vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vd->vdev_indirect_mapping = NULL;
+ vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
+ vic->vic_mapping_object = 0;
+
+ ASSERT3U(vic->vic_births_object, ==,
+ vdev_indirect_births_object(vd->vdev_indirect_births));
+ vdev_indirect_births_close(vd->vdev_indirect_births);
+ vd->vdev_indirect_births = NULL;
+ vdev_indirect_births_free(mos, vic->vic_births_object, tx);
+ vic->vic_births_object = 0;
+
+ /*
+ * We may have processed some frees from the removing vdev in this
+ * txg, thus increasing svr_bytes_done; discard that here to
+ * satisfy the assertions in spa_vdev_removal_destroy().
+ * Note that future txg's can not have any bytes_done, because
+ * future TXG's are only modified from open context, and we have
+ * already shut down the copying thread.
+ */
+ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
+ spa_finish_removal(spa, DSS_CANCELED, tx);
+
+ vd->vdev_removing = B_FALSE;
+ vdev_config_dirty(vd);
+
+ zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
+ vd->vdev_id, dmu_tx_get_txg(tx));
+ spa_history_log_internal(spa, "vdev remove canceled", tx,
+ "%s vdev %llu %s", spa_name(spa),
+ vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+}
+
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+ spa_vdev_remove_suspend(spa);
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ESRCH);
+
+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
+
+ int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
+ spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
+
+ if (error == 0) {
+ spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
+ vdev_t *vd = vdev_lookup_top(spa, vdid);
+ metaslab_group_activate(vd->vdev_mg);
+ spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * Called every sync pass of every txg if there's a svr.
+ */
+void
+svr_sync(spa_t *spa, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ /*
+ * This check is necessary so that we do not dirty the
+ * DIRECTORY_OBJECT via spa_sync_removing_state() when there
+ * is nothing to do. Dirtying it every time would prevent us
+ * from syncing-to-convergence.
+ */
+ if (svr->svr_bytes_done[txgoff] == 0)
+ return;
+
+ /*
+ * Update progress accounting.
+ */
+ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
+ svr->svr_bytes_done[txgoff] = 0;
+
+ spa_sync_removing_state(spa, tx);
+}
+
+static void
+vdev_remove_make_hole_and_free(vdev_t *vd)
+{
+ uint64_t id = vd->vdev_id;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ boolean_t last_vdev = (id == (rvd->vdev_children - 1));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ vdev_free(vd);
+
+ if (last_vdev) {
+ vdev_compact_children(rvd);
+ } else {
+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+ vdev_add_child(rvd, vd);
+ }
+ vdev_config_dirty(rvd);
+
+ /*
+ * Reassess the health of our root vdev.
+ */
+ vdev_reopen(rvd);
+}
+
+/*
+ * Remove a log device. The config lock is held for the specified TXG.
+ */
+static int
+spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ ASSERT(vd->vdev_islog);
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * Stop allocating from this vdev.
+ */
+ metaslab_group_passivate(mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * Evacuate the device. We don't hold the config lock as writer
+ * since we need to do I/O but we do keep the
+ * spa_namespace_lock held. Once this completes the device
+ * should no longer have any blocks allocated on it.
+ */
+ if (vd->vdev_islog) {
+ if (vd->vdev_stat.vs_alloc != 0)
+ error = spa_reset_logs(spa);
+ }
+
+ *txg = spa_vdev_config_enter(spa);
+
+ if (error != 0) {
+ metaslab_group_activate(mg);
+ return (error);
+ }
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * The evacuation succeeded. Remove any remaining MOS metadata
+ * associated with this vdev, and wait for these changes to sync.
+ */
+ vd->vdev_removing = B_TRUE;
+
+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
+ vdev_config_dirty(vd);
+
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
+ (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+
+ /* Make sure these changes are sync'ed */
+ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /* The top ZAP should have been destroyed by vdev_remove_empty. */
+ ASSERT0(vd->vdev_top_zap);
+ /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
+ ASSERT0(vd->vdev_leaf_zap);
+
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ if (list_link_active(&vd->vdev_state_dirty_node))
+ vdev_state_clean(vd);
+ if (list_link_active(&vd->vdev_config_dirty_node))
+ vdev_config_clean(vd);
+
+ /*
+ * Clean up the vdev namespace.
+ */
+ vdev_remove_make_hole_and_free(vd);
+
+ if (ev != NULL)
+ spa_event_post(ev);
+
+ return (0);
+}
+
+static int
+spa_vdev_remove_top_check(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd != vd->vdev_top)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * There has to be enough free space to remove the
+ * device and leave double the "slop" space (i.e. we
+ * must leave at least 3% of the pool free, in addition to
+ * the normal slop space).
+ */
+ if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
+ NULL, 0, B_TRUE) <
+ vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * There can not be a removal in progress.
+ */
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * The device must have all its data.
+ */
+ if (!vdev_dtl_empty(vd, DTL_MISSING) ||
+ !vdev_dtl_empty(vd, DTL_OUTAGE))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * The device must be healthy.
+ */
+ if (!vdev_readable(vd))
+ return (SET_ERROR(EIO));
+
+ /*
+ * All vdevs in normal class must have the same ashift.
+ */
+ if (spa->spa_max_ashift != spa->spa_min_ashift) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * All vdevs in normal class must have the same ashift
+ * and not be raidz.
+ */
+ vdev_t *rvd = spa->spa_root_vdev;
+ int num_indirect = 0;
+ for (uint64_t id = 0; id < rvd->vdev_children; id++) {
+ vdev_t *cvd = rvd->vdev_child[id];
+ if (cvd->vdev_ashift != 0 && !cvd->vdev_islog)
+ ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
+ if (cvd->vdev_ops == &vdev_indirect_ops)
+ num_indirect++;
+ if (!vdev_is_concrete(cvd))
+ continue;
+ if (cvd->vdev_ops == &vdev_raidz_ops)
+ return (SET_ERROR(EINVAL));
+ /*
+ * Need the mirror to be mirror of leaf vdevs only
+ */
+ if (cvd->vdev_ops == &vdev_mirror_ops) {
+ for (uint64_t cid = 0;
+ cid < cvd->vdev_children; cid++) {
+ vdev_t *tmp = cvd->vdev_child[cid];
+ if (!tmp->vdev_ops->vdev_op_leaf)
+ return (SET_ERROR(EINVAL));
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Initiate removal of a top-level vdev, reducing the total space in the pool.
+ * The config lock is held for the specified TXG. Once initiated,
+ * evacuation of all allocated space (copying it to other vdevs) happens
+ * in the background (see spa_vdev_remove_thread()), and can be canceled
+ * (see spa_vdev_remove_cancel()). If successful, the vdev will
+ * be transformed to an indirect vdev (see spa_vdev_remove_complete()).
+ */
+static int
+spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ int error;
+
+ /*
+ * Check for errors up-front, so that we don't waste time
+ * passivating the metaslab group and clearing the ZIL if there
+ * are errors.
+ */
+ error = spa_vdev_remove_top_check(vd);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Stop allocating from this vdev. Note that we must check
+ * that this is not the only device in the pool before
+ * passivating, otherwise we will not be able to make
+ * progress because we can't allocate from any vdevs.
+ * The above check for sufficient free space serves this
+ * purpose.
+ */
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * We must ensure that no "stubby" log blocks are allocated
+ * on the device to be removed. These blocks could be
+ * written at any time, including while we are in the middle
+ * of copying them.
+ */
+ error = spa_reset_logs(spa);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ /*
+ * Things might have changed while the config lock was dropped
+ * (e.g. space usage). Check for errors again.
+ */
+ if (error == 0)
+ error = spa_vdev_remove_top_check(vd);
+
+ if (error != 0) {
+ metaslab_group_activate(mg);
+ return (error);
+ }
+
+ vd->vdev_removing = B_TRUE;
+
+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
+ vdev_config_dirty(vd);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ vdev_remove_initiate_sync,
+ vd, 0, ZFS_SPACE_CHECK_NONE, tx);
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Remove a device from the pool.
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+int
+spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
+{
+ vdev_t *vd;
+ nvlist_t **spares, **l2cache, *nv;
+ uint64_t txg = 0;
+ uint_t nspares, nl2cache;
+ int error = 0;
+ boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+ sysevent_t *ev = NULL;
+
+ ASSERT(spa_writeable(spa));
+
+ if (!locked)
+ txg = spa_vdev_enter(spa);
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ if (spa->spa_spares.sav_vdevs != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
+ (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
+ /*
+ * Only remove the hot spare if it's not currently in use
+ * in this pool.
+ */
+ if (vd == NULL || unspare) {
+ char *nvstr = fnvlist_lookup_string(nv,
+ ZPOOL_CONFIG_PATH);
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev (%s) %s", spa_name(spa),
+ VDEV_TYPE_SPARE, nvstr);
+ if (vd == NULL)
+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+ ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_AUX);
+ spa_vdev_remove_aux(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares, nv);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ } else {
+ error = SET_ERROR(EBUSY);
+ }
+ } else if (spa->spa_l2cache.sav_vdevs != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
+ (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
+ char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
+ /*
+ * Cache devices can always be removed.
+ */
+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+ ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
+ spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ } else if (vd != NULL && vd->vdev_islog) {
+ ASSERT(!locked);
+ error = spa_vdev_remove_log(vd, &txg);
+ } else if (vd != NULL) {
+ ASSERT(!locked);
+ error = spa_vdev_remove_top(vd, &txg);
+ } else {
+ /*
+ * There is no vdev of any kind with the specified guid.
+ */
+ error = SET_ERROR(ENOENT);
+ }
+
+ if (!locked)
+ error = spa_vdev_exit(spa, NULL, txg, error);
+
+ if (ev != NULL) {
+ if (error != 0) {
+ spa_event_discard(ev);
+ } else {
+ spa_event_post(ev);
+ }
+ }
+
+ return (error);
+}
+
+int
+spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
+{
+ prs->prs_state = spa->spa_removing_phys.sr_state;
+
+ if (prs->prs_state == DSS_NONE)
+ return (SET_ERROR(ENOENT));
+
+ prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
+ prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
+ prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
+ prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
+ prs->prs_copied = spa->spa_removing_phys.sr_copied;
+
+ if (spa->spa_vdev_removal != NULL) {
+ for (int i = 0; i < TXG_SIZE; i++) {
+ prs->prs_copied +=
+ spa->spa_vdev_removal->svr_bytes_done[i];
+ }
+ }
+
+ prs->prs_mapping_memory = 0;
+ uint64_t indirect_vdev_id =
+ spa->spa_removing_phys.sr_prev_indirect_vdev;
+ while (indirect_vdev_id != -1) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id];
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ prs->prs_mapping_memory += vdev_indirect_mapping_size(vim);
+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
+ }
+
+ return (0);
+}
Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c (revision 332525)
@@ -1,123 +1,124 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/fs/zfs.h>
/*
* Virtual device vector for the pool's root vdev.
*/
/*
* We should be able to tolerate one failure with absolutely no damage
* to our metadata. Two failures will take out space maps, a bunch of
* indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
* place to live. When we get smarter, we can liberalize this policy.
* e.g. If we haven't lost two consecutive top-level vdevs, then we are
* probably fine. Adding bean counters during alloc/free can make this
* future guesswork more accurate.
*/
static int
too_many_errors(vdev_t *vd, int numerrors)
{
ASSERT3U(numerrors, <=, vd->vdev_children);
return (numerrors > 0);
}
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
int lasterror = 0;
int numerrors = 0;
if (vd->vdev_children == 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (SET_ERROR(EINVAL));
}
vdev_open_children(vd);
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_open_error && !cvd->vdev_islog) {
lasterror = cvd->vdev_open_error;
numerrors++;
}
}
if (too_many_errors(vd, numerrors)) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
return (lasterror);
}
*asize = 0;
*max_asize = 0;
*logical_ashift = 0;
*physical_ashift = 0;
return (0);
}
static void
vdev_root_close(vdev_t *vd)
{
for (int c = 0; c < vd->vdev_children; c++)
vdev_close(vd->vdev_child[c]);
}
static void
vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
{
if (too_many_errors(vd, faulted)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_NO_REPLICAS);
} else if (degraded) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
} else {
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
}
vdev_ops_t vdev_root_ops = {
vdev_root_open,
vdev_root_close,
vdev_default_asize,
NULL, /* io_start - not applicable to the root */
NULL, /* io_done - not applicable to the root */
vdev_root_state_change,
+ NULL,
NULL,
NULL,
VDEV_TYPE_ROOT, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c (revision 332525)
@@ -1,857 +1,860 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include "lua.h"
#include "lualib.h"
#include "lauxlib.h"
#include <zfs_prop.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_objset.h>
#include <sys/mntent.h>
#include <sys/sunddi.h>
#include <sys/zap.h>
#include <sys/zcp.h>
#include <sys/zcp_iter.h>
#include <sys/zcp_global.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include <sys/zvol.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
static int
get_objset_type(dsl_dataset_t *ds, zfs_type_t *type)
{
int error;
objset_t *os;
error = dmu_objset_from_ds(ds, &os);
if (error != 0)
return (error);
if (ds->ds_is_snapshot) {
*type = ZFS_TYPE_SNAPSHOT;
} else {
switch (os->os_phys->os_type) {
case DMU_OST_ZFS:
*type = ZFS_TYPE_FILESYSTEM;
break;
case DMU_OST_ZVOL:
*type = ZFS_TYPE_VOLUME;
break;
default:
return (EINVAL);
}
}
return (0);
}
/*
* Returns the string name of ds's type in str (a buffer which should be
* at least 12 bytes long).
*/
static int
get_objset_type_name(dsl_dataset_t *ds, char *str)
{
int error;
zfs_type_t type;
error = get_objset_type(ds, &type);
if (error != 0)
return (error);
switch (type) {
case ZFS_TYPE_SNAPSHOT:
(void) strcpy(str, "snapshot");
break;
case ZFS_TYPE_FILESYSTEM:
(void) strcpy(str, "filesystem");
break;
case ZFS_TYPE_VOLUME:
(void) strcpy(str, "volume");
break;
default:
return (EINVAL);
}
return (0);
}
/*
* Determines the source of a property given its setpoint and
* property type. It pushes the source to the lua stack.
*/
static void
get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop)
{
if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) {
lua_pushnil(state);
} else {
const char *src;
if (strcmp("", setpoint) == 0) {
src = "default";
} else {
src = setpoint;
}
(void) lua_pushstring(state, src);
}
}
/*
* Given an error encountered while getting properties, either longjmp's for
* a fatal error or pushes nothing to the stack for a non fatal one.
*/
static int
zcp_handle_error(lua_State *state, const char *dataset_name,
const char *property_name, int error)
{
ASSERT3S(error, !=, 0);
if (error == ENOENT) {
return (0);
} else if (error == EINVAL) {
return (luaL_error(state,
"property '%s' is not a valid property on dataset '%s'",
property_name, dataset_name));
} else if (error == EIO) {
return (luaL_error(state,
"I/O error while retrieving property '%s' on dataset '%s'",
property_name, dataset_name));
} else {
return (luaL_error(state, "unexpected error %d while "
"retrieving property '%s' on dataset '%s'",
error, property_name, dataset_name));
}
}
/*
* Look up a user defined property in the zap object. If it exists, push it
* and the setpoint onto the stack, otherwise don't push anything.
*/
static int
zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
const char *property_name)
{
int error;
char *buf;
char setpoint[ZFS_MAX_DATASET_NAME_LEN];
/*
* zcp_dataset_hold will either successfully return the requested
* dataset or throw a lua error and longjmp out of the zfs.get_prop call
* without returning.
*/
dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
if (ds == NULL)
return (1); /* not reached; zcp_dataset_hold() longjmp'd */
buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN,
buf, setpoint);
dsl_dataset_rele(ds, FTAG);
if (error != 0) {
kmem_free(buf, ZAP_MAXVALUELEN);
return (zcp_handle_error(state, dataset_name, property_name,
error));
}
(void) lua_pushstring(state, buf);
(void) lua_pushstring(state, setpoint);
kmem_free(buf, ZAP_MAXVALUELEN);
return (2);
}
/*
* Check if the property we're looking for is stored in the ds_dir. If so,
* return it in the 'val' argument. Return 0 on success and ENOENT and if
* the property is not present.
*/
static int
get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
uint64_t *val)
{
dsl_dir_t *dd = ds->ds_dir;
mutex_enter(&dd->dd_lock);
switch (zfs_prop) {
case ZFS_PROP_USEDSNAP:
*val = dsl_dir_get_usedsnap(dd);
break;
case ZFS_PROP_USEDCHILD:
*val = dsl_dir_get_usedchild(dd);
break;
case ZFS_PROP_USEDDS:
*val = dsl_dir_get_usedds(dd);
break;
case ZFS_PROP_USEDREFRESERV:
*val = dsl_dir_get_usedrefreserv(dd);
break;
case ZFS_PROP_LOGICALUSED:
*val = dsl_dir_get_logicalused(dd);
break;
default:
mutex_exit(&dd->dd_lock);
return (ENOENT);
}
mutex_exit(&dd->dd_lock);
return (0);
}
/*
* Takes a dataset, a property, a value and that value's setpoint as
* found in the ZAP. Checks if the property has been changed in the vfs.
* If so, val and setpoint will be overwritten with updated content.
* Otherwise, they are left unchanged.
*/
static int
get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
char *setpoint)
{
#ifndef _KERNEL
return (0);
#else
int error;
#ifdef illumos
zfsvfs_t *zfvp;
#endif
vfs_t *vfsp;
objset_t *os;
uint64_t tmp = *val;
error = dmu_objset_from_ds(ds, &os);
if (error != 0)
return (error);
error = getzfsvfs_impl(os, &vfsp);
if (error != 0)
return (error);
#ifdef illumos
vfsp = zfvp->z_vfs;
#endif
switch (zfs_prop) {
case ZFS_PROP_ATIME:
if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
tmp = 0;
if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
tmp = 1;
break;
case ZFS_PROP_DEVICES:
if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
tmp = 0;
if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
tmp = 1;
break;
case ZFS_PROP_EXEC:
if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
tmp = 0;
if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
tmp = 1;
break;
case ZFS_PROP_SETUID:
if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
tmp = 0;
if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
tmp = 1;
break;
case ZFS_PROP_READONLY:
if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
tmp = 0;
if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
tmp = 1;
break;
case ZFS_PROP_XATTR:
if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
tmp = 0;
if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
tmp = 1;
break;
case ZFS_PROP_NBMAND:
if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
tmp = 0;
if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
tmp = 1;
break;
default:
#ifdef illumos
VFS_RELE(vfsp);
#else
vfs_rel(vfsp);
#endif
return (ENOENT);
}
#ifdef illumos
VFS_RELE(vfsp);
#else
vfs_rel(vfsp);
#endif
if (tmp != *val) {
(void) strcpy(setpoint, "temporary");
*val = tmp;
}
return (0);
#endif
}
/*
* Check if the property we're looking for is stored at the dsl_dataset or
* dsl_dir level. If so, push the property value and source onto the lua stack
* and return 0. If it is not present or a failure occurs in lookup, return a
* non-zero error value.
*/
static int
get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
zfs_prop_t zfs_prop)
{
int error = 0;
objset_t *os;
uint64_t numval;
char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
char setpoint[ZFS_MAX_DATASET_NAME_LEN] =
"Internal error - setpoint not determined";
zfs_type_t ds_type;
zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
(void) get_objset_type(ds, &ds_type);
switch (zfs_prop) {
case ZFS_PROP_REFRATIO:
numval = dsl_get_refratio(ds);
break;
case ZFS_PROP_USED:
numval = dsl_get_used(ds);
break;
case ZFS_PROP_CLONES: {
nvlist_t *clones = fnvlist_alloc();
error = get_clones_stat_impl(ds, clones);
if (error == 0) {
/* push list to lua stack */
VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0));
/* source */
(void) lua_pushnil(state);
}
nvlist_free(clones);
kmem_free(strval, ZAP_MAXVALUELEN);
return (error);
}
case ZFS_PROP_COMPRESSRATIO:
numval = dsl_get_compressratio(ds);
break;
case ZFS_PROP_CREATION:
numval = dsl_get_creation(ds);
break;
case ZFS_PROP_REFERENCED:
numval = dsl_get_referenced(ds);
break;
case ZFS_PROP_AVAILABLE:
numval = dsl_get_available(ds);
break;
case ZFS_PROP_LOGICALREFERENCED:
numval = dsl_get_logicalreferenced(ds);
break;
case ZFS_PROP_CREATETXG:
numval = dsl_get_creationtxg(ds);
break;
case ZFS_PROP_GUID:
numval = dsl_get_guid(ds);
break;
case ZFS_PROP_UNIQUE:
numval = dsl_get_unique(ds);
break;
case ZFS_PROP_OBJSETID:
numval = dsl_get_objsetid(ds);
break;
case ZFS_PROP_ORIGIN:
dsl_dir_get_origin(ds->ds_dir, strval);
break;
case ZFS_PROP_USERACCOUNTING:
error = dmu_objset_from_ds(ds, &os);
if (error == 0)
numval = dmu_objset_userspace_present(os);
break;
case ZFS_PROP_WRITTEN:
error = dsl_get_written(ds, &numval);
break;
case ZFS_PROP_TYPE:
error = get_objset_type_name(ds, strval);
break;
case ZFS_PROP_PREV_SNAP:
error = dsl_get_prev_snap(ds, strval);
break;
case ZFS_PROP_NAME:
dsl_dataset_name(ds, strval);
break;
case ZFS_PROP_MOUNTPOINT:
error = dsl_get_mountpoint(ds, dsname, strval, setpoint);
break;
case ZFS_PROP_VERSION:
/* should be a snapshot or filesystem */
ASSERT(ds_type != ZFS_TYPE_VOLUME);
error = dmu_objset_from_ds(ds, &os);
/* look in the master node for the version */
if (error == 0) {
error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
sizeof (numval), 1, &numval);
}
break;
case ZFS_PROP_DEFER_DESTROY:
numval = dsl_get_defer_destroy(ds);
break;
case ZFS_PROP_USERREFS:
numval = dsl_get_userrefs(ds);
break;
case ZFS_PROP_FILESYSTEM_COUNT:
error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval);
(void) strcpy(setpoint, "");
break;
case ZFS_PROP_SNAPSHOT_COUNT:
error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval);
(void) strcpy(setpoint, "");
break;
+ case ZFS_PROP_REMAPTXG:
+ error = dsl_dir_get_remaptxg(ds->ds_dir, &numval);
+ break;
case ZFS_PROP_NUMCLONES:
numval = dsl_get_numclones(ds);
break;
case ZFS_PROP_INCONSISTENT:
numval = dsl_get_inconsistent(ds);
break;
case ZFS_PROP_RECEIVE_RESUME_TOKEN:
VERIFY3U(strlcpy(strval, get_receive_resume_stats_impl(ds),
ZAP_MAXVALUELEN), <, ZAP_MAXVALUELEN);
if (strcmp(strval, "") == 0) {
VERIFY3U(strlcpy(strval, get_child_receive_stats(ds),
ZAP_MAXVALUELEN), <, ZAP_MAXVALUELEN);
if (strcmp(strval, "") == 0)
error = ENOENT;
}
break;
case ZFS_PROP_VOLSIZE:
ASSERT(ds_type == ZFS_TYPE_VOLUME);
error = dmu_objset_from_ds(ds, &os);
if (error == 0) {
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size",
sizeof (numval), 1, &numval);
}
if (error == 0)
(void) strcpy(setpoint, dsname);
break;
case ZFS_PROP_VOLBLOCKSIZE: {
ASSERT(ds_type == ZFS_TYPE_VOLUME);
dmu_object_info_t doi;
error = dmu_objset_from_ds(ds, &os);
if (error == 0) {
error = dmu_object_info(os, ZVOL_OBJ, &doi);
if (error == 0)
numval = doi.doi_data_block_size;
}
break;
}
default:
/* Did not match these props, check in the dsl_dir */
error = get_dsl_dir_prop(ds, zfs_prop, &numval);
}
if (error != 0) {
kmem_free(strval, ZAP_MAXVALUELEN);
return (error);
}
switch (prop_type) {
case PROP_TYPE_NUMBER: {
(void) lua_pushnumber(state, numval);
break;
}
case PROP_TYPE_STRING: {
(void) lua_pushstring(state, strval);
break;
}
case PROP_TYPE_INDEX: {
const char *propval;
error = zfs_prop_index_to_string(zfs_prop, numval, &propval);
if (error != 0) {
kmem_free(strval, ZAP_MAXVALUELEN);
return (error);
}
(void) lua_pushstring(state, propval);
break;
}
}
kmem_free(strval, ZAP_MAXVALUELEN);
/* Push the source to the stack */
get_prop_src(state, setpoint, zfs_prop);
return (0);
}
/*
* Look up a property and its source in the zap object. If the value is
* present and successfully retrieved, push the value and source on the
* lua stack and return 0. On failure, return a non-zero error value.
*/
static int
get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
{
int error = 0;
char setpoint[ZFS_MAX_DATASET_NAME_LEN];
char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
uint64_t numval;
const char *prop_name = zfs_prop_to_name(zfs_prop);
zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
if (prop_type == PROP_TYPE_STRING) {
/* Push value to lua stack */
error = dsl_prop_get_ds(ds, prop_name, 1,
ZAP_MAXVALUELEN, strval, setpoint);
if (error == 0)
(void) lua_pushstring(state, strval);
} else {
error = dsl_prop_get_ds(ds, prop_name, sizeof (numval),
1, &numval, setpoint);
/* Fill in temorary value for prop, if applicable */
(void) get_temporary_prop(ds, zfs_prop, &numval, setpoint);
/* Push value to lua stack */
if (prop_type == PROP_TYPE_INDEX) {
const char *propval;
error = zfs_prop_index_to_string(zfs_prop, numval,
&propval);
if (error == 0)
(void) lua_pushstring(state, propval);
} else {
if (error == 0)
(void) lua_pushnumber(state, numval);
}
}
kmem_free(strval, ZAP_MAXVALUELEN);
if (error == 0)
get_prop_src(state, setpoint, zfs_prop);
return (error);
}
/*
* Determine whether property is valid for a given dataset
*/
boolean_t
prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
{
int error;
zfs_type_t zfs_type;
/* properties not supported */
if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) ||
(zfs_prop == ZFS_PROP_MOUNTED))
return (B_FALSE);
/* if we want the origin prop, ds must be a clone */
if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir)))
return (B_FALSE);
error = get_objset_type(ds, &zfs_type);
if (error != 0)
return (B_FALSE);
return (zfs_prop_valid_for_type(zfs_prop, zfs_type));
}
/*
* Look up a given dataset property. On success return 2, the number of
* values pushed to the lua stack (property value and source). On a fatal
* error, longjmp. On a non fatal error push nothing.
*/
static int
zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
zfs_prop_t zfs_prop)
{
int error;
/*
* zcp_dataset_hold will either successfully return the requested
* dataset or throw a lua error and longjmp out of the zfs.get_prop call
* without returning.
*/
dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
if (ds == NULL)
return (1); /* not reached; zcp_dataset_hold() longjmp'd */
/* Check that the property is valid for the given dataset */
const char *prop_name = zfs_prop_to_name(zfs_prop);
if (!prop_valid_for_ds(ds, zfs_prop)) {
dsl_dataset_rele(ds, FTAG);
return (0);
}
/* Check if the property can be accessed directly */
error = get_special_prop(state, ds, dataset_name, zfs_prop);
if (error == 0) {
dsl_dataset_rele(ds, FTAG);
/* The value and source have been pushed by get_special_prop */
return (2);
}
if (error != ENOENT) {
dsl_dataset_rele(ds, FTAG);
return (zcp_handle_error(state, dataset_name,
prop_name, error));
}
/* If we were unable to find it, look in the zap object */
error = get_zap_prop(state, ds, zfs_prop);
dsl_dataset_rele(ds, FTAG);
if (error != 0) {
return (zcp_handle_error(state, dataset_name,
prop_name, error));
}
/* The value and source have been pushed by get_zap_prop */
return (2);
}
static zfs_userquota_prop_t
get_userquota_prop(const char *prop_name)
{
zfs_userquota_prop_t type;
/* Figure out the property type ({user|group}{quota|used}) */
for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
if (strncmp(prop_name, zfs_userquota_prop_prefixes[type],
strlen(zfs_userquota_prop_prefixes[type])) == 0)
break;
}
return (type);
}
#ifdef _KERNEL
/*
* Given the name of a zfs_userquota_prop, this function determines the
* prop type as well as the numeric group/user ids based on the string
* following the '@' in the property name. On success, returns 0. On failure,
* returns a non-zero error.
* 'domain' must be free'd by caller using strfree()
*/
static int
parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type,
char **domain, uint64_t *rid)
{
char *cp, *end, *domain_val;
*type = get_userquota_prop(prop_name);
if (*type >= ZFS_NUM_USERQUOTA_PROPS)
return (EINVAL);
*rid = 0;
cp = strchr(prop_name, '@') + 1;
if (strncmp(cp, "S-1-", 4) == 0) {
/*
* It's a numeric SID (eg "S-1-234-567-89") and we want to
* seperate the domain id and the rid
*/
int domain_len = strrchr(cp, '-') - cp;
domain_val = kmem_alloc(domain_len + 1, KM_SLEEP);
(void) strncpy(domain_val, cp, domain_len);
domain_val[domain_len] = '\0';
cp += domain_len + 1;
(void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
if (*end != '\0') {
strfree(domain_val);
return (EINVAL);
}
} else {
/* It's only a user/group ID (eg "12345"), just get the rid */
domain_val = NULL;
(void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
if (*end != '\0')
return (EINVAL);
}
*domain = domain_val;
return (0);
}
/*
* Look up {user|group}{quota|used} property for given dataset. On success
* push the value (quota or used amount) and the setpoint. On failure, push
* a lua error.
*/
static int
zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp,
const char *dataset_name, const char *prop_name)
{
zfsvfs_t *zfvp;
zfsvfs_t *zfsvfs;
int error;
zfs_userquota_prop_t type;
char *domain;
uint64_t rid, value;
objset_t *os;
dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
if (ds == NULL)
return (1); /* not reached; zcp_dataset_hold() longjmp'd */
error = parse_userquota_prop(prop_name, &type, &domain, &rid);
if (error == 0) {
error = dmu_objset_from_ds(ds, &os);
if (error == 0) {
zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
error = zfsvfs_create_impl(&zfvp, zfsvfs, os);
if (error == 0) {
error = zfs_userspace_one(zfvp, type, domain,
rid, &value);
zfsvfs_free(zfvp);
}
}
if (domain != NULL)
strfree(domain);
}
dsl_dataset_rele(ds, FTAG);
if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) ||
(type == ZFS_PROP_GROUPQUOTA)))
error = ENOENT;
if (error != 0) {
return (zcp_handle_error(state, dataset_name,
prop_name, error));
}
(void) lua_pushnumber(state, value);
(void) lua_pushstring(state, dataset_name);
return (2);
}
#endif
/*
* Determines the name of the snapshot referenced in the written property
* name. Returns snapshot name in snap_name, a buffer that must be at least
* as large as ZFS_MAX_DATASET_NAME_LEN
*/
static void
parse_written_prop(const char *dataset_name, const char *prop_name,
char *snap_name)
{
ASSERT(zfs_prop_written(prop_name));
const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN;
if (strchr(name, '@') == NULL) {
(void) sprintf(snap_name, "%s@%s", dataset_name, name);
} else {
(void) strcpy(snap_name, name);
}
}
/*
* Look up written@ property for given dataset. On success
* push the value and the setpoint. If error is fatal, we will
* longjmp, otherwise push nothing.
*/
static int
zcp_get_written_prop(lua_State *state, dsl_pool_t *dp,
const char *dataset_name, const char *prop_name)
{
char snap_name[ZFS_MAX_DATASET_NAME_LEN];
uint64_t used, comp, uncomp;
dsl_dataset_t *old;
int error = 0;
parse_written_prop(dataset_name, prop_name, snap_name);
dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG);
if (new == NULL)
return (1); /* not reached; zcp_dataset_hold() longjmp'd */
error = dsl_dataset_hold(dp, snap_name, FTAG, &old);
if (error != 0) {
dsl_dataset_rele(new, FTAG);
return (zcp_dataset_hold_error(state, dp, snap_name,
error));
}
error = dsl_dataset_space_written(old, new,
&used, &comp, &uncomp);
dsl_dataset_rele(old, FTAG);
dsl_dataset_rele(new, FTAG);
if (error != 0) {
return (zcp_handle_error(state, dataset_name,
snap_name, error));
}
(void) lua_pushnumber(state, used);
(void) lua_pushstring(state, dataset_name);
return (2);
}
static int zcp_get_prop(lua_State *state);
static zcp_lib_info_t zcp_get_prop_info = {
.name = "get_prop",
.func = zcp_get_prop,
.pargs = {
{ .za_name = "dataset", .za_lua_type = LUA_TSTRING},
{ .za_name = "property", .za_lua_type = LUA_TSTRING},
{NULL, 0}
},
.kwargs = {
{NULL, 0}
}
};
static int
zcp_get_prop(lua_State *state)
{
const char *dataset_name;
const char *property_name;
dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
zcp_lib_info_t *libinfo = &zcp_get_prop_info;
zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
dataset_name = lua_tostring(state, 1);
property_name = lua_tostring(state, 2);
/* User defined property */
if (zfs_prop_user(property_name)) {
return (zcp_get_user_prop(state, dp,
dataset_name, property_name));
}
/* userspace property */
if (zfs_prop_userquota(property_name)) {
#ifdef _KERNEL
return (zcp_get_userquota_prop(state, dp,
dataset_name, property_name));
#else
return (luaL_error(state,
"user quota properties only supported in kernel mode",
property_name));
#endif
}
/* written@ property */
if (zfs_prop_written(property_name)) {
return (zcp_get_written_prop(state, dp,
dataset_name, property_name));
}
zfs_prop_t zfs_prop = zfs_name_to_prop(property_name);
/* Valid system property */
if (zfs_prop != ZPROP_INVAL) {
return (zcp_get_system_prop(state, dp, dataset_name,
zfs_prop));
}
/* Invalid property name */
return (luaL_error(state,
"'%s' is not a valid property", property_name));
}
int
zcp_load_get_lib(lua_State *state)
{
lua_pushcclosure(state, zcp_get_prop_info.func, 0);
lua_setfield(state, -2, zcp_get_prop_info.name);
return (1);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 332525)
@@ -1,6989 +1,7017 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
* Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
* Copyright 2017 RackTop Systems.
* Copyright (c) 2017 Datto Inc.
* Copyright 2016 Toomas Soome <tsoome@me.com>
*/
/*
* ZFS ioctls.
*
* This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
* pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
*
* There are two ways that we handle ioctls: the legacy way where almost
* all of the logic is in the ioctl callback, and the new way where most
* of the marshalling is handled in the common entry point, zfsdev_ioctl().
*
* Non-legacy ioctls should be registered by calling
* zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked
* from userland by lzc_ioctl().
*
* The registration arguments are as follows:
*
* const char *name
* The name of the ioctl. This is used for history logging. If the
* ioctl returns successfully (the callback returns 0), and allow_log
* is true, then a history log entry will be recorded with the input &
* output nvlists. The log entry can be printed with "zpool history -i".
*
* zfs_ioc_t ioc
* The ioctl request number, which userland will pass to ioctl(2).
* The ioctl numbers can change from release to release, because
* the caller (libzfs) must be matched to the kernel.
*
* zfs_secpolicy_func_t *secpolicy
* This function will be called before the zfs_ioc_func_t, to
* determine if this operation is permitted. It should return EPERM
* on failure, and 0 on success. Checks include determining if the
* dataset is visible in this zone, and if the user has either all
* zfs privileges in the zone (SYS_MOUNT), or has been granted permission
* to do this operation on this dataset with "zfs allow".
*
* zfs_ioc_namecheck_t namecheck
* This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
* name, a dataset name, or nothing. If the name is not well-formed,
* the ioctl will fail and the callback will not be called.
* Therefore, the callback can assume that the name is well-formed
* (e.g. is null-terminated, doesn't have more than one '@' character,
* doesn't have invalid characters).
*
* zfs_ioc_poolcheck_t pool_check
* This specifies requirements on the pool state. If the pool does
* not meet them (is suspended or is readonly), the ioctl will fail
* and the callback will not be called. If any checks are specified
* (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
* Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
* POOL_CHECK_READONLY).
*
* boolean_t smush_outnvlist
* If smush_outnvlist is true, then the output is presumed to be a
* list of errors, and it will be "smushed" down to fit into the
* caller's buffer, by removing some entries and replacing them with a
* single "N_MORE_ERRORS" entry indicating how many were removed. See
* nvlist_smush() for details. If smush_outnvlist is false, and the
* outnvlist does not fit into the userland-provided buffer, then the
* ioctl will fail with ENOMEM.
*
* zfs_ioc_func_t *func
* The callback function that will perform the operation.
*
* The callback should return 0 on success, or an error number on
* failure. If the function fails, the userland ioctl will return -1,
* and errno will be set to the callback's return value. The callback
* will be called with the following arguments:
*
* const char *name
* The name of the pool or dataset to operate on, from
* zfs_cmd_t:zc_name. The 'namecheck' argument specifies the
* expected type (pool, dataset, or none).
*
* nvlist_t *innvl
* The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or
* NULL if no input nvlist was provided. Changes to this nvlist are
* ignored. If the input nvlist could not be deserialized, the
* ioctl will fail and the callback will not be called.
*
* nvlist_t *outnvl
* The output nvlist, initially empty. The callback can fill it in,
* and it will be returned to userland by serializing it into
* zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization
* fails (e.g. because the caller didn't supply a large enough
* buffer), then the overall ioctl will fail. See the
* 'smush_nvlist' argument above for additional behaviors.
*
* There are two typical uses of the output nvlist:
* - To return state, e.g. property values. In this case,
* smush_outnvlist should be false. If the buffer was not large
* enough, the caller will reallocate a larger buffer and try
* the ioctl again.
*
* - To return multiple errors from an ioctl which makes on-disk
* changes. In this case, smush_outnvlist should be true.
* Ioctls which make on-disk modifications should generally not
* use the outnvl if they succeed, because the caller can not
* distinguish between the operation failing, and
* deserialization failing.
*/
#ifdef __FreeBSD__
#include "opt_kstack_pages.h"
#endif
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/dmu.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_deleg.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/sunddi.h>
#include <sys/policy.h>
#include <sys/zone.h>
#include <sys/nvpair.h>
#include <sys/mount.h>
#include <sys/taskqueue.h>
#include <sys/sdt.h>
#include <sys/varargs.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_bookmark.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zcp.h>
#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "zfs_deleg.h"
#include "zfs_comutil.h"
#include "zfs_ioctl_compat.h"
#include "lua.h"
#include "lauxlib.h"
static struct cdev *zfsdev;
extern void zfs_init(void);
extern void zfs_fini(void);
uint_t zfs_fsyncer_key;
extern uint_t rrw_tsd_key;
static uint_t zfs_allow_log_key;
extern uint_t zfs_geom_probe_vdev_key;
typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *);
typedef enum {
NO_NAME,
POOL_NAME,
DATASET_NAME
} zfs_ioc_namecheck_t;
typedef enum {
POOL_CHECK_NONE = 1 << 0,
POOL_CHECK_SUSPENDED = 1 << 1,
POOL_CHECK_READONLY = 1 << 2,
} zfs_ioc_poolcheck_t;
typedef struct zfs_ioc_vec {
zfs_ioc_legacy_func_t *zvec_legacy_func;
zfs_ioc_func_t *zvec_func;
zfs_secpolicy_func_t *zvec_secpolicy;
zfs_ioc_namecheck_t zvec_namecheck;
boolean_t zvec_allow_log;
zfs_ioc_poolcheck_t zvec_pool_check;
boolean_t zvec_smush_outnvlist;
const char *zvec_name;
} zfs_ioc_vec_t;
/* This array is indexed by zfs_userquota_prop_t */
static const char *userquota_perms[] = {
ZFS_DELEG_PERM_USERUSED,
ZFS_DELEG_PERM_USERQUOTA,
ZFS_DELEG_PERM_GROUPUSED,
ZFS_DELEG_PERM_GROUPQUOTA,
};
static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
static int zfs_check_settable(const char *name, nvpair_t *property,
cred_t *cr);
static int zfs_check_clearable(char *dataset, nvlist_t *props,
nvlist_t **errors);
static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
boolean_t *);
int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
static void zfsdev_close(void *data);
static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
void
__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
{
const char *newfile;
char buf[512];
va_list adx;
/*
* Get rid of annoying "../common/" prefix to filename.
*/
newfile = strrchr(file, '/');
if (newfile != NULL) {
newfile = newfile + 1; /* Get rid of leading / */
} else {
newfile = file;
}
va_start(adx, fmt);
(void) vsnprintf(buf, sizeof (buf), fmt, adx);
va_end(adx);
/*
* To get this data, use the zfs-dprintf probe as so:
* dtrace -q -n 'zfs-dprintf \
* /stringof(arg0) == "dbuf.c"/ \
* {printf("%s: %s", stringof(arg1), stringof(arg3))}'
* arg0 = file name
* arg1 = function name
* arg2 = line number
* arg3 = message
*/
DTRACE_PROBE4(zfs__dprintf,
char *, newfile, char *, func, int, line, char *, buf);
}
static void
history_str_free(char *buf)
{
kmem_free(buf, HIS_MAX_RECORD_LEN);
}
static char *
history_str_get(zfs_cmd_t *zc)
{
char *buf;
if (zc->zc_history == 0)
return (NULL);
buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
if (copyinstr((void *)(uintptr_t)zc->zc_history,
buf, HIS_MAX_RECORD_LEN, NULL) != 0) {
history_str_free(buf);
return (NULL);
}
buf[HIS_MAX_RECORD_LEN -1] = '\0';
return (buf);
}
/*
* Check to see if the named dataset is currently defined as bootable
*/
static boolean_t
zfs_is_bootfs(const char *name)
{
objset_t *os;
if (dmu_objset_hold(name, FTAG, &os) == 0) {
boolean_t ret;
ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
dmu_objset_rele(os, FTAG);
return (ret);
}
return (B_FALSE);
}
/*
* Return non-zero if the spa version is less than requested version.
*/
static int
zfs_earlier_version(const char *name, int version)
{
spa_t *spa;
if (spa_open(name, &spa, FTAG) == 0) {
if (spa_version(spa) < version) {
spa_close(spa, FTAG);
return (1);
}
spa_close(spa, FTAG);
}
return (0);
}
/*
* Return TRUE if the ZPL version is less than requested version.
*/
static boolean_t
zpl_earlier_version(const char *name, int version)
{
objset_t *os;
boolean_t rc = B_TRUE;
if (dmu_objset_hold(name, FTAG, &os) == 0) {
uint64_t zplversion;
if (dmu_objset_type(os) != DMU_OST_ZFS) {
dmu_objset_rele(os, FTAG);
return (B_TRUE);
}
/* XXX reading from non-owned objset */
if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
rc = zplversion < version;
dmu_objset_rele(os, FTAG);
}
return (rc);
}
static void
zfs_log_history(zfs_cmd_t *zc)
{
spa_t *spa;
char *buf;
if ((buf = history_str_get(zc)) == NULL)
return;
if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
(void) spa_history_log(spa, buf);
spa_close(spa, FTAG);
}
history_str_free(buf);
}
/*
* Policy for top-level read operations (list pools). Requires no privileges,
* and can be used in the local zone, as there is no associated dataset.
*/
/* ARGSUSED */
static int
zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (0);
}
/*
* Policy for dataset read operations (list children, get statistics). Requires
* no privileges, but must be visible in the local zone.
*/
/* ARGSUSED */
static int
zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
if (INGLOBALZONE(curthread) ||
zone_dataset_visible(zc->zc_name, NULL))
return (0);
return (SET_ERROR(ENOENT));
}
static int
zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
{
int writable = 1;
/*
* The dataset must be visible by this zone -- check this first
* so they don't see EPERM on something they shouldn't know about.
*/
if (!INGLOBALZONE(curthread) &&
!zone_dataset_visible(dataset, &writable))
return (SET_ERROR(ENOENT));
if (INGLOBALZONE(curthread)) {
/*
* If the fs is zoned, only root can access it from the
* global zone.
*/
if (secpolicy_zfs(cr) && zoned)
return (SET_ERROR(EPERM));
} else {
/*
* If we are in a local zone, the 'zoned' property must be set.
*/
if (!zoned)
return (SET_ERROR(EPERM));
/* must be writable by this zone */
if (!writable)
return (SET_ERROR(EPERM));
}
return (0);
}
static int
zfs_dozonecheck(const char *dataset, cred_t *cr)
{
uint64_t zoned;
if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
return (SET_ERROR(ENOENT));
return (zfs_dozonecheck_impl(dataset, zoned, cr));
}
static int
zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
{
uint64_t zoned;
if (dsl_prop_get_int_ds(ds, "jailed", &zoned))
return (SET_ERROR(ENOENT));
return (zfs_dozonecheck_impl(dataset, zoned, cr));
}
static int
zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
const char *perm, cred_t *cr)
{
int error;
error = zfs_dozonecheck_ds(name, ds, cr);
if (error == 0) {
error = secpolicy_zfs(cr);
if (error != 0)
error = dsl_deleg_access_impl(ds, perm, cr);
}
return (error);
}
static int
zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
{
int error;
dsl_dataset_t *ds;
dsl_pool_t *dp;
/*
* First do a quick check for root in the global zone, which
* is allowed to do all write_perms. This ensures that zfs_ioc_*
* will get to handle nonexistent datasets.
*/
if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0)
return (0);
error = dsl_pool_hold(name, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold(dp, name, FTAG, &ds);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
dsl_dataset_rele(ds, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
#ifdef SECLABEL
/*
* Policy for setting the security label property.
*
* Returns 0 for success, non-zero for access and other errors.
*/
static int
zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
{
char ds_hexsl[MAXNAMELEN];
bslabel_t ds_sl, new_sl;
boolean_t new_default = FALSE;
uint64_t zoned;
int needed_priv = -1;
int error;
/* First get the existing dataset label. */
error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1, sizeof (ds_hexsl), &ds_hexsl, NULL);
if (error != 0)
return (SET_ERROR(EPERM));
if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
new_default = TRUE;
/* The label must be translatable */
if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
return (SET_ERROR(EINVAL));
/*
* In a non-global zone, disallow attempts to set a label that
* doesn't match that of the zone; otherwise no other checks
* are needed.
*/
if (!INGLOBALZONE(curproc)) {
if (new_default || !blequal(&new_sl, CR_SL(CRED())))
return (SET_ERROR(EPERM));
return (0);
}
/*
* For global-zone datasets (i.e., those whose zoned property is
* "off", verify that the specified new label is valid for the
* global zone.
*/
if (dsl_prop_get_integer(name,
zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
return (SET_ERROR(EPERM));
if (!zoned) {
if (zfs_check_global_label(name, strval) != 0)
return (SET_ERROR(EPERM));
}
/*
* If the existing dataset label is nondefault, check if the
* dataset is mounted (label cannot be changed while mounted).
* Get the zfsvfs; if there isn't one, then the dataset isn't
* mounted (or isn't a dataset, doesn't exist, ...).
*/
if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
objset_t *os;
static char *setsl_tag = "setsl_tag";
/*
* Try to own the dataset; abort if there is any error,
* (e.g., already mounted, in use, or other error).
*/
error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
setsl_tag, &os);
if (error != 0)
return (SET_ERROR(EPERM));
dmu_objset_disown(os, setsl_tag);
if (new_default) {
needed_priv = PRIV_FILE_DOWNGRADE_SL;
goto out_check;
}
if (hexstr_to_label(strval, &new_sl) != 0)
return (SET_ERROR(EPERM));
if (blstrictdom(&ds_sl, &new_sl))
needed_priv = PRIV_FILE_DOWNGRADE_SL;
else if (blstrictdom(&new_sl, &ds_sl))
needed_priv = PRIV_FILE_UPGRADE_SL;
} else {
/* dataset currently has a default label */
if (!new_default)
needed_priv = PRIV_FILE_UPGRADE_SL;
}
out_check:
if (needed_priv != -1)
return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
return (0);
}
#endif /* SECLABEL */
static int
zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
cred_t *cr)
{
char *strval;
/*
* Check permissions for special properties.
*/
switch (prop) {
case ZFS_PROP_ZONED:
/*
* Disallow setting of 'zoned' from within a local zone.
*/
if (!INGLOBALZONE(curthread))
return (SET_ERROR(EPERM));
break;
case ZFS_PROP_QUOTA:
case ZFS_PROP_FILESYSTEM_LIMIT:
case ZFS_PROP_SNAPSHOT_LIMIT:
if (!INGLOBALZONE(curthread)) {
uint64_t zoned;
char setpoint[ZFS_MAX_DATASET_NAME_LEN];
/*
* Unprivileged users are allowed to modify the
* limit on things *under* (ie. contained by)
* the thing they own.
*/
if (dsl_prop_get_integer(dsname, "jailed", &zoned,
setpoint))
return (SET_ERROR(EPERM));
if (!zoned || strlen(dsname) <= strlen(setpoint))
return (SET_ERROR(EPERM));
}
break;
case ZFS_PROP_MLSLABEL:
#ifdef SECLABEL
if (!is_system_labeled())
return (SET_ERROR(EPERM));
if (nvpair_value_string(propval, &strval) == 0) {
int err;
err = zfs_set_slabel_policy(dsname, strval, CRED());
if (err != 0)
return (err);
}
#else
return (EOPNOTSUPP);
#endif
break;
}
return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
}
/* ARGSUSED */
static int
zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
int error;
error = zfs_dozonecheck(zc->zc_name, cr);
if (error != 0)
return (error);
/*
* permission to set permissions will be evaluated later in
* dsl_deleg_can_allow()
*/
return (0);
}
/* ARGSUSED */
static int
zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_ROLLBACK, cr));
}
/* ARGSUSED */
static int
zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
char *cp;
int error;
/*
* Generate the current snapshot name from the given objsetid, then
* use that name for the secpolicy/zone checks.
*/
cp = strchr(zc->zc_name, '@');
if (cp == NULL)
return (SET_ERROR(EINVAL));
error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
dsl_dataset_name(ds, zc->zc_name);
error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
ZFS_DELEG_PERM_SEND, cr);
dsl_dataset_rele(ds, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
/* ARGSUSED */
static int
zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_SEND, cr));
}
/* ARGSUSED */
static int
zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
vnode_t *vp;
int error;
if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
NO_FOLLOW, NULL, &vp)) != 0)
return (error);
/* Now make sure mntpnt and dataset are ZFS */
if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
(strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
zc->zc_name) != 0)) {
VN_RELE(vp);
return (SET_ERROR(EPERM));
}
VN_RELE(vp);
return (dsl_deleg_access(zc->zc_name,
ZFS_DELEG_PERM_SHARE, cr));
}
int
zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
if (!INGLOBALZONE(curthread))
return (SET_ERROR(EPERM));
if (secpolicy_nfs(cr) == 0) {
return (0);
} else {
return (zfs_secpolicy_deleg_share(zc, innvl, cr));
}
}
int
zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
if (!INGLOBALZONE(curthread))
return (SET_ERROR(EPERM));
if (secpolicy_smb(cr) == 0) {
return (0);
} else {
return (zfs_secpolicy_deleg_share(zc, innvl, cr));
}
}
static int
zfs_get_parent(const char *datasetname, char *parent, int parentsize)
{
char *cp;
/*
* Remove the @bla or /bla from the end of the name to get the parent.
*/
(void) strncpy(parent, datasetname, parentsize);
cp = strrchr(parent, '@');
if (cp != NULL) {
cp[0] = '\0';
} else {
cp = strrchr(parent, '/');
if (cp == NULL)
return (SET_ERROR(ENOENT));
cp[0] = '\0';
}
return (0);
}
int
zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
{
int error;
if ((error = zfs_secpolicy_write_perms(name,
ZFS_DELEG_PERM_MOUNT, cr)) != 0)
return (error);
return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr));
}
/* ARGSUSED */
static int
zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
}
/*
* Destroying snapshots with delegated permissions requires
* descendant mount and destroy permissions.
*/
/* ARGSUSED */
static int
zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
nvlist_t *snaps;
nvpair_t *pair, *nextpair;
int error = 0;
if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
return (SET_ERROR(EINVAL));
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nextpair) {
nextpair = nvlist_next_nvpair(snaps, pair);
error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
if (error == ENOENT) {
/*
* Ignore any snapshots that don't exist (we consider
* them "already destroyed"). Remove the name from the
* nvl here in case the snapshot is created between
* now and when we try to destroy it (in which case
* we don't want to destroy it since we haven't
* checked for permission).
*/
fnvlist_remove_nvpair(snaps, pair);
error = 0;
}
if (error != 0)
break;
}
return (error);
}
int
zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
{
char parentname[ZFS_MAX_DATASET_NAME_LEN];
int error;
if ((error = zfs_secpolicy_write_perms(from,
ZFS_DELEG_PERM_RENAME, cr)) != 0)
return (error);
if ((error = zfs_secpolicy_write_perms(from,
ZFS_DELEG_PERM_MOUNT, cr)) != 0)
return (error);
if ((error = zfs_get_parent(to, parentname,
sizeof (parentname))) != 0)
return (error);
if ((error = zfs_secpolicy_write_perms(parentname,
ZFS_DELEG_PERM_CREATE, cr)) != 0)
return (error);
if ((error = zfs_secpolicy_write_perms(parentname,
ZFS_DELEG_PERM_MOUNT, cr)) != 0)
return (error);
return (error);
}
/* ARGSUSED */
static int
zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
char *at = NULL;
int error;
if ((zc->zc_cookie & 1) != 0) {
/*
* This is recursive rename, so the starting snapshot might
* not exist. Check file system or volume permission instead.
*/
at = strchr(zc->zc_name, '@');
if (at == NULL)
return (EINVAL);
*at = '\0';
}
error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr);
if (at != NULL)
*at = '@';
return (error);
}
/* ARGSUSED */
static int
zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
dsl_pool_t *dp;
dsl_dataset_t *clone;
int error;
error = zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_PROMOTE, cr);
if (error != 0)
return (error);
error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
if (error == 0) {
char parentname[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_t *origin = NULL;
dsl_dir_t *dd;
dd = clone->ds_dir;
error = dsl_dataset_hold_obj(dd->dd_pool,
dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
if (error != 0) {
dsl_dataset_rele(clone, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
ZFS_DELEG_PERM_MOUNT, cr);
dsl_dataset_name(origin, parentname);
if (error == 0) {
error = zfs_secpolicy_write_perms_ds(parentname, origin,
ZFS_DELEG_PERM_PROMOTE, cr);
}
dsl_dataset_rele(clone, FTAG);
dsl_dataset_rele(origin, FTAG);
}
dsl_pool_rele(dp, FTAG);
return (error);
}
/* ARGSUSED */
static int
zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
int error;
if ((error = zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
return (error);
if ((error = zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_MOUNT, cr)) != 0)
return (error);
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_CREATE, cr));
}
int
zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
{
return (zfs_secpolicy_write_perms(name,
ZFS_DELEG_PERM_SNAPSHOT, cr));
}
/*
* Check for permission to create each snapshot in the nvlist.
*/
/* ARGSUSED */
static int
zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
nvlist_t *snaps;
int error;
nvpair_t *pair;
if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
return (SET_ERROR(EINVAL));
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
char *name = nvpair_name(pair);
char *atp = strchr(name, '@');
if (atp == NULL) {
error = SET_ERROR(EINVAL);
break;
}
*atp = '\0';
error = zfs_secpolicy_snapshot_perms(name, cr);
*atp = '@';
if (error != 0)
break;
}
return (error);
}
/*
* Check for permission to create each snapshot in the nvlist.
*/
/* ARGSUSED */
static int
zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
int error = 0;
for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
char *name = nvpair_name(pair);
char *hashp = strchr(name, '#');
if (hashp == NULL) {
error = SET_ERROR(EINVAL);
break;
}
*hashp = '\0';
error = zfs_secpolicy_write_perms(name,
ZFS_DELEG_PERM_BOOKMARK, cr);
*hashp = '#';
if (error != 0)
break;
}
return (error);
}
/* ARGSUSED */
static int
+zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_REMAP, cr));
+}
+
+/* ARGSUSED */
+static int
zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
nvpair_t *pair, *nextpair;
int error = 0;
for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
pair = nextpair) {
char *name = nvpair_name(pair);
char *hashp = strchr(name, '#');
nextpair = nvlist_next_nvpair(innvl, pair);
if (hashp == NULL) {
error = SET_ERROR(EINVAL);
break;
}
*hashp = '\0';
error = zfs_secpolicy_write_perms(name,
ZFS_DELEG_PERM_DESTROY, cr);
*hashp = '#';
if (error == ENOENT) {
/*
* Ignore any filesystems that don't exist (we consider
* their bookmarks "already destroyed"). Remove
* the name from the nvl here in case the filesystem
* is created between now and when we try to destroy
* the bookmark (in which case we don't want to
* destroy it since we haven't checked for permission).
*/
fnvlist_remove_nvpair(innvl, pair);
error = 0;
}
if (error != 0)
break;
}
return (error);
}
/* ARGSUSED */
static int
zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
/*
* Even root must have a proper TSD so that we know what pool
* to log to.
*/
if (tsd_get(zfs_allow_log_key) == NULL)
return (SET_ERROR(EPERM));
return (0);
}
static int
zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
char parentname[ZFS_MAX_DATASET_NAME_LEN];
int error;
char *origin;
if ((error = zfs_get_parent(zc->zc_name, parentname,
sizeof (parentname))) != 0)
return (error);
if (nvlist_lookup_string(innvl, "origin", &origin) == 0 &&
(error = zfs_secpolicy_write_perms(origin,
ZFS_DELEG_PERM_CLONE, cr)) != 0)
return (error);
if ((error = zfs_secpolicy_write_perms(parentname,
ZFS_DELEG_PERM_CREATE, cr)) != 0)
return (error);
return (zfs_secpolicy_write_perms(parentname,
ZFS_DELEG_PERM_MOUNT, cr));
}
/*
* Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
* SYS_CONFIG privilege, which is not available in a local zone.
*/
/* ARGSUSED */
static int
zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
if (secpolicy_sys_config(cr, B_FALSE) != 0)
return (SET_ERROR(EPERM));
return (0);
}
/*
* Policy for object to name lookups.
*/
/* ARGSUSED */
static int
zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
int error;
if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
return (0);
error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
return (error);
}
/*
* Policy for fault injection. Requires all privileges.
*/
/* ARGSUSED */
static int
zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (secpolicy_zinject(cr));
}
/* ARGSUSED */
static int
zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
if (prop == ZPROP_INVAL) {
if (!zfs_prop_user(zc->zc_value))
return (SET_ERROR(EINVAL));
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_USERPROP, cr));
} else {
return (zfs_secpolicy_setprop(zc->zc_name, prop,
NULL, cr));
}
}
static int
zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
int err = zfs_secpolicy_read(zc, innvl, cr);
if (err)
return (err);
if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
return (SET_ERROR(EINVAL));
if (zc->zc_value[0] == 0) {
/*
* They are asking about a posix uid/gid. If it's
* themself, allow it.
*/
if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
zc->zc_objset_type == ZFS_PROP_USERQUOTA) {
if (zc->zc_guid == crgetuid(cr))
return (0);
} else {
if (groupmember(zc->zc_guid, cr))
return (0);
}
}
return (zfs_secpolicy_write_perms(zc->zc_name,
userquota_perms[zc->zc_objset_type], cr));
}
static int
zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
int err = zfs_secpolicy_read(zc, innvl, cr);
if (err)
return (err);
if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
return (SET_ERROR(EINVAL));
return (zfs_secpolicy_write_perms(zc->zc_name,
userquota_perms[zc->zc_objset_type], cr));
}
/* ARGSUSED */
static int
zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
NULL, cr));
}
/* ARGSUSED */
static int
zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
nvpair_t *pair;
nvlist_t *holds;
int error;
error = nvlist_lookup_nvlist(innvl, "holds", &holds);
if (error != 0)
return (SET_ERROR(EINVAL));
for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
pair = nvlist_next_nvpair(holds, pair)) {
char fsname[ZFS_MAX_DATASET_NAME_LEN];
error = dmu_fsname(nvpair_name(pair), fsname);
if (error != 0)
return (error);
error = zfs_secpolicy_write_perms(fsname,
ZFS_DELEG_PERM_HOLD, cr);
if (error != 0)
return (error);
}
return (0);
}
/* ARGSUSED */
static int
zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
nvpair_t *pair;
int error;
for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
pair = nvlist_next_nvpair(innvl, pair)) {
char fsname[ZFS_MAX_DATASET_NAME_LEN];
error = dmu_fsname(nvpair_name(pair), fsname);
if (error != 0)
return (error);
error = zfs_secpolicy_write_perms(fsname,
ZFS_DELEG_PERM_RELEASE, cr);
if (error != 0)
return (error);
}
return (0);
}
/*
* Policy for allowing temporary snapshots to be taken or released
*/
static int
zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
/*
* A temporary snapshot is the same as a snapshot,
* hold, destroy and release all rolled into one.
* Delegated diff alone is sufficient that we allow this.
*/
int error;
if ((error = zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_DIFF, cr)) == 0)
return (0);
error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
if (error == 0)
error = zfs_secpolicy_hold(zc, innvl, cr);
if (error == 0)
error = zfs_secpolicy_release(zc, innvl, cr);
if (error == 0)
error = zfs_secpolicy_destroy(zc, innvl, cr);
return (error);
}
/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
static int
get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
{
char *packed;
int error;
nvlist_t *list = NULL;
/*
* Read in and unpack the user-supplied nvlist.
*/
if (size == 0)
return (SET_ERROR(EINVAL));
packed = kmem_alloc(size, KM_SLEEP);
if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
iflag)) != 0) {
kmem_free(packed, size);
return (SET_ERROR(EFAULT));
}
if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
kmem_free(packed, size);
return (error);
}
kmem_free(packed, size);
*nvp = list;
return (0);
}
/*
* Reduce the size of this nvlist until it can be serialized in 'max' bytes.
* Entries will be removed from the end of the nvlist, and one int32 entry
* named "N_MORE_ERRORS" will be added indicating how many entries were
* removed.
*/
static int
nvlist_smush(nvlist_t *errors, size_t max)
{
size_t size;
size = fnvlist_size(errors);
if (size > max) {
nvpair_t *more_errors;
int n = 0;
if (max < 1024)
return (SET_ERROR(ENOMEM));
fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
more_errors = nvlist_prev_nvpair(errors, NULL);
do {
nvpair_t *pair = nvlist_prev_nvpair(errors,
more_errors);
fnvlist_remove_nvpair(errors, pair);
n++;
size = fnvlist_size(errors);
} while (size > max);
fnvlist_remove_nvpair(errors, more_errors);
fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n);
ASSERT3U(fnvlist_size(errors), <=, max);
}
return (0);
}
static int
put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
{
char *packed = NULL;
int error = 0;
size_t size;
size = fnvlist_size(nvl);
if (size > zc->zc_nvlist_dst_size) {
/*
* Solaris returns ENOMEM here, because even if an error is
* returned from an ioctl(2), new zc_nvlist_dst_size will be
* passed to the userland. This is not the case for FreeBSD.
* We need to return 0, so the kernel will copy the
* zc_nvlist_dst_size back and the userland can discover that a
* bigger buffer is needed.
*/
error = 0;
} else {
packed = fnvlist_pack(nvl, &size);
if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
size, zc->zc_iflags) != 0)
error = SET_ERROR(EFAULT);
fnvlist_pack_free(packed, size);
}
zc->zc_nvlist_dst_size = size;
zc->zc_nvlist_dst_filled = B_TRUE;
return (error);
}
int
getzfsvfs_impl(objset_t *os, vfs_t **vfsp)
{
zfsvfs_t *zfvp;
int error = 0;
if (dmu_objset_type(os) != DMU_OST_ZFS) {
return (SET_ERROR(EINVAL));
}
mutex_enter(&os->os_user_ptr_lock);
zfvp = dmu_objset_get_user(os);
if (zfvp) {
*vfsp = zfvp->z_vfs;
vfs_ref(zfvp->z_vfs);
} else {
error = SET_ERROR(ESRCH);
}
mutex_exit(&os->os_user_ptr_lock);
return (error);
}
int
getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
{
objset_t *os;
vfs_t *vfsp;
int error;
error = dmu_objset_hold(dsname, FTAG, &os);
if (error != 0)
return (error);
error = getzfsvfs_impl(os, &vfsp);
dmu_objset_rele(os, FTAG);
if (error != 0)
return (error);
error = vfs_busy(vfsp, 0);
vfs_rel(vfsp);
if (error != 0) {
*zfvp = NULL;
error = SET_ERROR(ESRCH);
} else {
*zfvp = vfsp->vfs_data;
}
return (error);
}
/*
* Find a zfsvfs_t for a mounted filesystem, or create our own, in which
* case its z_vfs will be NULL, and it will be opened as the owner.
* If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
* which prevents all vnode ops from running.
*/
static int
zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
{
int error = 0;
if (getzfsvfs(name, zfvp) != 0)
error = zfsvfs_create(name, zfvp);
if (error == 0) {
rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
RW_READER, tag);
#ifdef illumos
if ((*zfvp)->z_unmounted) {
/*
* XXX we could probably try again, since the unmounting
* thread should be just about to disassociate the
* objset from the zfsvfs.
*/
rrm_exit(&(*zfvp)->z_teardown_lock, tag);
return (SET_ERROR(EBUSY));
}
#else
/*
* vfs_busy() ensures that the filesystem is not and
* can not be unmounted.
*/
ASSERT(!(*zfvp)->z_unmounted);
#endif
}
return (error);
}
static void
zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
{
rrm_exit(&zfsvfs->z_teardown_lock, tag);
if (zfsvfs->z_vfs) {
#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
#else
vfs_unbusy(zfsvfs->z_vfs);
#endif
} else {
dmu_objset_disown(zfsvfs->z_os, zfsvfs);
zfsvfs_free(zfsvfs);
}
}
static int
zfs_ioc_pool_create(zfs_cmd_t *zc)
{
int error;
nvlist_t *config, *props = NULL;
nvlist_t *rootprops = NULL;
nvlist_t *zplprops = NULL;
if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config))
return (error);
if (zc->zc_nvlist_src_size != 0 && (error =
get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &props))) {
nvlist_free(config);
return (error);
}
if (props) {
nvlist_t *nvl = NULL;
uint64_t version = SPA_VERSION;
(void) nvlist_lookup_uint64(props,
zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
if (!SPA_VERSION_IS_SUPPORTED(version)) {
error = SET_ERROR(EINVAL);
goto pool_props_bad;
}
(void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl);
if (nvl) {
error = nvlist_dup(nvl, &rootprops, KM_SLEEP);
if (error != 0) {
nvlist_free(config);
nvlist_free(props);
return (error);
}
(void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
}
VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
error = zfs_fill_zplprops_root(version, rootprops,
zplprops, NULL);
if (error != 0)
goto pool_props_bad;
}
error = spa_create(zc->zc_name, config, props, zplprops);
/*
* Set the remaining root properties
*/
if (!error && (error = zfs_set_prop_nvlist(zc->zc_name,
ZPROP_SRC_LOCAL, rootprops, NULL)) != 0)
(void) spa_destroy(zc->zc_name);
pool_props_bad:
nvlist_free(rootprops);
nvlist_free(zplprops);
nvlist_free(config);
nvlist_free(props);
return (error);
}
static int
zfs_ioc_pool_destroy(zfs_cmd_t *zc)
{
int error;
zfs_log_history(zc);
error = spa_destroy(zc->zc_name);
if (error == 0)
zvol_remove_minors(zc->zc_name);
return (error);
}
static int
zfs_ioc_pool_import(zfs_cmd_t *zc)
{
nvlist_t *config, *props = NULL;
uint64_t guid;
int error;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config)) != 0)
return (error);
if (zc->zc_nvlist_src_size != 0 && (error =
get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &props))) {
nvlist_free(config);
return (error);
}
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
guid != zc->zc_guid)
error = SET_ERROR(EINVAL);
else
error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
if (zc->zc_nvlist_dst != 0) {
int err;
if ((err = put_nvlist(zc, config)) != 0)
error = err;
}
nvlist_free(config);
nvlist_free(props);
return (error);
}
static int
zfs_ioc_pool_export(zfs_cmd_t *zc)
{
int error;
boolean_t force = (boolean_t)zc->zc_cookie;
boolean_t hardforce = (boolean_t)zc->zc_guid;
zfs_log_history(zc);
error = spa_export(zc->zc_name, NULL, force, hardforce);
if (error == 0)
zvol_remove_minors(zc->zc_name);
return (error);
}
static int
zfs_ioc_pool_configs(zfs_cmd_t *zc)
{
nvlist_t *configs;
int error;
if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
return (SET_ERROR(EEXIST));
error = put_nvlist(zc, configs);
nvlist_free(configs);
return (error);
}
/*
* inputs:
* zc_name name of the pool
*
* outputs:
* zc_cookie real errno
* zc_nvlist_dst config nvlist
* zc_nvlist_dst_size size of config nvlist
*/
static int
zfs_ioc_pool_stats(zfs_cmd_t *zc)
{
nvlist_t *config;
int error;
int ret = 0;
error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
sizeof (zc->zc_value));
if (config != NULL) {
ret = put_nvlist(zc, config);
nvlist_free(config);
/*
* The config may be present even if 'error' is non-zero.
* In this case we return success, and preserve the real errno
* in 'zc_cookie'.
*/
zc->zc_cookie = error;
} else {
ret = error;
}
return (ret);
}
/*
* Try to import the given pool, returning pool stats as appropriate so that
* user land knows which devices are available and overall pool health.
*/
static int
zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
{
nvlist_t *tryconfig, *config;
int error;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &tryconfig)) != 0)
return (error);
config = spa_tryimport(tryconfig);
nvlist_free(tryconfig);
if (config == NULL)
return (SET_ERROR(EINVAL));
error = put_nvlist(zc, config);
nvlist_free(config);
return (error);
}
/*
* inputs:
* zc_name name of the pool
* zc_cookie scan func (pool_scan_func_t)
* zc_flags scrub pause/resume flag (pool_scrub_cmd_t)
*/
static int
zfs_ioc_pool_scan(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
if (zc->zc_flags >= POOL_SCRUB_FLAGS_END)
return (SET_ERROR(EINVAL));
if (zc->zc_flags == POOL_SCRUB_PAUSE)
error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
else if (zc->zc_cookie == POOL_SCAN_NONE)
error = spa_scan_stop(spa);
else
error = spa_scan(spa, zc->zc_cookie);
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_pool_freeze(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error == 0) {
spa_freeze(spa);
spa_close(spa, FTAG);
}
return (error);
}
static int
zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
if (zc->zc_cookie < spa_version(spa) ||
!SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
spa_close(spa, FTAG);
return (SET_ERROR(EINVAL));
}
spa_upgrade(spa, zc->zc_cookie);
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_pool_get_history(zfs_cmd_t *zc)
{
spa_t *spa;
char *hist_buf;
uint64_t size;
int error;
if ((size = zc->zc_history_len) == 0)
return (SET_ERROR(EINVAL));
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
spa_close(spa, FTAG);
return (SET_ERROR(ENOTSUP));
}
hist_buf = kmem_alloc(size, KM_SLEEP);
if ((error = spa_history_get(spa, &zc->zc_history_offset,
&zc->zc_history_len, hist_buf)) == 0) {
error = ddi_copyout(hist_buf,
(void *)(uintptr_t)zc->zc_history,
zc->zc_history_len, zc->zc_iflags);
}
spa_close(spa, FTAG);
kmem_free(hist_buf, size);
return (error);
}
static int
zfs_ioc_pool_reguid(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error == 0) {
error = spa_change_guid(spa);
spa_close(spa, FTAG);
}
return (error);
}
static int
zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
{
return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
}
/*
* inputs:
* zc_name name of filesystem
* zc_obj object to find
*
* outputs:
* zc_value name of object
*/
static int
zfs_ioc_obj_to_path(zfs_cmd_t *zc)
{
objset_t *os;
int error;
/* XXX reading from objset not owned */
if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
return (error);
if (dmu_objset_type(os) != DMU_OST_ZFS) {
dmu_objset_rele(os, FTAG);
return (SET_ERROR(EINVAL));
}
error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
sizeof (zc->zc_value));
dmu_objset_rele(os, FTAG);
return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_obj object to find
*
* outputs:
* zc_stat stats on object
* zc_value path to object
*/
static int
zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
{
objset_t *os;
int error;
/* XXX reading from objset not owned */
if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
return (error);
if (dmu_objset_type(os) != DMU_OST_ZFS) {
dmu_objset_rele(os, FTAG);
return (SET_ERROR(EINVAL));
}
error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
sizeof (zc->zc_value));
dmu_objset_rele(os, FTAG);
return (error);
}
static int
zfs_ioc_vdev_add(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
nvlist_t *config, **l2cache, **spares;
uint_t nl2cache = 0, nspares = 0;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config);
(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache);
(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
&spares, &nspares);
#ifdef illumos
/*
* A root pool with concatenated devices is not supported.
* Thus, can not add a device to a root pool.
*
* Intent log device can not be added to a rootpool because
* during mountroot, zil is replayed, a seperated log device
* can not be accessed during the mountroot time.
*
* l2cache and spare devices are ok to be added to a rootpool.
*/
if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
nvlist_free(config);
spa_close(spa, FTAG);
return (SET_ERROR(EDOM));
}
#endif /* illumos */
if (error == 0) {
error = spa_vdev_add(spa, config);
nvlist_free(config);
}
spa_close(spa, FTAG);
return (error);
}
/*
* inputs:
* zc_name name of the pool
- * zc_nvlist_conf nvlist of devices to remove
- * zc_cookie to stop the remove?
+ * zc_guid guid of vdev to remove
+ * zc_cookie cancel removal
*/
static int
zfs_ioc_vdev_remove(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
- error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
+ if (zc->zc_cookie != 0) {
+ error = spa_vdev_remove_cancel(spa);
+ } else {
+ error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
+ }
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
vdev_state_t newstate = VDEV_STATE_UNKNOWN;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
switch (zc->zc_cookie) {
case VDEV_STATE_ONLINE:
error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
break;
case VDEV_STATE_OFFLINE:
error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
break;
case VDEV_STATE_FAULTED:
if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
zc->zc_obj != VDEV_AUX_EXTERNAL)
zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
break;
case VDEV_STATE_DEGRADED:
if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
zc->zc_obj != VDEV_AUX_EXTERNAL)
zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
break;
default:
error = SET_ERROR(EINVAL);
}
zc->zc_cookie = newstate;
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_vdev_attach(zfs_cmd_t *zc)
{
spa_t *spa;
int replacing = zc->zc_cookie;
nvlist_t *config;
int error;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config)) == 0) {
error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
nvlist_free(config);
}
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_vdev_detach(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_vdev_split(zfs_cmd_t *zc)
{
spa_t *spa;
nvlist_t *config, *props = NULL;
int error;
boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config)) {
spa_close(spa, FTAG);
return (error);
}
if (zc->zc_nvlist_src_size != 0 && (error =
get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &props))) {
spa_close(spa, FTAG);
nvlist_free(config);
return (error);
}
error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);
spa_close(spa, FTAG);
nvlist_free(config);
nvlist_free(props);
return (error);
}
static int
zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
{
spa_t *spa;
char *path = zc->zc_value;
uint64_t guid = zc->zc_guid;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
error = spa_vdev_setpath(spa, guid, path);
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
{
spa_t *spa;
char *fru = zc->zc_value;
uint64_t guid = zc->zc_guid;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
error = spa_vdev_setfru(spa, guid, fru);
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
{
int error = 0;
nvlist_t *nv;
dmu_objset_fast_stat(os, &zc->zc_objset_stats);
if (zc->zc_nvlist_dst != 0 &&
(error = dsl_prop_get_all(os, &nv)) == 0) {
dmu_objset_stats(os, nv);
/*
* NB: zvol_get_stats() will read the objset contents,
* which we aren't supposed to do with a
* DS_MODE_USER hold, because it could be
* inconsistent. So this is a bit of a workaround...
* XXX reading with out owning
*/
if (!zc->zc_objset_stats.dds_inconsistent &&
dmu_objset_type(os) == DMU_OST_ZVOL) {
error = zvol_get_stats(os, nv);
if (error == EIO)
return (error);
VERIFY0(error);
}
error = put_nvlist(zc, nv);
nvlist_free(nv);
}
return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_nvlist_dst_size size of buffer for property nvlist
*
* outputs:
* zc_objset_stats stats
* zc_nvlist_dst property nvlist
* zc_nvlist_dst_size size of property nvlist
*/
static int
zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
objset_t *os;
int error;
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error == 0) {
error = zfs_ioc_objset_stats_impl(zc, os);
dmu_objset_rele(os, FTAG);
}
if (error == ENOMEM)
error = 0;
return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_nvlist_dst_size size of buffer for property nvlist
*
* outputs:
* zc_nvlist_dst received property nvlist
* zc_nvlist_dst_size size of received property nvlist
*
* Gets received properties (distinct from local properties on or after
* SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
* local property values.
*/
static int
zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
{
int error = 0;
nvlist_t *nv;
/*
* Without this check, we would return local property values if the
* caller has not already received properties on or after
* SPA_VERSION_RECVD_PROPS.
*/
if (!dsl_prop_get_hasrecvd(zc->zc_name))
return (SET_ERROR(ENOTSUP));
if (zc->zc_nvlist_dst != 0 &&
(error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) {
error = put_nvlist(zc, nv);
nvlist_free(nv);
}
return (error);
}
static int
nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
{
uint64_t value;
int error;
/*
* zfs_get_zplprop() will either find a value or give us
* the default value (if there is one).
*/
if ((error = zfs_get_zplprop(os, prop, &value)) != 0)
return (error);
VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
return (0);
}
/*
* inputs:
* zc_name name of filesystem
* zc_nvlist_dst_size size of buffer for zpl property nvlist
*
* outputs:
* zc_nvlist_dst zpl property nvlist
* zc_nvlist_dst_size size of zpl property nvlist
*/
static int
zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
{
objset_t *os;
int err;
/* XXX reading without owning */
if (err = dmu_objset_hold(zc->zc_name, FTAG, &os))
return (err);
dmu_objset_fast_stat(os, &zc->zc_objset_stats);
/*
* NB: nvl_add_zplprop() will read the objset contents,
* which we aren't supposed to do with a DS_MODE_USER
* hold, because it could be inconsistent.
*/
if (zc->zc_nvlist_dst != 0 &&
!zc->zc_objset_stats.dds_inconsistent &&
dmu_objset_type(os) == DMU_OST_ZFS) {
nvlist_t *nv;
VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 &&
(err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 &&
(err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 &&
(err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0)
err = put_nvlist(zc, nv);
nvlist_free(nv);
} else {
err = SET_ERROR(ENOENT);
}
dmu_objset_rele(os, FTAG);
return (err);
}
boolean_t
dataset_name_hidden(const char *name)
{
/*
* Skip over datasets that are not visible in this zone,
* internal datasets (which have a $ in their name), and
* temporary datasets (which have a % in their name).
*/
if (strchr(name, '$') != NULL)
return (B_TRUE);
if (strchr(name, '%') != NULL)
return (B_TRUE);
if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL))
return (B_TRUE);
return (B_FALSE);
}
/*
* inputs:
* zc_name name of filesystem
* zc_cookie zap cursor
* zc_nvlist_dst_size size of buffer for property nvlist
*
* outputs:
* zc_name name of next filesystem
* zc_cookie zap cursor
* zc_objset_stats stats
* zc_nvlist_dst property nvlist
* zc_nvlist_dst_size size of property nvlist
*/
static int
zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
{
objset_t *os;
int error;
char *p;
size_t orig_len = strlen(zc->zc_name);
top:
if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) {
if (error == ENOENT)
error = SET_ERROR(ESRCH);
return (error);
}
p = strrchr(zc->zc_name, '/');
if (p == NULL || p[1] != '\0')
(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
p = zc->zc_name + strlen(zc->zc_name);
do {
error = dmu_dir_list_next(os,
sizeof (zc->zc_name) - (p - zc->zc_name), p,
NULL, &zc->zc_cookie);
if (error == ENOENT)
error = SET_ERROR(ESRCH);
} while (error == 0 && dataset_name_hidden(zc->zc_name));
dmu_objset_rele(os, FTAG);
/*
* If it's an internal dataset (ie. with a '$' in its name),
* don't try to get stats for it, otherwise we'll return ENOENT.
*/
if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
error = zfs_ioc_objset_stats(zc); /* fill in the stats */
if (error == ENOENT) {
/* We lost a race with destroy, get the next one. */
zc->zc_name[orig_len] = '\0';
goto top;
}
}
return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_cookie zap cursor
* zc_nvlist_dst_size size of buffer for property nvlist
* zc_simple when set, only name is requested
*
* outputs:
* zc_name name of next snapshot
* zc_objset_stats stats
* zc_nvlist_dst property nvlist
* zc_nvlist_dst_size size of property nvlist
*/
static int
zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
{
objset_t *os;
int error;
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error != 0) {
return (error == ENOENT ? ESRCH : error);
}
/*
* A dataset name of maximum length cannot have any snapshots,
* so exit immediately.
*/
if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
ZFS_MAX_DATASET_NAME_LEN) {
dmu_objset_rele(os, FTAG);
return (SET_ERROR(ESRCH));
}
error = dmu_snapshot_list_next(os,
sizeof (zc->zc_name) - strlen(zc->zc_name),
zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie,
NULL);
if (error == 0 && !zc->zc_simple) {
dsl_dataset_t *ds;
dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
if (error == 0) {
objset_t *ossnap;
error = dmu_objset_from_ds(ds, &ossnap);
if (error == 0)
error = zfs_ioc_objset_stats_impl(zc, ossnap);
dsl_dataset_rele(ds, FTAG);
}
} else if (error == ENOENT) {
error = SET_ERROR(ESRCH);
}
dmu_objset_rele(os, FTAG);
/* if we failed, undo the @ that we tacked on to zc_name */
if (error != 0)
*strchr(zc->zc_name, '@') = '\0';
return (error);
}
static int
zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
{
const char *propname = nvpair_name(pair);
uint64_t *valary;
unsigned int vallen;
const char *domain;
char *dash;
zfs_userquota_prop_t type;
uint64_t rid;
uint64_t quota;
zfsvfs_t *zfsvfs;
int err;
if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
nvlist_t *attrs;
VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
&pair) != 0)
return (SET_ERROR(EINVAL));
}
/*
* A correctly constructed propname is encoded as
* userquota@<rid>-<domain>.
*/
if ((dash = strchr(propname, '-')) == NULL ||
nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
vallen != 3)
return (SET_ERROR(EINVAL));
domain = dash + 1;
type = valary[0];
rid = valary[1];
quota = valary[2];
err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
if (err == 0) {
err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
zfsvfs_rele(zfsvfs, FTAG);
}
return (err);
}
/*
* If the named property is one that has a special function to set its value,
* return 0 on success and a positive error code on failure; otherwise if it is
* not one of the special properties handled by this function, return -1.
*
* XXX: It would be better for callers of the property interface if we handled
* these special cases in dsl_prop.c (in the dsl layer).
*/
static int
zfs_prop_set_special(const char *dsname, zprop_source_t source,
nvpair_t *pair)
{
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
uint64_t intval;
int err = -1;
if (prop == ZPROP_INVAL) {
if (zfs_prop_userquota(propname))
return (zfs_prop_set_userquota(dsname, pair));
return (-1);
}
if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
nvlist_t *attrs;
VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
&pair) == 0);
}
if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
return (-1);
VERIFY(0 == nvpair_value_uint64(pair, &intval));
switch (prop) {
case ZFS_PROP_QUOTA:
err = dsl_dir_set_quota(dsname, source, intval);
break;
case ZFS_PROP_REFQUOTA:
err = dsl_dataset_set_refquota(dsname, source, intval);
break;
case ZFS_PROP_FILESYSTEM_LIMIT:
case ZFS_PROP_SNAPSHOT_LIMIT:
if (intval == UINT64_MAX) {
/* clearing the limit, just do it */
err = 0;
} else {
err = dsl_dir_activate_fs_ss_limit(dsname);
}
/*
* Set err to -1 to force the zfs_set_prop_nvlist code down the
* default path to set the value in the nvlist.
*/
if (err == 0)
err = -1;
break;
case ZFS_PROP_RESERVATION:
err = dsl_dir_set_reservation(dsname, source, intval);
break;
case ZFS_PROP_REFRESERVATION:
err = dsl_dataset_set_refreservation(dsname, source, intval);
break;
case ZFS_PROP_VOLSIZE:
err = zvol_set_volsize(dsname, intval);
break;
case ZFS_PROP_VERSION:
{
zfsvfs_t *zfsvfs;
if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
break;
err = zfs_set_version(zfsvfs, intval);
zfsvfs_rele(zfsvfs, FTAG);
if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
zfs_cmd_t *zc;
zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
(void) strcpy(zc->zc_name, dsname);
(void) zfs_ioc_userspace_upgrade(zc);
kmem_free(zc, sizeof (zfs_cmd_t));
}
break;
}
default:
err = -1;
}
return (err);
}
/*
* This function is best effort. If it fails to set any of the given properties,
* it continues to set as many as it can and returns the last error
* encountered. If the caller provides a non-NULL errlist, it will be filled in
* with the list of names of all the properties that failed along with the
* corresponding error numbers.
*
* If every property is set successfully, zero is returned and errlist is not
* modified.
*/
int
zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
nvlist_t *errlist)
{
nvpair_t *pair;
nvpair_t *propval;
int rv = 0;
uint64_t intval;
char *strval;
nvlist_t *genericnvl = fnvlist_alloc();
nvlist_t *retrynvl = fnvlist_alloc();
retry:
pair = NULL;
while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
int err = 0;
/* decode the property value */
propval = pair;
if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
nvlist_t *attrs;
attrs = fnvpair_value_nvlist(pair);
if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
&propval) != 0)
err = SET_ERROR(EINVAL);
}
/* Validate value type */
if (err == 0 && prop == ZPROP_INVAL) {
if (zfs_prop_user(propname)) {
if (nvpair_type(propval) != DATA_TYPE_STRING)
err = SET_ERROR(EINVAL);
} else if (zfs_prop_userquota(propname)) {
if (nvpair_type(propval) !=
DATA_TYPE_UINT64_ARRAY)
err = SET_ERROR(EINVAL);
} else {
err = SET_ERROR(EINVAL);
}
} else if (err == 0) {
if (nvpair_type(propval) == DATA_TYPE_STRING) {
if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
err = SET_ERROR(EINVAL);
} else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
const char *unused;
intval = fnvpair_value_uint64(propval);
switch (zfs_prop_get_type(prop)) {
case PROP_TYPE_NUMBER:
break;
case PROP_TYPE_STRING:
err = SET_ERROR(EINVAL);
break;
case PROP_TYPE_INDEX:
if (zfs_prop_index_to_string(prop,
intval, &unused) != 0)
err = SET_ERROR(EINVAL);
break;
default:
cmn_err(CE_PANIC,
"unknown property type");
}
} else {
err = SET_ERROR(EINVAL);
}
}
/* Validate permissions */
if (err == 0)
err = zfs_check_settable(dsname, pair, CRED());
if (err == 0) {
err = zfs_prop_set_special(dsname, source, pair);
if (err == -1) {
/*
* For better performance we build up a list of
* properties to set in a single transaction.
*/
err = nvlist_add_nvpair(genericnvl, pair);
} else if (err != 0 && nvl != retrynvl) {
/*
* This may be a spurious error caused by
* receiving quota and reservation out of order.
* Try again in a second pass.
*/
err = nvlist_add_nvpair(retrynvl, pair);
}
}
if (err != 0) {
if (errlist != NULL)
fnvlist_add_int32(errlist, propname, err);
rv = err;
}
}
if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
nvl = retrynvl;
goto retry;
}
if (!nvlist_empty(genericnvl) &&
dsl_props_set(dsname, source, genericnvl) != 0) {
/*
* If this fails, we still want to set as many properties as we
* can, so try setting them individually.
*/
pair = NULL;
while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
const char *propname = nvpair_name(pair);
int err = 0;
propval = pair;
if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
nvlist_t *attrs;
attrs = fnvpair_value_nvlist(pair);
propval = fnvlist_lookup_nvpair(attrs,
ZPROP_VALUE);
}
if (nvpair_type(propval) == DATA_TYPE_STRING) {
strval = fnvpair_value_string(propval);
err = dsl_prop_set_string(dsname, propname,
source, strval);
} else {
intval = fnvpair_value_uint64(propval);
err = dsl_prop_set_int(dsname, propname, source,
intval);
}
if (err != 0) {
if (errlist != NULL) {
fnvlist_add_int32(errlist, propname,
err);
}
rv = err;
}
}
}
nvlist_free(genericnvl);
nvlist_free(retrynvl);
return (rv);
}
/*
* Check that all the properties are valid user properties.
*/
static int
zfs_check_userprops(const char *fsname, nvlist_t *nvl)
{
nvpair_t *pair = NULL;
int error = 0;
while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
const char *propname = nvpair_name(pair);
if (!zfs_prop_user(propname) ||
nvpair_type(pair) != DATA_TYPE_STRING)
return (SET_ERROR(EINVAL));
if (error = zfs_secpolicy_write_perms(fsname,
ZFS_DELEG_PERM_USERPROP, CRED()))
return (error);
if (strlen(propname) >= ZAP_MAXNAMELEN)
return (SET_ERROR(ENAMETOOLONG));
if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
return (E2BIG);
}
return (0);
}
static void
props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
{
nvpair_t *pair;
VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
pair = NULL;
while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
if (nvlist_exists(skipped, nvpair_name(pair)))
continue;
VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
}
}
static int
clear_received_props(const char *dsname, nvlist_t *props,
nvlist_t *skipped)
{
int err = 0;
nvlist_t *cleared_props = NULL;
props_skip(props, skipped, &cleared_props);
if (!nvlist_empty(cleared_props)) {
/*
* Acts on local properties until the dataset has received
* properties at least once on or after SPA_VERSION_RECVD_PROPS.
*/
zprop_source_t flags = (ZPROP_SRC_NONE |
(dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0));
err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL);
}
nvlist_free(cleared_props);
return (err);
}
/*
* inputs:
* zc_name name of filesystem
* zc_value name of property to set
* zc_nvlist_src{_size} nvlist of properties to apply
* zc_cookie received properties flag
*
* outputs:
* zc_nvlist_dst{_size} error for each unapplied received property
*/
static int
zfs_ioc_set_prop(zfs_cmd_t *zc)
{
nvlist_t *nvl;
boolean_t received = zc->zc_cookie;
zprop_source_t source = (received ? ZPROP_SRC_RECEIVED :
ZPROP_SRC_LOCAL);
nvlist_t *errors;
int error;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &nvl)) != 0)
return (error);
if (received) {
nvlist_t *origprops;
if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) {
(void) clear_received_props(zc->zc_name,
origprops, nvl);
nvlist_free(origprops);
}
error = dsl_prop_set_hasrecvd(zc->zc_name);
}
errors = fnvlist_alloc();
if (error == 0)
error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
if (zc->zc_nvlist_dst != 0 && errors != NULL) {
(void) put_nvlist(zc, errors);
}
nvlist_free(errors);
nvlist_free(nvl);
return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_value name of property to inherit
* zc_cookie revert to received value if TRUE
*
* outputs: none
*/
static int
zfs_ioc_inherit_prop(zfs_cmd_t *zc)
{
const char *propname = zc->zc_value;
zfs_prop_t prop = zfs_name_to_prop(propname);
boolean_t received = zc->zc_cookie;
zprop_source_t source = (received
? ZPROP_SRC_NONE /* revert to received value, if any */
: ZPROP_SRC_INHERITED); /* explicitly inherit */
if (received) {
nvlist_t *dummy;
nvpair_t *pair;
zprop_type_t type;
int err;
/*
* zfs_prop_set_special() expects properties in the form of an
* nvpair with type info.
*/
if (prop == ZPROP_INVAL) {
if (!zfs_prop_user(propname))
return (SET_ERROR(EINVAL));
type = PROP_TYPE_STRING;
} else if (prop == ZFS_PROP_VOLSIZE ||
prop == ZFS_PROP_VERSION) {
return (SET_ERROR(EINVAL));
} else {
type = zfs_prop_get_type(prop);
}
VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
switch (type) {
case PROP_TYPE_STRING:
VERIFY(0 == nvlist_add_string(dummy, propname, ""));
break;
case PROP_TYPE_NUMBER:
case PROP_TYPE_INDEX:
VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
break;
default:
nvlist_free(dummy);
return (SET_ERROR(EINVAL));
}
pair = nvlist_next_nvpair(dummy, NULL);
err = zfs_prop_set_special(zc->zc_name, source, pair);
nvlist_free(dummy);
if (err != -1)
return (err); /* special property already handled */
} else {
/*
* Only check this in the non-received case. We want to allow
* 'inherit -S' to revert non-inheritable properties like quota
* and reservation to the received or default values even though
* they are not considered inheritable.
*/
if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
return (SET_ERROR(EINVAL));
}
/* property name has been validated by zfs_secpolicy_inherit_prop() */
return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source));
}
static int
zfs_ioc_pool_set_props(zfs_cmd_t *zc)
{
nvlist_t *props;
spa_t *spa;
int error;
nvpair_t *pair;
if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &props))
return (error);
/*
* If the only property is the configfile, then just do a spa_lookup()
* to handle the faulted case.
*/
pair = nvlist_next_nvpair(props, NULL);
if (pair != NULL && strcmp(nvpair_name(pair),
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
nvlist_next_nvpair(props, pair) == NULL) {
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(zc->zc_name)) != NULL) {
spa_configfile_set(spa, props, B_FALSE);
- spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
}
mutex_exit(&spa_namespace_lock);
if (spa != NULL) {
nvlist_free(props);
return (0);
}
}
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
nvlist_free(props);
return (error);
}
error = spa_prop_set(spa, props);
nvlist_free(props);
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_pool_get_props(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
nvlist_t *nvp = NULL;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
/*
* If the pool is faulted, there may be properties we can still
* get (such as altroot and cachefile), so attempt to get them
* anyway.
*/
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(zc->zc_name)) != NULL)
error = spa_prop_get(spa, &nvp);
mutex_exit(&spa_namespace_lock);
} else {
error = spa_prop_get(spa, &nvp);
spa_close(spa, FTAG);
}
if (error == 0 && zc->zc_nvlist_dst != 0)
error = put_nvlist(zc, nvp);
else
error = SET_ERROR(EFAULT);
nvlist_free(nvp);
return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_nvlist_src{_size} nvlist of delegated permissions
* zc_perm_action allow/unallow flag
*
* outputs: none
*/
static int
zfs_ioc_set_fsacl(zfs_cmd_t *zc)
{
int error;
nvlist_t *fsaclnv = NULL;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &fsaclnv)) != 0)
return (error);
/*
* Verify nvlist is constructed correctly
*/
if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) {
nvlist_free(fsaclnv);
return (SET_ERROR(EINVAL));
}
/*
* If we don't have PRIV_SYS_MOUNT, then validate
* that user is allowed to hand out each permission in
* the nvlist(s)
*/
error = secpolicy_zfs(CRED());
if (error != 0) {
if (zc->zc_perm_action == B_FALSE) {
error = dsl_deleg_can_allow(zc->zc_name,
fsaclnv, CRED());
} else {
error = dsl_deleg_can_unallow(zc->zc_name,
fsaclnv, CRED());
}
}
if (error == 0)
error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action);
nvlist_free(fsaclnv);
return (error);
}
/*
* inputs:
* zc_name name of filesystem
*
* outputs:
* zc_nvlist_src{_size} nvlist of delegated permissions
*/
static int
zfs_ioc_get_fsacl(zfs_cmd_t *zc)
{
nvlist_t *nvp;
int error;
if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) {
error = put_nvlist(zc, nvp);
nvlist_free(nvp);
}
return (error);
}
/* ARGSUSED */
static void
zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
zfs_creat_t *zct = arg;
zfs_create_fs(os, cr, zct->zct_zplprops, tx);
}
#define ZFS_PROP_UNDEFINED ((uint64_t)-1)
/*
* inputs:
* os parent objset pointer (NULL if root fs)
* fuids_ok fuids allowed in this version of the spa?
* sa_ok SAs allowed in this version of the spa?
* createprops list of properties requested by creator
*
* outputs:
* zplprops values for the zplprops we attach to the master node object
* is_ci true if requested file system will be purely case-insensitive
*
* Determine the settings for utf8only, normalization and
* casesensitivity. Specific values may have been requested by the
* creator and/or we can inherit values from the parent dataset. If
* the file system is of too early a vintage, a creator can not
* request settings for these properties, even if the requested
* setting is the default value. We don't actually want to create dsl
* properties for these, so remove them from the source nvlist after
* processing.
*/
static int
zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
nvlist_t *zplprops, boolean_t *is_ci)
{
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
uint64_t u8 = ZFS_PROP_UNDEFINED;
ASSERT(zplprops != NULL);
if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS)
return (SET_ERROR(EINVAL));
/*
* Pull out creator prop choices, if any.
*/
if (createprops) {
(void) nvlist_lookup_uint64(createprops,
zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
(void) nvlist_lookup_uint64(createprops,
zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
(void) nvlist_remove_all(createprops,
zfs_prop_to_name(ZFS_PROP_NORMALIZE));
(void) nvlist_lookup_uint64(createprops,
zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
(void) nvlist_remove_all(createprops,
zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
(void) nvlist_lookup_uint64(createprops,
zfs_prop_to_name(ZFS_PROP_CASE), &sense);
(void) nvlist_remove_all(createprops,
zfs_prop_to_name(ZFS_PROP_CASE));
}
/*
* If the zpl version requested is whacky or the file system
* or pool is version is too "young" to support normalization
* and the creator tried to set a value for one of the props,
* error out.
*/
if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
(zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
(zplver >= ZPL_VERSION_SA && !sa_ok) ||
(zplver < ZPL_VERSION_NORMALIZATION &&
(norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
sense != ZFS_PROP_UNDEFINED)))
return (SET_ERROR(ENOTSUP));
/*
* Put the version in the zplprops
*/
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
if (norm == ZFS_PROP_UNDEFINED)
VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
/*
* If we're normalizing, names must always be valid UTF-8 strings.
*/
if (norm)
u8 = 1;
if (u8 == ZFS_PROP_UNDEFINED)
VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
if (sense == ZFS_PROP_UNDEFINED)
VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
if (is_ci)
*is_ci = (sense == ZFS_CASE_INSENSITIVE);
return (0);
}
static int
zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
nvlist_t *zplprops, boolean_t *is_ci)
{
boolean_t fuids_ok, sa_ok;
uint64_t zplver = ZPL_VERSION;
objset_t *os = NULL;
char parentname[ZFS_MAX_DATASET_NAME_LEN];
char *cp;
spa_t *spa;
uint64_t spa_vers;
int error;
(void) strlcpy(parentname, dataset, sizeof (parentname));
cp = strrchr(parentname, '/');
ASSERT(cp != NULL);
cp[0] = '\0';
if ((error = spa_open(dataset, &spa, FTAG)) != 0)
return (error);
spa_vers = spa_version(spa);
spa_close(spa, FTAG);
zplver = zfs_zpl_version_map(spa_vers);
fuids_ok = (zplver >= ZPL_VERSION_FUID);
sa_ok = (zplver >= ZPL_VERSION_SA);
/*
* Open parent object set so we can inherit zplprop values.
*/
if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
return (error);
error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
zplprops, is_ci);
dmu_objset_rele(os, FTAG);
return (error);
}
static int
zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
nvlist_t *zplprops, boolean_t *is_ci)
{
boolean_t fuids_ok;
boolean_t sa_ok;
uint64_t zplver = ZPL_VERSION;
int error;
zplver = zfs_zpl_version_map(spa_vers);
fuids_ok = (zplver >= ZPL_VERSION_FUID);
sa_ok = (zplver >= ZPL_VERSION_SA);
error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
createprops, zplprops, is_ci);
return (error);
}
/*
* innvl: {
* "type" -> dmu_objset_type_t (int32)
* (optional) "props" -> { prop -> value }
* }
*
* outnvl: propname -> error code (int32)
*/
static int
zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
int error = 0;
zfs_creat_t zct = { 0 };
nvlist_t *nvprops = NULL;
void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
int32_t type32;
dmu_objset_type_t type;
boolean_t is_insensitive = B_FALSE;
if (nvlist_lookup_int32(innvl, "type", &type32) != 0)
return (SET_ERROR(EINVAL));
type = type32;
(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
switch (type) {
case DMU_OST_ZFS:
cbfunc = zfs_create_cb;
break;
case DMU_OST_ZVOL:
cbfunc = zvol_create_cb;
break;
default:
cbfunc = NULL;
break;
}
if (strchr(fsname, '@') ||
strchr(fsname, '%'))
return (SET_ERROR(EINVAL));
zct.zct_props = nvprops;
if (cbfunc == NULL)
return (SET_ERROR(EINVAL));
if (type == DMU_OST_ZVOL) {
uint64_t volsize, volblocksize;
if (nvprops == NULL)
return (SET_ERROR(EINVAL));
if (nvlist_lookup_uint64(nvprops,
zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
return (SET_ERROR(EINVAL));
if ((error = nvlist_lookup_uint64(nvprops,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
&volblocksize)) != 0 && error != ENOENT)
return (SET_ERROR(EINVAL));
if (error != 0)
volblocksize = zfs_prop_default_numeric(
ZFS_PROP_VOLBLOCKSIZE);
if ((error = zvol_check_volblocksize(
volblocksize)) != 0 ||
(error = zvol_check_volsize(volsize,
volblocksize)) != 0)
return (error);
} else if (type == DMU_OST_ZFS) {
int error;
/*
* We have to have normalization and
* case-folding flags correct when we do the
* file system creation, so go figure them out
* now.
*/
VERIFY(nvlist_alloc(&zct.zct_zplprops,
NV_UNIQUE_NAME, KM_SLEEP) == 0);
error = zfs_fill_zplprops(fsname, nvprops,
zct.zct_zplprops, &is_insensitive);
if (error != 0) {
nvlist_free(zct.zct_zplprops);
return (error);
}
}
error = dmu_objset_create(fsname, type,
is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
nvlist_free(zct.zct_zplprops);
/*
* It would be nice to do this atomically.
*/
if (error == 0) {
error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
nvprops, outnvl);
if (error != 0)
(void) dsl_destroy_head(fsname);
}
#ifdef __FreeBSD__
if (error == 0 && type == DMU_OST_ZVOL)
zvol_create_minors(fsname);
#endif
return (error);
}
/*
* innvl: {
* "origin" -> name of origin snapshot
* (optional) "props" -> { prop -> value }
* }
*
* outnvl: propname -> error code (int32)
*/
static int
zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
int error = 0;
nvlist_t *nvprops = NULL;
char *origin_name;
if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0)
return (SET_ERROR(EINVAL));
(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
if (strchr(fsname, '@') ||
strchr(fsname, '%'))
return (SET_ERROR(EINVAL));
if (dataset_namecheck(origin_name, NULL, NULL) != 0)
return (SET_ERROR(EINVAL));
error = dmu_objset_clone(fsname, origin_name);
if (error != 0)
return (error);
/*
* It would be nice to do this atomically.
*/
if (error == 0) {
error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
nvprops, outnvl);
if (error != 0)
(void) dsl_destroy_head(fsname);
}
#ifdef __FreeBSD__
if (error == 0)
zvol_create_minors(fsname);
#endif
return (error);
}
+/* ARGSUSED */
+static int
+zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ if (strchr(fsname, '@') ||
+ strchr(fsname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ return (dmu_objset_remap_indirects(fsname));
+}
+
/*
* innvl: {
* "snaps" -> { snapshot1, snapshot2 }
* (optional) "props" -> { prop -> value (string) }
* }
*
* outnvl: snapshot -> error code (int32)
*/
static int
zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
nvlist_t *snaps;
nvlist_t *props = NULL;
int error, poollen;
nvpair_t *pair;
(void) nvlist_lookup_nvlist(innvl, "props", &props);
if ((error = zfs_check_userprops(poolname, props)) != 0)
return (error);
if (!nvlist_empty(props) &&
zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
return (SET_ERROR(ENOTSUP));
if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
return (SET_ERROR(EINVAL));
poollen = strlen(poolname);
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
const char *name = nvpair_name(pair);
const char *cp = strchr(name, '@');
/*
* The snap name must contain an @, and the part after it must
* contain only valid characters.
*/
if (cp == NULL ||
zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
return (SET_ERROR(EINVAL));
/*
* The snap must be in the specified pool.
*/
if (strncmp(name, poolname, poollen) != 0 ||
(name[poollen] != '/' && name[poollen] != '@'))
return (SET_ERROR(EXDEV));
/* This must be the only snap of this fs. */
for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair);
pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) {
if (strncmp(name, nvpair_name(pair2), cp - name + 1)
== 0) {
return (SET_ERROR(EXDEV));
}
}
}
error = dsl_dataset_snapshot(snaps, props, outnvl);
return (error);
}
/*
* innvl: "message" -> string
*/
/* ARGSUSED */
static int
zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
{
char *message;
spa_t *spa;
int error;
char *poolname;
/*
* The poolname in the ioctl is not set, we get it from the TSD,
* which was set at the end of the last successful ioctl that allows
* logging. The secpolicy func already checked that it is set.
* Only one log ioctl is allowed after each successful ioctl, so
* we clear the TSD here.
*/
poolname = tsd_get(zfs_allow_log_key);
(void) tsd_set(zfs_allow_log_key, NULL);
error = spa_open(poolname, &spa, FTAG);
strfree(poolname);
if (error != 0)
return (error);
if (nvlist_lookup_string(innvl, "message", &message) != 0) {
spa_close(spa, FTAG);
return (SET_ERROR(EINVAL));
}
if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
spa_close(spa, FTAG);
return (SET_ERROR(ENOTSUP));
}
error = spa_history_log(spa, message);
spa_close(spa, FTAG);
return (error);
}
#ifdef __FreeBSD__
static int
zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
{
char name[MAXNAMELEN];
spa_t *spa;
vdev_t *vd;
char *command;
uint64_t pool_guid;
uint64_t vdev_guid;
int error;
if (nvlist_lookup_uint64(innvl,
ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
return (EINVAL);
if (nvlist_lookup_uint64(innvl,
ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
return (EINVAL);
if (nvlist_lookup_string(innvl,
"command", &command) != 0)
return (EINVAL);
mutex_enter(&spa_namespace_lock);
spa = spa_by_guid(pool_guid, vdev_guid);
if (spa != NULL)
strcpy(name, spa_name(spa));
mutex_exit(&spa_namespace_lock);
if (spa == NULL)
return (ENOENT);
if ((error = spa_open(name, &spa, FTAG)) != 0)
return (error);
spa_vdev_state_enter(spa, SCL_ALL);
vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
if (vd == NULL) {
(void) spa_vdev_state_exit(spa, NULL, ENXIO);
spa_close(spa, FTAG);
return (ENODEV);
}
error = vdev_label_write_pad2(vd, command, strlen(command));
(void) spa_vdev_state_exit(spa, NULL, 0);
txg_wait_synced(spa->spa_dsl_pool, 0);
spa_close(spa, FTAG);
return (error);
}
#endif
/*
* The dp_config_rwlock must not be held when calling this, because the
* unmount may need to write out data.
*
* This function is best-effort. Callers must deal gracefully if it
* remains mounted (or is remounted after this call).
*
* Returns 0 if the argument is not a snapshot, or it is not currently a
* filesystem, or we were able to unmount it. Returns error code otherwise.
*/
void
zfs_unmount_snap(const char *snapname)
{
vfs_t *vfsp = NULL;
zfsvfs_t *zfsvfs = NULL;
if (strchr(snapname, '@') == NULL)
return;
int err = getzfsvfs(snapname, &zfsvfs);
if (err != 0) {
ASSERT3P(zfsvfs, ==, NULL);
return;
}
vfsp = zfsvfs->z_vfs;
ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
#ifdef illumos
err = vn_vfswlock(vfsp->vfs_vnodecovered);
VFS_RELE(vfsp);
if (err != 0)
return;
#endif
/*
* Always force the unmount for snapshots.
*/
#ifdef illumos
(void) dounmount(vfsp, MS_FORCE, kcred);
#else
vfs_ref(vfsp);
vfs_unbusy(vfsp);
(void) dounmount(vfsp, MS_FORCE, curthread);
#endif
}
/* ARGSUSED */
static int
zfs_unmount_snap_cb(const char *snapname, void *arg)
{
zfs_unmount_snap(snapname);
return (0);
}
/*
* When a clone is destroyed, its origin may also need to be destroyed,
* in which case it must be unmounted. This routine will do that unmount
* if necessary.
*/
void
zfs_destroy_unmount_origin(const char *fsname)
{
int error;
objset_t *os;
dsl_dataset_t *ds;
error = dmu_objset_hold(fsname, FTAG, &os);
if (error != 0)
return;
ds = dmu_objset_ds(os);
if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
char originname[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(ds->ds_prev, originname);
dmu_objset_rele(os, FTAG);
zfs_unmount_snap(originname);
} else {
dmu_objset_rele(os, FTAG);
}
}
/*
* innvl: {
* "snaps" -> { snapshot1, snapshot2 }
* (optional boolean) "defer"
* }
*
* outnvl: snapshot -> error code (int32)
*
*/
/* ARGSUSED */
static int
zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
int error, poollen;
nvlist_t *snaps;
nvpair_t *pair;
boolean_t defer;
if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
return (SET_ERROR(EINVAL));
defer = nvlist_exists(innvl, "defer");
poollen = strlen(poolname);
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
const char *name = nvpair_name(pair);
/*
* The snap must be in the specified pool to prevent the
* invalid removal of zvol minors below.
*/
if (strncmp(name, poolname, poollen) != 0 ||
(name[poollen] != '/' && name[poollen] != '@'))
return (SET_ERROR(EXDEV));
zfs_unmount_snap(nvpair_name(pair));
#if defined(__FreeBSD__)
zvol_remove_minors(name);
#endif
}
return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
}
/*
* Create bookmarks. Bookmark names are of the form <fs>#<bmark>.
* All bookmarks must be in the same pool.
*
* innvl: {
* bookmark1 -> snapshot1, bookmark2 -> snapshot2
* }
*
* outnvl: bookmark -> error code (int32)
*
*/
/* ARGSUSED */
static int
zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
char *snap_name;
/*
* Verify the snapshot argument.
*/
if (nvpair_value_string(pair, &snap_name) != 0)
return (SET_ERROR(EINVAL));
/* Verify that the keys (bookmarks) are unique */
for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair);
pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) {
if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
return (SET_ERROR(EINVAL));
}
}
return (dsl_bookmark_create(innvl, outnvl));
}
/*
* innvl: {
* property 1, property 2, ...
* }
*
* outnvl: {
* bookmark name 1 -> { property 1, property 2, ... },
* bookmark name 2 -> { property 1, property 2, ... }
* }
*
*/
static int
zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
return (dsl_get_bookmarks(fsname, innvl, outnvl));
}
/*
* innvl: {
* bookmark name 1, bookmark name 2
* }
*
* outnvl: bookmark -> error code (int32)
*
*/
static int
zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
nvlist_t *outnvl)
{
int error, poollen;
poollen = strlen(poolname);
for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
const char *name = nvpair_name(pair);
const char *cp = strchr(name, '#');
/*
* The bookmark name must contain an #, and the part after it
* must contain only valid characters.
*/
if (cp == NULL ||
zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
return (SET_ERROR(EINVAL));
/*
* The bookmark must be in the specified pool.
*/
if (strncmp(name, poolname, poollen) != 0 ||
(name[poollen] != '/' && name[poollen] != '#'))
return (SET_ERROR(EXDEV));
}
error = dsl_bookmark_destroy(innvl, outnvl);
return (error);
}
static int
zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
nvlist_t *outnvl)
{
char *program;
uint64_t instrlimit, memlimit;
boolean_t sync_flag;
nvpair_t *nvarg = NULL;
if (0 != nvlist_lookup_string(innvl, ZCP_ARG_PROGRAM, &program)) {
return (EINVAL);
}
if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) {
sync_flag = B_TRUE;
}
if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) {
instrlimit = ZCP_DEFAULT_INSTRLIMIT;
}
if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) {
memlimit = ZCP_DEFAULT_MEMLIMIT;
}
if (0 != nvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST, &nvarg)) {
return (EINVAL);
}
if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit)
return (EINVAL);
if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
return (EINVAL);
return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit,
nvarg, outnvl));
}
/*
* inputs:
* zc_name name of dataset to destroy
* zc_objset_type type of objset
* zc_defer_destroy mark for deferred destroy
*
* outputs: none
*/
static int
zfs_ioc_destroy(zfs_cmd_t *zc)
{
int err;
if (zc->zc_objset_type == DMU_OST_ZFS)
zfs_unmount_snap(zc->zc_name);
if (strchr(zc->zc_name, '@'))
err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
else
err = dsl_destroy_head(zc->zc_name);
if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
#ifdef __FreeBSD__
zvol_remove_minors(zc->zc_name);
#else
(void) zvol_remove_minor(zc->zc_name);
#endif
return (err);
}
/*
* fsname is name of dataset to rollback (to most recent snapshot)
*
* innvl may contain name of expected target snapshot
*
* outnvl: "target" -> name of most recent snapshot
* }
*/
/* ARGSUSED */
static int
zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
zfsvfs_t *zfsvfs;
char *target = NULL;
int error;
(void) nvlist_lookup_string(innvl, "target", &target);
if (target != NULL) {
const char *cp = strchr(target, '@');
/*
* The snap name must contain an @, and the part after it must
* contain only valid characters.
*/
if (cp == NULL ||
zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
return (SET_ERROR(EINVAL));
}
if (getzfsvfs(fsname, &zfsvfs) == 0) {
dsl_dataset_t *ds;
ds = dmu_objset_ds(zfsvfs->z_os);
error = zfs_suspend_fs(zfsvfs);
if (error == 0) {
int resume_err;
error = dsl_dataset_rollback(fsname, target, zfsvfs,
outnvl);
resume_err = zfs_resume_fs(zfsvfs, ds);
error = error ? error : resume_err;
}
#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
#else
vfs_unbusy(zfsvfs->z_vfs);
#endif
} else {
error = dsl_dataset_rollback(fsname, target, NULL, outnvl);
}
return (error);
}
static int
recursive_unmount(const char *fsname, void *arg)
{
const char *snapname = arg;
char fullname[ZFS_MAX_DATASET_NAME_LEN];
(void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
zfs_unmount_snap(fullname);
return (0);
}
/*
* inputs:
* zc_name old name of dataset
* zc_value new name of dataset
* zc_cookie recursive flag (only valid for snapshots)
*
* outputs: none
*/
static int
zfs_ioc_rename(zfs_cmd_t *zc)
{
boolean_t recursive = zc->zc_cookie & 1;
char *at;
boolean_t allow_mounted = B_TRUE;
#ifdef __FreeBSD__
allow_mounted = (zc->zc_cookie & 2) != 0;
#endif
zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
strchr(zc->zc_value, '%'))
return (SET_ERROR(EINVAL));
at = strchr(zc->zc_name, '@');
if (at != NULL) {
/* snaps must be in same fs */
int error;
if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1))
return (SET_ERROR(EXDEV));
*at = '\0';
if (zc->zc_objset_type == DMU_OST_ZFS && !allow_mounted) {
error = dmu_objset_find(zc->zc_name,
recursive_unmount, at + 1,
recursive ? DS_FIND_CHILDREN : 0);
if (error != 0) {
*at = '@';
return (error);
}
}
error = dsl_dataset_rename_snapshot(zc->zc_name,
at + 1, strchr(zc->zc_value, '@') + 1, recursive);
*at = '@';
return (error);
} else {
#ifdef illumos
if (zc->zc_objset_type == DMU_OST_ZVOL)
(void) zvol_remove_minor(zc->zc_name);
#endif
return (dsl_dir_rename(zc->zc_name, zc->zc_value));
}
}
static int
zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
{
const char *propname = nvpair_name(pair);
boolean_t issnap = (strchr(dsname, '@') != NULL);
zfs_prop_t prop = zfs_name_to_prop(propname);
uint64_t intval;
int err;
if (prop == ZPROP_INVAL) {
if (zfs_prop_user(propname)) {
if (err = zfs_secpolicy_write_perms(dsname,
ZFS_DELEG_PERM_USERPROP, cr))
return (err);
return (0);
}
if (!issnap && zfs_prop_userquota(propname)) {
const char *perm = NULL;
const char *uq_prefix =
zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
const char *gq_prefix =
zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
if (strncmp(propname, uq_prefix,
strlen(uq_prefix)) == 0) {
perm = ZFS_DELEG_PERM_USERQUOTA;
} else if (strncmp(propname, gq_prefix,
strlen(gq_prefix)) == 0) {
perm = ZFS_DELEG_PERM_GROUPQUOTA;
} else {
/* USERUSED and GROUPUSED are read-only */
return (SET_ERROR(EINVAL));
}
if (err = zfs_secpolicy_write_perms(dsname, perm, cr))
return (err);
return (0);
}
return (SET_ERROR(EINVAL));
}
if (issnap)
return (SET_ERROR(EINVAL));
if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
/*
* dsl_prop_get_all_impl() returns properties in this
* format.
*/
nvlist_t *attrs;
VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
&pair) == 0);
}
/*
* Check that this value is valid for this pool version
*/
switch (prop) {
case ZFS_PROP_COMPRESSION:
/*
* If the user specified gzip compression, make sure
* the SPA supports it. We ignore any errors here since
* we'll catch them later.
*/
if (nvpair_value_uint64(pair, &intval) == 0) {
if (intval >= ZIO_COMPRESS_GZIP_1 &&
intval <= ZIO_COMPRESS_GZIP_9 &&
zfs_earlier_version(dsname,
SPA_VERSION_GZIP_COMPRESSION)) {
return (SET_ERROR(ENOTSUP));
}
if (intval == ZIO_COMPRESS_ZLE &&
zfs_earlier_version(dsname,
SPA_VERSION_ZLE_COMPRESSION))
return (SET_ERROR(ENOTSUP));
if (intval == ZIO_COMPRESS_LZ4) {
spa_t *spa;
if ((err = spa_open(dsname, &spa, FTAG)) != 0)
return (err);
if (!spa_feature_is_enabled(spa,
SPA_FEATURE_LZ4_COMPRESS)) {
spa_close(spa, FTAG);
return (SET_ERROR(ENOTSUP));
}
spa_close(spa, FTAG);
}
/*
* If this is a bootable dataset then
* verify that the compression algorithm
* is supported for booting. We must return
* something other than ENOTSUP since it
* implies a downrev pool version.
*/
if (zfs_is_bootfs(dsname) &&
!BOOTFS_COMPRESS_VALID(intval)) {
return (SET_ERROR(ERANGE));
}
}
break;
case ZFS_PROP_COPIES:
if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
return (SET_ERROR(ENOTSUP));
break;
case ZFS_PROP_RECORDSIZE:
/* Record sizes above 128k need the feature to be enabled */
if (nvpair_value_uint64(pair, &intval) == 0 &&
intval > SPA_OLD_MAXBLOCKSIZE) {
spa_t *spa;
/*
* We don't allow setting the property above 1MB,
* unless the tunable has been changed.
*/
if (intval > zfs_max_recordsize ||
intval > SPA_MAXBLOCKSIZE)
return (SET_ERROR(ERANGE));
if ((err = spa_open(dsname, &spa, FTAG)) != 0)
return (err);
if (!spa_feature_is_enabled(spa,
SPA_FEATURE_LARGE_BLOCKS)) {
spa_close(spa, FTAG);
return (SET_ERROR(ENOTSUP));
}
spa_close(spa, FTAG);
}
break;
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
break;
case ZFS_PROP_ACLINHERIT:
if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
nvpair_value_uint64(pair, &intval) == 0) {
if (intval == ZFS_ACL_PASSTHROUGH_X &&
zfs_earlier_version(dsname,
SPA_VERSION_PASSTHROUGH_X))
return (SET_ERROR(ENOTSUP));
}
break;
case ZFS_PROP_CHECKSUM:
case ZFS_PROP_DEDUP:
{
spa_feature_t feature;
spa_t *spa;
/* dedup feature version checks */
if (prop == ZFS_PROP_DEDUP &&
zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
return (SET_ERROR(ENOTSUP));
if (nvpair_value_uint64(pair, &intval) != 0)
return (SET_ERROR(EINVAL));
/* check prop value is enabled in features */
feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
if (feature == SPA_FEATURE_NONE)
break;
if ((err = spa_open(dsname, &spa, FTAG)) != 0)
return (err);
/*
* Salted checksums are not supported on root pools.
*/
if (spa_bootfs(spa) != 0 &&
intval < ZIO_CHECKSUM_FUNCTIONS &&
(zio_checksum_table[intval].ci_flags &
ZCHECKSUM_FLAG_SALTED)) {
spa_close(spa, FTAG);
return (SET_ERROR(ERANGE));
}
if (!spa_feature_is_enabled(spa, feature)) {
spa_close(spa, FTAG);
return (SET_ERROR(ENOTSUP));
}
spa_close(spa, FTAG);
break;
}
}
return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
}
/*
* Checks for a race condition to make sure we don't increment a feature flag
* multiple times.
*/
static int
zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
spa_feature_t *featurep = arg;
if (!spa_feature_is_active(spa, *featurep))
return (0);
else
return (SET_ERROR(EBUSY));
}
/*
* The callback invoked on feature activation in the sync task caused by
* zfs_prop_activate_feature.
*/
static void
zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
spa_feature_t *featurep = arg;
spa_feature_incr(spa, *featurep, tx);
}
/*
* Activates a feature on a pool in response to a property setting. This
* creates a new sync task which modifies the pool to reflect the feature
* as being active.
*/
static int
zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
{
int err;
/* EBUSY here indicates that the feature is already active */
err = dsl_sync_task(spa_name(spa),
zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
&feature, 2, ZFS_SPACE_CHECK_RESERVED);
if (err != 0 && err != EBUSY)
return (err);
else
return (0);
}
/*
* Removes properties from the given props list that fail permission checks
* needed to clear them and to restore them in case of a receive error. For each
* property, make sure we have both set and inherit permissions.
*
* Returns the first error encountered if any permission checks fail. If the
* caller provides a non-NULL errlist, it also gives the complete list of names
* of all the properties that failed a permission check along with the
* corresponding error numbers. The caller is responsible for freeing the
* returned errlist.
*
* If every property checks out successfully, zero is returned and the list
* pointed at by errlist is NULL.
*/
static int
zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
{
zfs_cmd_t *zc;
nvpair_t *pair, *next_pair;
nvlist_t *errors;
int err, rv = 0;
if (props == NULL)
return (0);
VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
(void) strcpy(zc->zc_name, dataset);
pair = nvlist_next_nvpair(props, NULL);
while (pair != NULL) {
next_pair = nvlist_next_nvpair(props, pair);
(void) strcpy(zc->zc_value, nvpair_name(pair));
if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
(err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) {
VERIFY(nvlist_remove_nvpair(props, pair) == 0);
VERIFY(nvlist_add_int32(errors,
zc->zc_value, err) == 0);
}
pair = next_pair;
}
kmem_free(zc, sizeof (zfs_cmd_t));
if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
nvlist_free(errors);
errors = NULL;
} else {
VERIFY(nvpair_value_int32(pair, &rv) == 0);
}
if (errlist == NULL)
nvlist_free(errors);
else
*errlist = errors;
return (rv);
}
static boolean_t
propval_equals(nvpair_t *p1, nvpair_t *p2)
{
if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
/* dsl_prop_get_all_impl() format */
nvlist_t *attrs;
VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
&p1) == 0);
}
if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
nvlist_t *attrs;
VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
&p2) == 0);
}
if (nvpair_type(p1) != nvpair_type(p2))
return (B_FALSE);
if (nvpair_type(p1) == DATA_TYPE_STRING) {
char *valstr1, *valstr2;
VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0);
VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0);
return (strcmp(valstr1, valstr2) == 0);
} else {
uint64_t intval1, intval2;
VERIFY(nvpair_value_uint64(p1, &intval1) == 0);
VERIFY(nvpair_value_uint64(p2, &intval2) == 0);
return (intval1 == intval2);
}
}
/*
* Remove properties from props if they are not going to change (as determined
* by comparison with origprops). Remove them from origprops as well, since we
* do not need to clear or restore properties that won't change.
*/
static void
props_reduce(nvlist_t *props, nvlist_t *origprops)
{
nvpair_t *pair, *next_pair;
if (origprops == NULL)
return; /* all props need to be received */
pair = nvlist_next_nvpair(props, NULL);
while (pair != NULL) {
const char *propname = nvpair_name(pair);
nvpair_t *match;
next_pair = nvlist_next_nvpair(props, pair);
if ((nvlist_lookup_nvpair(origprops, propname,
&match) != 0) || !propval_equals(pair, match))
goto next; /* need to set received value */
/* don't clear the existing received value */
(void) nvlist_remove_nvpair(origprops, match);
/* don't bother receiving the property */
(void) nvlist_remove_nvpair(props, pair);
next:
pair = next_pair;
}
}
/*
* Extract properties that cannot be set PRIOR to the receipt of a dataset.
* For example, refquota cannot be set until after the receipt of a dataset,
* because in replication streams, an older/earlier snapshot may exceed the
* refquota. We want to receive the older/earlier snapshot, but setting
* refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent
* the older/earlier snapshot from being received (with EDQUOT).
*
* The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
*
* libzfs will need to be judicious handling errors encountered by props
* extracted by this function.
*/
static nvlist_t *
extract_delay_props(nvlist_t *props)
{
nvlist_t *delayprops;
nvpair_t *nvp, *tmp;
static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
int i;
VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
nvp = nvlist_next_nvpair(props, nvp)) {
/*
* strcmp() is safe because zfs_prop_to_name() always returns
* a bounded string.
*/
for (i = 0; delayable[i] != 0; i++) {
if (strcmp(zfs_prop_to_name(delayable[i]),
nvpair_name(nvp)) == 0) {
break;
}
}
if (delayable[i] != 0) {
tmp = nvlist_prev_nvpair(props, nvp);
VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
nvp = tmp;
}
}
if (nvlist_empty(delayprops)) {
nvlist_free(delayprops);
delayprops = NULL;
}
return (delayprops);
}
#ifdef DEBUG
static boolean_t zfs_ioc_recv_inject_err;
#endif
/*
* inputs:
* zc_name name of containing filesystem
* zc_nvlist_src{_size} nvlist of properties to apply
* zc_value name of snapshot to create
* zc_string name of clone origin (if DRR_FLAG_CLONE)
* zc_cookie file descriptor to recv from
* zc_begin_record the BEGIN record of the stream (not byteswapped)
* zc_guid force flag
* zc_cleanup_fd cleanup-on-exit file descriptor
* zc_action_handle handle for this guid/ds mapping (or zero on first call)
* zc_resumable if data is incomplete assume sender will resume
*
* outputs:
* zc_cookie number of bytes read
* zc_nvlist_dst{_size} error for each unapplied received property
* zc_obj zprop_errflags_t
* zc_action_handle handle for this guid/ds mapping
*/
static int
zfs_ioc_recv(zfs_cmd_t *zc)
{
file_t *fp;
dmu_recv_cookie_t drc;
boolean_t force = (boolean_t)zc->zc_guid;
int fd;
int error = 0;
int props_error = 0;
nvlist_t *errors;
offset_t off;
nvlist_t *props = NULL; /* sent properties */
nvlist_t *origprops = NULL; /* existing properties */
nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
char *origin = NULL;
char *tosnap;
char tofs[ZFS_MAX_DATASET_NAME_LEN];
cap_rights_t rights;
boolean_t first_recvd_props = B_FALSE;
if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
strchr(zc->zc_value, '@') == NULL ||
strchr(zc->zc_value, '%'))
return (SET_ERROR(EINVAL));
(void) strcpy(tofs, zc->zc_value);
tosnap = strchr(tofs, '@');
*tosnap++ = '\0';
if (zc->zc_nvlist_src != 0 &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &props)) != 0)
return (error);
fd = zc->zc_cookie;
#ifdef illumos
fp = getf(fd);
#else
fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
#endif
if (fp == NULL) {
nvlist_free(props);
return (SET_ERROR(EBADF));
}
errors = fnvlist_alloc();
if (zc->zc_string[0])
origin = zc->zc_string;
error = dmu_recv_begin(tofs, tosnap,
&zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
if (error != 0)
goto out;
/*
* Set properties before we receive the stream so that they are applied
* to the new data. Note that we must call dmu_recv_stream() if
* dmu_recv_begin() succeeds.
*/
if (props != NULL && !drc.drc_newfs) {
if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
SPA_VERSION_RECVD_PROPS &&
!dsl_prop_get_hasrecvd(tofs))
first_recvd_props = B_TRUE;
/*
* If new received properties are supplied, they are to
* completely replace the existing received properties, so stash
* away the existing ones.
*/
if (dsl_prop_get_received(tofs, &origprops) == 0) {
nvlist_t *errlist = NULL;
/*
* Don't bother writing a property if its value won't
* change (and avoid the unnecessary security checks).
*
* The first receive after SPA_VERSION_RECVD_PROPS is a
* special case where we blow away all local properties
* regardless.
*/
if (!first_recvd_props)
props_reduce(props, origprops);
if (zfs_check_clearable(tofs, origprops, &errlist) != 0)
(void) nvlist_merge(errors, errlist, 0);
nvlist_free(errlist);
if (clear_received_props(tofs, origprops,
first_recvd_props ? NULL : props) != 0)
zc->zc_obj |= ZPROP_ERR_NOCLEAR;
} else {
zc->zc_obj |= ZPROP_ERR_NOCLEAR;
}
}
if (props != NULL) {
props_error = dsl_prop_set_hasrecvd(tofs);
if (props_error == 0) {
delayprops = extract_delay_props(props);
(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
props, errors);
}
}
off = fp->f_offset;
error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
&zc->zc_action_handle);
if (error == 0) {
zfsvfs_t *zfsvfs = NULL;
if (getzfsvfs(tofs, &zfsvfs) == 0) {
/* online recv */
dsl_dataset_t *ds;
int end_err;
ds = dmu_objset_ds(zfsvfs->z_os);
error = zfs_suspend_fs(zfsvfs);
/*
* If the suspend fails, then the recv_end will
* likely also fail, and clean up after itself.
*/
end_err = dmu_recv_end(&drc, zfsvfs);
if (error == 0)
error = zfs_resume_fs(zfsvfs, ds);
error = error ? error : end_err;
#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
#else
vfs_unbusy(zfsvfs->z_vfs);
#endif
} else {
error = dmu_recv_end(&drc, NULL);
}
/* Set delayed properties now, after we're done receiving. */
if (delayprops != NULL && error == 0) {
(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
delayprops, errors);
}
}
if (delayprops != NULL) {
/*
* Merge delayed props back in with initial props, in case
* we're DEBUG and zfs_ioc_recv_inject_err is set (which means
* we have to make sure clear_received_props() includes
* the delayed properties).
*
* Since zfs_ioc_recv_inject_err is only in DEBUG kernels,
* using ASSERT() will be just like a VERIFY.
*/
ASSERT(nvlist_merge(props, delayprops, 0) == 0);
nvlist_free(delayprops);
}
/*
* Now that all props, initial and delayed, are set, report the prop
* errors to the caller.
*/
if (zc->zc_nvlist_dst_size != 0 &&
(nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
put_nvlist(zc, errors) != 0)) {
/*
* Caller made zc->zc_nvlist_dst less than the minimum expected
* size or supplied an invalid address.
*/
props_error = SET_ERROR(EINVAL);
}
zc->zc_cookie = off - fp->f_offset;
if (off >= 0 && off <= MAXOFFSET_T)
fp->f_offset = off;
#ifdef DEBUG
if (zfs_ioc_recv_inject_err) {
zfs_ioc_recv_inject_err = B_FALSE;
error = 1;
}
#endif
#ifdef __FreeBSD__
if (error == 0)
zvol_create_minors(tofs);
#endif
/*
* On error, restore the original props.
*/
if (error != 0 && props != NULL && !drc.drc_newfs) {
if (clear_received_props(tofs, props, NULL) != 0) {
/*
* We failed to clear the received properties.
* Since we may have left a $recvd value on the
* system, we can't clear the $hasrecvd flag.
*/
zc->zc_obj |= ZPROP_ERR_NORESTORE;
} else if (first_recvd_props) {
dsl_prop_unset_hasrecvd(tofs);
}
if (origprops == NULL && !drc.drc_newfs) {
/* We failed to stash the original properties. */
zc->zc_obj |= ZPROP_ERR_NORESTORE;
}
/*
* dsl_props_set() will not convert RECEIVED to LOCAL on or
* after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
* explictly if we're restoring local properties cleared in the
* first new-style receive.
*/
if (origprops != NULL &&
zfs_set_prop_nvlist(tofs, (first_recvd_props ?
ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
origprops, NULL) != 0) {
/*
* We stashed the original properties but failed to
* restore them.
*/
zc->zc_obj |= ZPROP_ERR_NORESTORE;
}
}
out:
nvlist_free(props);
nvlist_free(origprops);
nvlist_free(errors);
releasef(fd);
if (error == 0)
error = props_error;
return (error);
}
/*
* inputs:
* zc_name name of snapshot to send
* zc_cookie file descriptor to send stream to
* zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
* zc_sendobj objsetid of snapshot to send
* zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type.
* zc_flags lzc_send_flags
*
* outputs:
* zc_objset_type estimated size, if zc_guid is set
*/
static int
zfs_ioc_send(zfs_cmd_t *zc)
{
int error;
offset_t off;
boolean_t estimate = (zc->zc_guid != 0);
boolean_t embedok = (zc->zc_flags & 0x1);
boolean_t large_block_ok = (zc->zc_flags & 0x2);
boolean_t compressok = (zc->zc_flags & 0x4);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
dsl_dataset_t *tosnap;
error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
if (dsl_dir_is_clone(tosnap->ds_dir))
zc->zc_fromobj =
dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
dsl_dataset_rele(tosnap, FTAG);
dsl_pool_rele(dp, FTAG);
}
if (estimate) {
dsl_pool_t *dp;
dsl_dataset_t *tosnap;
dsl_dataset_t *fromsnap = NULL;
error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
if (zc->zc_fromobj != 0) {
error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
FTAG, &fromsnap);
if (error != 0) {
dsl_dataset_rele(tosnap, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
}
error = dmu_send_estimate(tosnap, fromsnap, compressok,
&zc->zc_objset_type);
if (fromsnap != NULL)
dsl_dataset_rele(fromsnap, FTAG);
dsl_dataset_rele(tosnap, FTAG);
dsl_pool_rele(dp, FTAG);
} else {
file_t *fp;
cap_rights_t rights;
#ifdef illumos
fp = getf(zc->zc_cookie);
#else
fget_write(curthread, zc->zc_cookie,
cap_rights_init(&rights, CAP_WRITE), &fp);
#endif
if (fp == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
zc->zc_fromobj, embedok, large_block_ok, compressok,
#ifdef illumos
zc->zc_cookie, fp->f_vnode, &off);
#else
zc->zc_cookie, fp, &off);
#endif
if (off >= 0 && off <= MAXOFFSET_T)
fp->f_offset = off;
releasef(zc->zc_cookie);
}
return (error);
}
/*
* inputs:
* zc_name name of snapshot on which to report progress
* zc_cookie file descriptor of send stream
*
* outputs:
* zc_cookie number of bytes written in send stream thus far
*/
static int
zfs_ioc_send_progress(zfs_cmd_t *zc)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
dmu_sendarg_t *dsp = NULL;
int error;
error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
mutex_enter(&ds->ds_sendstream_lock);
/*
* Iterate over all the send streams currently active on this dataset.
* If there's one which matches the specified file descriptor _and_ the
* stream was started by the current process, return the progress of
* that stream.
*/
for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL;
dsp = list_next(&ds->ds_sendstreams, dsp)) {
if (dsp->dsa_outfd == zc->zc_cookie &&
dsp->dsa_proc == curproc)
break;
}
if (dsp != NULL)
zc->zc_cookie = *(dsp->dsa_off);
else
error = SET_ERROR(ENOENT);
mutex_exit(&ds->ds_sendstream_lock);
dsl_dataset_rele(ds, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
static int
zfs_ioc_inject_fault(zfs_cmd_t *zc)
{
int id, error;
error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
&zc->zc_inject_record);
if (error == 0)
zc->zc_guid = (uint64_t)id;
return (error);
}
static int
zfs_ioc_clear_fault(zfs_cmd_t *zc)
{
return (zio_clear_fault((int)zc->zc_guid));
}
static int
zfs_ioc_inject_list_next(zfs_cmd_t *zc)
{
int id = (int)zc->zc_guid;
int error;
error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
&zc->zc_inject_record);
zc->zc_guid = id;
return (error);
}
static int
zfs_ioc_error_log(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
size_t count = (size_t)zc->zc_nvlist_dst_size;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
&count);
if (error == 0)
zc->zc_nvlist_dst_size = count;
else
zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_clear(zfs_cmd_t *zc)
{
spa_t *spa;
vdev_t *vd;
int error;
/*
* On zpool clear we also fix up missing slogs
*/
mutex_enter(&spa_namespace_lock);
spa = spa_lookup(zc->zc_name);
if (spa == NULL) {
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EIO));
}
if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
/* we need to let spa_open/spa_load clear the chains */
spa_set_log_state(spa, SPA_LOG_CLEAR);
}
spa->spa_last_open_failed = 0;
mutex_exit(&spa_namespace_lock);
if (zc->zc_cookie & ZPOOL_NO_REWIND) {
error = spa_open(zc->zc_name, &spa, FTAG);
} else {
nvlist_t *policy;
nvlist_t *config = NULL;
if (zc->zc_nvlist_src == 0)
return (SET_ERROR(EINVAL));
if ((error = get_nvlist(zc->zc_nvlist_src,
zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
error = spa_open_rewind(zc->zc_name, &spa, FTAG,
policy, &config);
if (config != NULL) {
int err;
if ((err = put_nvlist(zc, config)) != 0)
error = err;
nvlist_free(config);
}
nvlist_free(policy);
}
}
if (error != 0)
return (error);
spa_vdev_state_enter(spa, SCL_NONE);
if (zc->zc_guid == 0) {
vd = NULL;
} else {
vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
if (vd == NULL) {
(void) spa_vdev_state_exit(spa, NULL, ENODEV);
spa_close(spa, FTAG);
return (SET_ERROR(ENODEV));
}
}
vdev_clear(spa, vd);
(void) spa_vdev_state_exit(spa, NULL, 0);
/*
* Resume any suspended I/Os.
*/
if (zio_resume(spa) != 0)
error = SET_ERROR(EIO);
spa_close(spa, FTAG);
return (error);
}
static int
zfs_ioc_pool_reopen(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
spa_vdev_state_enter(spa, SCL_NONE);
/*
* If a resilver is already in progress then set the
* spa_scrub_reopen flag to B_TRUE so that we don't restart
* the scan as a side effect of the reopen. Otherwise, let
* vdev_open() decided if a resilver is required.
*/
spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool);
vdev_reopen(spa->spa_root_vdev);
spa->spa_scrub_reopen = B_FALSE;
(void) spa_vdev_state_exit(spa, NULL, 0);
spa_close(spa, FTAG);
return (0);
}
/*
* inputs:
* zc_name name of filesystem
*
* outputs:
* zc_string name of conflicting snapshot, if there is one
*/
static int
zfs_ioc_promote(zfs_cmd_t *zc)
{
dsl_pool_t *dp;
dsl_dataset_t *ds, *ods;
char origin[ZFS_MAX_DATASET_NAME_LEN];
char *cp;
int error;
error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
if (!dsl_dir_is_clone(ds->ds_dir)) {
dsl_dataset_rele(ds, FTAG);
dsl_pool_rele(dp, FTAG);
return (SET_ERROR(EINVAL));
}
error = dsl_dataset_hold_obj(dp,
dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
dsl_dataset_name(ods, origin);
dsl_dataset_rele(ods, FTAG);
dsl_dataset_rele(ds, FTAG);
dsl_pool_rele(dp, FTAG);
/*
* We don't need to unmount *all* the origin fs's snapshots, but
* it's easier.
*/
cp = strchr(origin, '@');
if (cp)
*cp = '\0';
(void) dmu_objset_find(origin,
zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
}
/*
* Retrieve a single {user|group}{used|quota}@... property.
*
* inputs:
* zc_name name of filesystem
* zc_objset_type zfs_userquota_prop_t
* zc_value domain name (eg. "S-1-234-567-89")
* zc_guid RID/UID/GID
*
* outputs:
* zc_cookie property value
*/
static int
zfs_ioc_userspace_one(zfs_cmd_t *zc)
{
zfsvfs_t *zfsvfs;
int error;
if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
return (SET_ERROR(EINVAL));
error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
if (error != 0)
return (error);
error = zfs_userspace_one(zfsvfs,
zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie);
zfsvfs_rele(zfsvfs, FTAG);
return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_cookie zap cursor
* zc_objset_type zfs_userquota_prop_t
* zc_nvlist_dst[_size] buffer to fill (not really an nvlist)
*
* outputs:
* zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t)
* zc_cookie zap cursor
*/
static int
zfs_ioc_userspace_many(zfs_cmd_t *zc)
{
zfsvfs_t *zfsvfs;
int bufsize = zc->zc_nvlist_dst_size;
if (bufsize <= 0)
return (SET_ERROR(ENOMEM));
int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
if (error != 0)
return (error);
void *buf = kmem_alloc(bufsize, KM_SLEEP);
error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
buf, &zc->zc_nvlist_dst_size);
if (error == 0) {
error = ddi_copyout(buf,
(void *)(uintptr_t)zc->zc_nvlist_dst,
zc->zc_nvlist_dst_size, zc->zc_iflags);
}
kmem_free(buf, bufsize);
zfsvfs_rele(zfsvfs, FTAG);
return (error);
}
/*
* inputs:
* zc_name name of filesystem
*
* outputs:
* none
*/
static int
zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
{
objset_t *os;
int error = 0;
zfsvfs_t *zfsvfs;
if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
/*
* If userused is not enabled, it may be because the
* objset needs to be closed & reopened (to grow the
* objset_phys_t). Suspend/resume the fs will do that.
*/
dsl_dataset_t *ds, *newds;
ds = dmu_objset_ds(zfsvfs->z_os);
error = zfs_suspend_fs(zfsvfs);
if (error == 0) {
dmu_objset_refresh_ownership(ds, &newds,
zfsvfs);
error = zfs_resume_fs(zfsvfs, newds);
}
}
if (error == 0)
error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
#else
vfs_unbusy(zfsvfs->z_vfs);
#endif
} else {
/* XXX kind of reading contents without owning */
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error != 0)
return (error);
error = dmu_objset_userspace_upgrade(os);
dmu_objset_rele(os, FTAG);
}
return (error);
}
#ifdef illumos
/*
* We don't want to have a hard dependency
* against some special symbols in sharefs
* nfs, and smbsrv. Determine them if needed when
* the first file system is shared.
* Neither sharefs, nfs or smbsrv are unloadable modules.
*/
int (*znfsexport_fs)(void *arg);
int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t);
int (*zsmbexport_fs)(void *arg, boolean_t add_share);
int zfs_nfsshare_inited;
int zfs_smbshare_inited;
ddi_modhandle_t nfs_mod;
ddi_modhandle_t sharefs_mod;
ddi_modhandle_t smbsrv_mod;
#endif /* illumos */
kmutex_t zfs_share_lock;
#ifdef illumos
static int
zfs_init_sharefs()
{
int error;
ASSERT(MUTEX_HELD(&zfs_share_lock));
/* Both NFS and SMB shares also require sharetab support. */
if (sharefs_mod == NULL && ((sharefs_mod =
ddi_modopen("fs/sharefs",
KRTLD_MODE_FIRST, &error)) == NULL)) {
return (SET_ERROR(ENOSYS));
}
if (zshare_fs == NULL && ((zshare_fs =
(int (*)(enum sharefs_sys_op, share_t *, uint32_t))
ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) {
return (SET_ERROR(ENOSYS));
}
return (0);
}
#endif /* illumos */
static int
zfs_ioc_share(zfs_cmd_t *zc)
{
#ifdef illumos
int error;
int opcode;
switch (zc->zc_share.z_sharetype) {
case ZFS_SHARE_NFS:
case ZFS_UNSHARE_NFS:
if (zfs_nfsshare_inited == 0) {
mutex_enter(&zfs_share_lock);
if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs",
KRTLD_MODE_FIRST, &error)) == NULL)) {
mutex_exit(&zfs_share_lock);
return (SET_ERROR(ENOSYS));
}
if (znfsexport_fs == NULL &&
((znfsexport_fs = (int (*)(void *))
ddi_modsym(nfs_mod,
"nfs_export", &error)) == NULL)) {
mutex_exit(&zfs_share_lock);
return (SET_ERROR(ENOSYS));
}
error = zfs_init_sharefs();
if (error != 0) {
mutex_exit(&zfs_share_lock);
return (SET_ERROR(ENOSYS));
}
zfs_nfsshare_inited = 1;
mutex_exit(&zfs_share_lock);
}
break;
case ZFS_SHARE_SMB:
case ZFS_UNSHARE_SMB:
if (zfs_smbshare_inited == 0) {
mutex_enter(&zfs_share_lock);
if (smbsrv_mod == NULL && ((smbsrv_mod =
ddi_modopen("drv/smbsrv",
KRTLD_MODE_FIRST, &error)) == NULL)) {
mutex_exit(&zfs_share_lock);
return (SET_ERROR(ENOSYS));
}
if (zsmbexport_fs == NULL && ((zsmbexport_fs =
(int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod,
"smb_server_share", &error)) == NULL)) {
mutex_exit(&zfs_share_lock);
return (SET_ERROR(ENOSYS));
}
error = zfs_init_sharefs();
if (error != 0) {
mutex_exit(&zfs_share_lock);
return (SET_ERROR(ENOSYS));
}
zfs_smbshare_inited = 1;
mutex_exit(&zfs_share_lock);
}
break;
default:
return (SET_ERROR(EINVAL));
}
switch (zc->zc_share.z_sharetype) {
case ZFS_SHARE_NFS:
case ZFS_UNSHARE_NFS:
if (error =
znfsexport_fs((void *)
(uintptr_t)zc->zc_share.z_exportdata))
return (error);
break;
case ZFS_SHARE_SMB:
case ZFS_UNSHARE_SMB:
if (error = zsmbexport_fs((void *)
(uintptr_t)zc->zc_share.z_exportdata,
zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
B_TRUE: B_FALSE)) {
return (error);
}
break;
}
opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS ||
zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ?
SHAREFS_ADD : SHAREFS_REMOVE;
/*
* Add or remove share from sharetab
*/
error = zshare_fs(opcode,
(void *)(uintptr_t)zc->zc_share.z_sharedata,
zc->zc_share.z_sharemax);
return (error);
#else /* !illumos */
return (ENOSYS);
#endif /* illumos */
}
ace_t full_access[] = {
{(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
};
/*
* inputs:
* zc_name name of containing filesystem
* zc_obj object # beyond which we want next in-use object #
*
* outputs:
* zc_obj next in-use object #
*/
static int
zfs_ioc_next_obj(zfs_cmd_t *zc)
{
objset_t *os = NULL;
int error;
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error != 0)
return (error);
error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg);
dmu_objset_rele(os, FTAG);
return (error);
}
/*
* inputs:
* zc_name name of filesystem
* zc_value prefix name for snapshot
* zc_cleanup_fd cleanup-on-exit file descriptor for calling process
*
* outputs:
* zc_value short name of new snapshot
*/
static int
zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
{
char *snap_name;
char *hold_name;
int error;
minor_t minor;
error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
if (error != 0)
return (error);
snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
(u_longlong_t)ddi_get_lbolt64());
hold_name = kmem_asprintf("%%%s", zc->zc_value);
error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
hold_name);
if (error == 0)
(void) strcpy(zc->zc_value, snap_name);
strfree(snap_name);
strfree(hold_name);
zfs_onexit_fd_rele(zc->zc_cleanup_fd);
return (error);
}
/*
* inputs:
* zc_name name of "to" snapshot
* zc_value name of "from" snapshot
* zc_cookie file descriptor to write diff data on
*
* outputs:
* dmu_diff_record_t's to the file descriptor
*/
static int
zfs_ioc_diff(zfs_cmd_t *zc)
{
file_t *fp;
cap_rights_t rights;
offset_t off;
int error;
#ifdef illumos
fp = getf(zc->zc_cookie);
#else
fget_write(curthread, zc->zc_cookie,
cap_rights_init(&rights, CAP_WRITE), &fp);
#endif
if (fp == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
#ifdef illumos
error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off);
#else
error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off);
#endif
if (off >= 0 && off <= MAXOFFSET_T)
fp->f_offset = off;
releasef(zc->zc_cookie);
return (error);
}
#ifdef illumos
/*
* Remove all ACL files in shares dir
*/
static int
zfs_smb_acl_purge(znode_t *dzp)
{
zap_cursor_t zc;
zap_attribute_t zap;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
int error;
for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
(error = zap_cursor_retrieve(&zc, &zap)) == 0;
zap_cursor_advance(&zc)) {
if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred,
NULL, 0)) != 0)
break;
}
zap_cursor_fini(&zc);
return (error);
}
#endif /* illumos */
static int
zfs_ioc_smb_acl(zfs_cmd_t *zc)
{
#ifdef illumos
vnode_t *vp;
znode_t *dzp;
vnode_t *resourcevp = NULL;
znode_t *sharedir;
zfsvfs_t *zfsvfs;
nvlist_t *nvlist;
char *src, *target;
vattr_t vattr;
vsecattr_t vsec;
int error = 0;
if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
NO_FOLLOW, NULL, &vp)) != 0)
return (error);
/* Now make sure mntpnt and dataset are ZFS */
if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
(strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
zc->zc_name) != 0)) {
VN_RELE(vp);
return (SET_ERROR(EINVAL));
}
dzp = VTOZ(vp);
zfsvfs = dzp->z_zfsvfs;
ZFS_ENTER(zfsvfs);
/*
* Create share dir if its missing.
*/
mutex_enter(&zfsvfs->z_lock);
if (zfsvfs->z_shares_dir == 0) {
dmu_tx_t *tx;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
ZFS_SHARES_DIR);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error != 0) {
dmu_tx_abort(tx);
} else {
error = zfs_create_share_dir(zfsvfs, tx);
dmu_tx_commit(tx);
}
if (error != 0) {
mutex_exit(&zfsvfs->z_lock);
VN_RELE(vp);
ZFS_EXIT(zfsvfs);
return (error);
}
}
mutex_exit(&zfsvfs->z_lock);
ASSERT(zfsvfs->z_shares_dir);
if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
VN_RELE(vp);
ZFS_EXIT(zfsvfs);
return (error);
}
switch (zc->zc_cookie) {
case ZFS_SMB_ACL_ADD:
vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
vattr.va_type = VREG;
vattr.va_mode = S_IFREG|0777;
vattr.va_uid = 0;
vattr.va_gid = 0;
vsec.vsa_mask = VSA_ACE;
vsec.vsa_aclentp = &full_access;
vsec.vsa_aclentsz = sizeof (full_access);
vsec.vsa_aclcnt = 1;
error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
&vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
if (resourcevp)
VN_RELE(resourcevp);
break;
case ZFS_SMB_ACL_REMOVE:
error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
NULL, 0);
break;
case ZFS_SMB_ACL_RENAME:
if ((error = get_nvlist(zc->zc_nvlist_src,
zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
VN_RELE(vp);
VN_RELE(ZTOV(sharedir));
ZFS_EXIT(zfsvfs);
return (error);
}
if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) ||
nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
&target)) {
VN_RELE(vp);
VN_RELE(ZTOV(sharedir));
ZFS_EXIT(zfsvfs);
nvlist_free(nvlist);
return (error);
}
error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
kcred, NULL, 0);
nvlist_free(nvlist);
break;
case ZFS_SMB_ACL_PURGE:
error = zfs_smb_acl_purge(sharedir);
break;
default:
error = SET_ERROR(EINVAL);
break;
}
VN_RELE(vp);
VN_RELE(ZTOV(sharedir));
ZFS_EXIT(zfsvfs);
return (error);
#else /* !illumos */
return (EOPNOTSUPP);
#endif /* illumos */
}
/*
* innvl: {
* "holds" -> { snapname -> holdname (string), ... }
* (optional) "cleanup_fd" -> fd (int32)
* }
*
* outnvl: {
* snapname -> error value (int32)
* ...
* }
*/
/* ARGSUSED */
static int
zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
{
nvpair_t *pair;
nvlist_t *holds;
int cleanup_fd = -1;
int error;
minor_t minor = 0;
error = nvlist_lookup_nvlist(args, "holds", &holds);
if (error != 0)
return (SET_ERROR(EINVAL));
/* make sure the user didn't pass us any invalid (empty) tags */
for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
pair = nvlist_next_nvpair(holds, pair)) {
char *htag;
error = nvpair_value_string(pair, &htag);
if (error != 0)
return (SET_ERROR(error));
if (strlen(htag) == 0)
return (SET_ERROR(EINVAL));
}
if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
error = zfs_onexit_fd_hold(cleanup_fd, &minor);
if (error != 0)
return (error);
}
error = dsl_dataset_user_hold(holds, minor, errlist);
if (minor != 0)
zfs_onexit_fd_rele(cleanup_fd);
return (error);
}
/*
* innvl is not used.
*
* outnvl: {
* holdname -> time added (uint64 seconds since epoch)
* ...
* }
*/
/* ARGSUSED */
static int
zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
{
return (dsl_dataset_get_holds(snapname, outnvl));
}
/*
* innvl: {
* snapname -> { holdname, ... }
* ...
* }
*
* outnvl: {
* snapname -> error value (int32)
* ...
* }
*/
/* ARGSUSED */
static int
zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
{
return (dsl_dataset_user_release(holds, errlist));
}
/*
* inputs:
* zc_name name of new filesystem or snapshot
* zc_value full name of old snapshot
*
* outputs:
* zc_cookie space in bytes
* zc_objset_type compressed space in bytes
* zc_perm_action uncompressed space in bytes
*/
static int
zfs_ioc_space_written(zfs_cmd_t *zc)
{
int error;
dsl_pool_t *dp;
dsl_dataset_t *new, *old;
error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
if (error != 0) {
dsl_dataset_rele(new, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
error = dsl_dataset_space_written(old, new, &zc->zc_cookie,
&zc->zc_objset_type, &zc->zc_perm_action);
dsl_dataset_rele(old, FTAG);
dsl_dataset_rele(new, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
/*
* innvl: {
* "firstsnap" -> snapshot name
* }
*
* outnvl: {
* "used" -> space in bytes
* "compressed" -> compressed space in bytes
* "uncompressed" -> uncompressed space in bytes
* }
*/
static int
zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
{
int error;
dsl_pool_t *dp;
dsl_dataset_t *new, *old;
char *firstsnap;
uint64_t used, comp, uncomp;
if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0)
return (SET_ERROR(EINVAL));
error = dsl_pool_hold(lastsnap, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
if (error == 0 && !new->ds_is_snapshot) {
dsl_dataset_rele(new, FTAG);
error = SET_ERROR(EINVAL);
}
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
if (error == 0 && !old->ds_is_snapshot) {
dsl_dataset_rele(old, FTAG);
error = SET_ERROR(EINVAL);
}
if (error != 0) {
dsl_dataset_rele(new, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
dsl_dataset_rele(old, FTAG);
dsl_dataset_rele(new, FTAG);
dsl_pool_rele(dp, FTAG);
fnvlist_add_uint64(outnvl, "used", used);
fnvlist_add_uint64(outnvl, "compressed", comp);
fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
return (error);
}
static int
zfs_ioc_jail(zfs_cmd_t *zc)
{
return (zone_dataset_attach(curthread->td_ucred, zc->zc_name,
(int)zc->zc_jailid));
}
static int
zfs_ioc_unjail(zfs_cmd_t *zc)
{
return (zone_dataset_detach(curthread->td_ucred, zc->zc_name,
(int)zc->zc_jailid));
}
/*
* innvl: {
* "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from
* (optional) "largeblockok" -> (value ignored)
* indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* (optional) "compressok" -> (value ignored)
* presence indicates compressed DRR_WRITE records are permitted
* (optional) "resume_object" and "resume_offset" -> (uint64)
* if present, resume send stream from specified object and offset.
* }
*
* outnvl is unused
*/
/* ARGSUSED */
static int
zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
{
cap_rights_t rights;
file_t *fp;
int error;
offset_t off;
char *fromname = NULL;
int fd;
boolean_t largeblockok;
boolean_t embedok;
boolean_t compressok;
uint64_t resumeobj = 0;
uint64_t resumeoff = 0;
error = nvlist_lookup_int32(innvl, "fd", &fd);
if (error != 0)
return (SET_ERROR(EINVAL));
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
compressok = nvlist_exists(innvl, "compressok");
(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
#ifdef illumos
file_t *fp = getf(fd);
#else
fget_write(curthread, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
#endif
if (fp == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
#ifdef illumos
fd, resumeobj, resumeoff, fp->f_vnode, &off);
#else
fd, resumeobj, resumeoff, fp, &off);
#endif
#ifdef illumos
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
#else
fp->f_offset = off;
#endif
releasef(fd);
return (error);
}
/*
* Determine approximately how large a zfs send stream will be -- the number
* of bytes that will be written to the fd supplied to zfs_ioc_send_new().
*
* innvl: {
* (optional) "from" -> full snap or bookmark name to send an incremental
* from
* (optional) "largeblockok" -> (value ignored)
* indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* (optional) "compressok" -> (value ignored)
* presence indicates compressed DRR_WRITE records are permitted
* }
*
* outnvl: {
* "space" -> bytes of space (uint64)
* }
*/
static int
zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
{
dsl_pool_t *dp;
dsl_dataset_t *tosnap;
int error;
char *fromname;
boolean_t compressok;
uint64_t space;
error = dsl_pool_hold(snapname, FTAG, &dp);
if (error != 0)
return (error);
error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
compressok = nvlist_exists(innvl, "compressok");
error = nvlist_lookup_string(innvl, "from", &fromname);
if (error == 0) {
if (strchr(fromname, '@') != NULL) {
/*
* If from is a snapshot, hold it and use the more
* efficient dmu_send_estimate to estimate send space
* size using deadlists.
*/
dsl_dataset_t *fromsnap;
error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
if (error != 0)
goto out;
error = dmu_send_estimate(tosnap, fromsnap, compressok,
&space);
dsl_dataset_rele(fromsnap, FTAG);
} else if (strchr(fromname, '#') != NULL) {
/*
* If from is a bookmark, fetch the creation TXG of the
* snapshot it was created from and use that to find
* blocks that were born after it.
*/
zfs_bookmark_phys_t frombm;
error = dsl_bookmark_lookup(dp, fromname, tosnap,
&frombm);
if (error != 0)
goto out;
error = dmu_send_estimate_from_txg(tosnap,
frombm.zbm_creation_txg, compressok, &space);
} else {
/*
* from is not properly formatted as a snapshot or
* bookmark
*/
error = SET_ERROR(EINVAL);
goto out;
}
} else {
/*
* If estimating the size of a full send, use dmu_send_estimate.
*/
error = dmu_send_estimate(tosnap, NULL, compressok, &space);
}
fnvlist_add_uint64(outnvl, "space", space);
out:
dsl_dataset_rele(tosnap, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
static void
zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
{
zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
ASSERT3U(ioc, <, ZFS_IOC_LAST);
ASSERT3P(vec->zvec_legacy_func, ==, NULL);
ASSERT3P(vec->zvec_func, ==, NULL);
vec->zvec_legacy_func = func;
vec->zvec_secpolicy = secpolicy;
vec->zvec_namecheck = namecheck;
vec->zvec_allow_log = log_history;
vec->zvec_pool_check = pool_check;
}
/*
* See the block comment at the beginning of this file for details on
* each argument to this function.
*/
static void
zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
boolean_t allow_log)
{
zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
ASSERT3U(ioc, <, ZFS_IOC_LAST);
ASSERT3P(vec->zvec_legacy_func, ==, NULL);
ASSERT3P(vec->zvec_func, ==, NULL);
/* if we are logging, the name must be valid */
ASSERT(!allow_log || namecheck != NO_NAME);
vec->zvec_name = name;
vec->zvec_func = func;
vec->zvec_secpolicy = secpolicy;
vec->zvec_namecheck = namecheck;
vec->zvec_pool_check = pool_check;
vec->zvec_smush_outnvlist = smush_outnvlist;
vec->zvec_allow_log = allow_log;
}
static void
zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
zfs_ioc_poolcheck_t pool_check)
{
zfs_ioctl_register_legacy(ioc, func, secpolicy,
POOL_NAME, log_history, pool_check);
}
static void
zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
{
zfs_ioctl_register_legacy(ioc, func, secpolicy,
DATASET_NAME, B_FALSE, pool_check);
}
static void
zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
{
zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
}
static void
zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
zfs_secpolicy_func_t *secpolicy)
{
zfs_ioctl_register_legacy(ioc, func, secpolicy,
NO_NAME, B_FALSE, POOL_CHECK_NONE);
}
static void
zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
{
zfs_ioctl_register_legacy(ioc, func, secpolicy,
DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED);
}
static void
zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
{
zfs_ioctl_register_dataset_read_secpolicy(ioc, func,
zfs_secpolicy_read);
}
static void
zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
zfs_secpolicy_func_t *secpolicy)
{
zfs_ioctl_register_legacy(ioc, func, secpolicy,
DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
}
static void
zfs_ioctl_init(void)
{
zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE);
zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("create", ZFS_IOC_CREATE,
zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("clone", ZFS_IOC_CLONE,
zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+ zfs_ioctl_register("remap", ZFS_IOC_REMAP,
+ zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("hold", ZFS_IOC_HOLD,
zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("release", ZFS_IOC_RELEASE,
zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM,
zfs_ioc_channel_program, zfs_secpolicy_config,
POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
B_TRUE);
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
zfs_ioc_pool_scan);
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
zfs_ioc_pool_upgrade);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
zfs_ioc_vdev_add);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
zfs_ioc_vdev_remove);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
zfs_ioc_vdev_set_state);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
zfs_ioc_vdev_attach);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
zfs_ioc_vdev_detach);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
zfs_ioc_vdev_setpath);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
zfs_ioc_vdev_setfru);
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
zfs_ioc_pool_set_props);
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
zfs_ioc_vdev_split);
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
zfs_ioc_pool_reguid);
zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
zfs_ioc_pool_configs, zfs_secpolicy_none);
zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
zfs_ioc_pool_tryimport, zfs_secpolicy_config);
zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
zfs_ioc_inject_fault, zfs_secpolicy_inject);
zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
zfs_ioc_clear_fault, zfs_secpolicy_inject);
zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
zfs_ioc_inject_list_next, zfs_secpolicy_inject);
/*
* pool destroy, and export don't log the history as part of
* zfsdev_ioctl, but rather zfs_ioc_pool_export
* does the logging of those commands.
*/
zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
zfs_ioc_dsobj_to_dsname,
zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
zfs_ioc_pool_get_history,
zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
zfs_ioc_space_written);
zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
zfs_ioc_objset_recvd_props);
zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
zfs_ioc_next_obj);
zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
zfs_ioc_get_fsacl);
zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
zfs_ioc_objset_stats);
zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
zfs_ioc_objset_zplprops);
zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
zfs_ioc_dataset_list_next);
zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
zfs_ioc_snapshot_list_next);
zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
zfs_ioc_send_progress);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
zfs_ioc_diff, zfs_secpolicy_diff);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
zfs_ioc_obj_to_path, zfs_secpolicy_diff);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
zfs_ioc_send, zfs_secpolicy_send);
zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
zfs_secpolicy_none);
zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
zfs_secpolicy_destroy);
zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
zfs_secpolicy_rename);
zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
zfs_secpolicy_recv);
zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
zfs_secpolicy_promote);
zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
zfs_secpolicy_set_fsacl);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
zfs_secpolicy_share, POOL_CHECK_NONE);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
#ifdef __FreeBSD__
zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
zfs_secpolicy_config, POOL_CHECK_NONE);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
zfs_secpolicy_config, POOL_CHECK_NONE);
zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
POOL_CHECK_NONE, B_FALSE, B_FALSE);
#endif
}
int
pool_status_check(const char *name, zfs_ioc_namecheck_t type,
zfs_ioc_poolcheck_t check)
{
spa_t *spa;
int error;
ASSERT(type == POOL_NAME || type == DATASET_NAME);
if (check & POOL_CHECK_NONE)
return (0);
error = spa_open(name, &spa, FTAG);
if (error == 0) {
if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
error = SET_ERROR(EAGAIN);
else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
error = SET_ERROR(EROFS);
spa_close(spa, FTAG);
}
return (error);
}
/*
* Find a free minor number.
*/
minor_t
zfsdev_minor_alloc(void)
{
static minor_t last_minor;
minor_t m;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
for (m = last_minor + 1; m != last_minor; m++) {
if (m > ZFSDEV_MAX_MINOR)
m = 1;
if (ddi_get_soft_state(zfsdev_state, m) == NULL) {
last_minor = m;
return (m);
}
}
return (0);
}
static int
zfs_ctldev_init(struct cdev *devp)
{
minor_t minor;
zfs_soft_state_t *zs;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
minor = zfsdev_minor_alloc();
if (minor == 0)
return (SET_ERROR(ENXIO));
if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
return (SET_ERROR(EAGAIN));
devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);
zs = ddi_get_soft_state(zfsdev_state, minor);
zs->zss_type = ZSST_CTLDEV;
zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
return (0);
}
static void
zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
zfs_onexit_destroy(zo);
ddi_soft_state_free(zfsdev_state, minor);
}
void *
zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
{
zfs_soft_state_t *zp;
zp = ddi_get_soft_state(zfsdev_state, minor);
if (zp == NULL || zp->zss_type != which)
return (NULL);
return (zp->zss_data);
}
static int
zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
{
int error = 0;
#ifdef illumos
if (getminor(*devp) != 0)
return (zvol_open(devp, flag, otyp, cr));
#endif
/* This is the control device. Allocate a new minor if requested. */
if (flag & FEXCL) {
mutex_enter(&spa_namespace_lock);
error = zfs_ctldev_init(devp);
mutex_exit(&spa_namespace_lock);
}
return (error);
}
static void
zfsdev_close(void *data)
{
zfs_onexit_t *zo;
minor_t minor = (minor_t)(uintptr_t)data;
if (minor == 0)
return;
mutex_enter(&spa_namespace_lock);
zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
if (zo == NULL) {
mutex_exit(&spa_namespace_lock);
return;
}
zfs_ctldev_destroy(zo, minor);
mutex_exit(&spa_namespace_lock);
}
static int
zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
struct thread *td)
{
zfs_cmd_t *zc;
uint_t vecnum;
int error, rc, len;
#ifdef illumos
minor_t minor = getminor(dev);
#else
zfs_iocparm_t *zc_iocparm;
int cflag, cmd, oldvecnum;
boolean_t newioc, compat;
void *compat_zc = NULL;
cred_t *cr = td->td_ucred;
#endif
const zfs_ioc_vec_t *vec;
char *saved_poolname = NULL;
nvlist_t *innvl = NULL;
cflag = ZFS_CMD_COMPAT_NONE;
compat = B_FALSE;
newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */
len = IOCPARM_LEN(zcmd);
vecnum = cmd = zcmd & 0xff;
/*
* Check if we are talking to supported older binaries
* and translate zfs_cmd if necessary
*/
if (len != sizeof(zfs_iocparm_t)) {
newioc = B_FALSE;
compat = B_TRUE;
vecnum = cmd;
switch (len) {
case sizeof(zfs_cmd_zcmd_t):
cflag = ZFS_CMD_COMPAT_LZC;
break;
case sizeof(zfs_cmd_deadman_t):
cflag = ZFS_CMD_COMPAT_DEADMAN;
break;
case sizeof(zfs_cmd_v28_t):
cflag = ZFS_CMD_COMPAT_V28;
break;
case sizeof(zfs_cmd_v15_t):
cflag = ZFS_CMD_COMPAT_V15;
vecnum = zfs_ioctl_v15_to_v28[cmd];
/*
* Return without further handling
* if the command is blacklisted.
*/
if (vecnum == ZFS_IOC_COMPAT_PASS)
return (0);
else if (vecnum == ZFS_IOC_COMPAT_FAIL)
return (ENOTSUP);
break;
default:
return (EINVAL);
}
}
#ifdef illumos
vecnum = cmd - ZFS_IOC_FIRST;
ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
#endif
if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
return (SET_ERROR(EINVAL));
vec = &zfs_ioc_vec[vecnum];
zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
#ifdef illumos
error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
if (error != 0) {
error = SET_ERROR(EFAULT);
goto out;
}
#else /* !illumos */
bzero(zc, sizeof(zfs_cmd_t));
if (newioc) {
zc_iocparm = (void *)arg;
switch (zc_iocparm->zfs_ioctl_version) {
case ZFS_IOCVER_CURRENT:
if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
error = SET_ERROR(EINVAL);
goto out;
}
break;
case ZFS_IOCVER_INLANES:
if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) {
error = SET_ERROR(EFAULT);
goto out;
}
compat = B_TRUE;
cflag = ZFS_CMD_COMPAT_INLANES;
break;
case ZFS_IOCVER_RESUME:
if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) {
error = SET_ERROR(EFAULT);
goto out;
}
compat = B_TRUE;
cflag = ZFS_CMD_COMPAT_RESUME;
break;
case ZFS_IOCVER_EDBP:
if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
error = SET_ERROR(EFAULT);
goto out;
}
compat = B_TRUE;
cflag = ZFS_CMD_COMPAT_EDBP;
break;
case ZFS_IOCVER_ZCMD:
if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
error = SET_ERROR(EFAULT);
goto out;
}
compat = B_TRUE;
cflag = ZFS_CMD_COMPAT_ZCMD;
break;
default:
error = SET_ERROR(EINVAL);
goto out;
/* NOTREACHED */
}
if (compat) {
ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
bzero(compat_zc, sizeof(zfs_cmd_t));
error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
compat_zc, zc_iocparm->zfs_cmd_size, flag);
if (error != 0) {
error = SET_ERROR(EFAULT);
goto out;
}
} else {
error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
zc, zc_iocparm->zfs_cmd_size, flag);
if (error != 0) {
error = SET_ERROR(EFAULT);
goto out;
}
}
}
if (compat) {
if (newioc) {
ASSERT(compat_zc != NULL);
zfs_cmd_compat_get(zc, compat_zc, cflag);
} else {
ASSERT(compat_zc == NULL);
zfs_cmd_compat_get(zc, arg, cflag);
}
oldvecnum = vecnum;
error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
if (error != 0)
goto out;
if (oldvecnum != vecnum)
vec = &zfs_ioc_vec[vecnum];
}
#endif /* !illumos */
zc->zc_iflags = flag & FKIOCTL;
if (zc->zc_nvlist_src_size != 0) {
error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &innvl);
if (error != 0)
goto out;
}
/* rewrite innvl for backwards compatibility */
if (compat)
innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag);
/*
* Ensure that all pool/dataset names are valid before we pass down to
* the lower layers.
*/
zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
switch (vec->zvec_namecheck) {
case POOL_NAME:
if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
error = SET_ERROR(EINVAL);
else
error = pool_status_check(zc->zc_name,
vec->zvec_namecheck, vec->zvec_pool_check);
break;
case DATASET_NAME:
if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
error = SET_ERROR(EINVAL);
else
error = pool_status_check(zc->zc_name,
vec->zvec_namecheck, vec->zvec_pool_check);
break;
case NO_NAME:
break;
}
if (error == 0)
error = vec->zvec_secpolicy(zc, innvl, cr);
if (error != 0)
goto out;
/* legacy ioctls can modify zc_name */
len = strcspn(zc->zc_name, "/@#") + 1;
saved_poolname = kmem_alloc(len, KM_SLEEP);
(void) strlcpy(saved_poolname, zc->zc_name, len);
if (vec->zvec_func != NULL) {
nvlist_t *outnvl;
int puterror = 0;
spa_t *spa;
nvlist_t *lognv = NULL;
ASSERT(vec->zvec_legacy_func == NULL);
/*
* Add the innvl to the lognv before calling the func,
* in case the func changes the innvl.
*/
if (vec->zvec_allow_log) {
lognv = fnvlist_alloc();
fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
vec->zvec_name);
if (!nvlist_empty(innvl)) {
fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
innvl);
}
}
outnvl = fnvlist_alloc();
error = vec->zvec_func(zc->zc_name, innvl, outnvl);
/*
* Some commands can partially execute, modfiy state, and still
* return an error. In these cases, attempt to record what
* was modified.
*/
if ((error == 0 ||
(cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) &&
vec->zvec_allow_log &&
spa_open(zc->zc_name, &spa, FTAG) == 0) {
if (!nvlist_empty(outnvl)) {
fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
outnvl);
}
if (error != 0) {
fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO,
error);
}
(void) spa_history_log_nvl(spa, lognv);
spa_close(spa, FTAG);
}
fnvlist_free(lognv);
/* rewrite outnvl for backwards compatibility */
if (compat)
outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
cflag);
if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
int smusherror = 0;
if (vec->zvec_smush_outnvlist) {
smusherror = nvlist_smush(outnvl,
zc->zc_nvlist_dst_size);
}
if (smusherror == 0)
puterror = put_nvlist(zc, outnvl);
}
if (puterror != 0)
error = puterror;
nvlist_free(outnvl);
} else {
error = vec->zvec_legacy_func(zc);
}
out:
nvlist_free(innvl);
#ifdef illumos
rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
if (error == 0 && rc != 0)
error = SET_ERROR(EFAULT);
#else
if (compat) {
zfs_ioctl_compat_post(zc, cmd, cflag);
if (newioc) {
ASSERT(compat_zc != NULL);
ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
rc = ddi_copyout(compat_zc,
(void *)(uintptr_t)zc_iocparm->zfs_cmd,
zc_iocparm->zfs_cmd_size, flag);
if (error == 0 && rc != 0)
error = SET_ERROR(EFAULT);
kmem_free(compat_zc, sizeof (zfs_cmd_t));
} else {
zfs_cmd_compat_put(zc, arg, vecnum, cflag);
}
} else {
ASSERT(newioc);
rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
sizeof (zfs_cmd_t), flag);
if (error == 0 && rc != 0)
error = SET_ERROR(EFAULT);
}
#endif
if (error == 0 && vec->zvec_allow_log) {
char *s = tsd_get(zfs_allow_log_key);
if (s != NULL)
strfree(s);
(void) tsd_set(zfs_allow_log_key, saved_poolname);
} else {
if (saved_poolname != NULL)
strfree(saved_poolname);
}
kmem_free(zc, sizeof (zfs_cmd_t));
return (error);
}
#ifdef illumos
static int
zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
if (cmd != DDI_ATTACH)
return (DDI_FAILURE);
if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
DDI_PSEUDO, 0) == DDI_FAILURE)
return (DDI_FAILURE);
zfs_dip = dip;
ddi_report_dev(dip);
return (DDI_SUCCESS);
}
static int
zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
if (spa_busy() || zfs_busy() || zvol_busy())
return (DDI_FAILURE);
if (cmd != DDI_DETACH)
return (DDI_FAILURE);
zfs_dip = NULL;
ddi_prop_remove_all(dip);
ddi_remove_minor_node(dip, NULL);
return (DDI_SUCCESS);
}
/*ARGSUSED*/
static int
zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
switch (infocmd) {
case DDI_INFO_DEVT2DEVINFO:
*result = zfs_dip;
return (DDI_SUCCESS);
case DDI_INFO_DEVT2INSTANCE:
*result = (void *)0;
return (DDI_SUCCESS);
}
return (DDI_FAILURE);
}
#endif /* illumos */
/*
* OK, so this is a little weird.
*
* /dev/zfs is the control node, i.e. minor 0.
* /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
*
* /dev/zfs has basically nothing to do except serve up ioctls,
* so most of the standard driver entry points are in zvol.c.
*/
#ifdef illumos
static struct cb_ops zfs_cb_ops = {
zfsdev_open, /* open */
zfsdev_close, /* close */
zvol_strategy, /* strategy */
nodev, /* print */
zvol_dump, /* dump */
zvol_read, /* read */
zvol_write, /* write */
zfsdev_ioctl, /* ioctl */
nodev, /* devmap */
nodev, /* mmap */
nodev, /* segmap */
nochpoll, /* poll */
ddi_prop_op, /* prop_op */
NULL, /* streamtab */
D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */
CB_REV, /* version */
nodev, /* async read */
nodev, /* async write */
};
static struct dev_ops zfs_dev_ops = {
DEVO_REV, /* version */
0, /* refcnt */
zfs_info, /* info */
nulldev, /* identify */
nulldev, /* probe */
zfs_attach, /* attach */
zfs_detach, /* detach */
nodev, /* reset */
&zfs_cb_ops, /* driver operations */
NULL, /* no bus operations */
NULL, /* power */
ddi_quiesce_not_needed, /* quiesce */
};
static struct modldrv zfs_modldrv = {
&mod_driverops,
"ZFS storage pool",
&zfs_dev_ops
};
static struct modlinkage modlinkage = {
MODREV_1,
(void *)&zfs_modlfs,
(void *)&zfs_modldrv,
NULL
};
#endif /* illumos */
static struct cdevsw zfs_cdevsw = {
.d_version = D_VERSION,
.d_open = zfsdev_open,
.d_ioctl = zfsdev_ioctl,
.d_name = ZFS_DEV_NAME
};
static void
zfs_allow_log_destroy(void *arg)
{
char *poolname = arg;
strfree(poolname);
}
static void
zfsdev_init(void)
{
zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
ZFS_DEV_NAME);
}
static void
zfsdev_fini(void)
{
if (zfsdev != NULL)
destroy_dev(zfsdev);
}
static struct root_hold_token *zfs_root_token;
struct proc *zfsproc;
#ifdef illumos
int
_init(void)
{
int error;
spa_init(FREAD | FWRITE);
zfs_init();
zvol_init();
zfs_ioctl_init();
if ((error = mod_install(&modlinkage)) != 0) {
zvol_fini();
zfs_fini();
spa_fini();
return (error);
}
tsd_create(&zfs_fsyncer_key, NULL);
tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
error = ldi_ident_from_mod(&modlinkage, &zfs_li);
ASSERT(error == 0);
mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
int
_fini(void)
{
int error;
if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
return (SET_ERROR(EBUSY));
if ((error = mod_remove(&modlinkage)) != 0)
return (error);
zvol_fini();
zfs_fini();
spa_fini();
if (zfs_nfsshare_inited)
(void) ddi_modclose(nfs_mod);
if (zfs_smbshare_inited)
(void) ddi_modclose(smbsrv_mod);
if (zfs_nfsshare_inited || zfs_smbshare_inited)
(void) ddi_modclose(sharefs_mod);
tsd_destroy(&zfs_fsyncer_key);
ldi_ident_release(zfs_li);
zfs_li = NULL;
mutex_destroy(&zfs_share_lock);
return (error);
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
#endif /* illumos */
static int zfs__init(void);
static int zfs__fini(void);
static void zfs_shutdown(void *, int);
static eventhandler_tag zfs_shutdown_event_tag;
#ifdef __FreeBSD__
#define ZFS_MIN_KSTACK_PAGES 4
#endif
int
zfs__init(void)
{
#ifdef __FreeBSD__
#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
"overflow panic!\nPlease consider adding "
"'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
ZFS_MIN_KSTACK_PAGES);
#endif
#endif
zfs_root_token = root_mount_hold("ZFS");
mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
spa_init(FREAD | FWRITE);
zfs_init();
zvol_init();
zfs_ioctl_init();
tsd_create(&zfs_fsyncer_key, NULL);
tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
tsd_create(&zfs_geom_probe_vdev_key, NULL);
printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
root_mount_rel(zfs_root_token);
zfsdev_init();
return (0);
}
int
zfs__fini(void)
{
if (spa_busy() || zfs_busy() || zvol_busy() ||
zio_injection_enabled) {
return (EBUSY);
}
zfsdev_fini();
zvol_fini();
zfs_fini();
spa_fini();
tsd_destroy(&zfs_fsyncer_key);
tsd_destroy(&rrw_tsd_key);
tsd_destroy(&zfs_allow_log_key);
mutex_destroy(&zfs_share_lock);
return (0);
}
static void
zfs_shutdown(void *arg __unused, int howto __unused)
{
/*
* ZFS fini routines can not properly work in a panic-ed system.
*/
if (panicstr == NULL)
(void)zfs__fini();
}
static int
zfs_modevent(module_t mod, int type, void *unused __unused)
{
int err;
switch (type) {
case MOD_LOAD:
err = zfs__init();
if (err == 0)
zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
shutdown_post_sync, zfs_shutdown, NULL,
SHUTDOWN_PRI_FIRST);
return (err);
case MOD_UNLOAD:
err = zfs__fini();
if (err == 0 && zfs_shutdown_event_tag != NULL)
EVENTHANDLER_DEREGISTER(shutdown_post_sync,
zfs_shutdown_event_tag);
return (err);
case MOD_SHUTDOWN:
return (0);
default:
break;
}
return (EOPNOTSUPP);
}
static moduledata_t zfs_mod = {
"zfsctrl",
zfs_modevent,
0
};
DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(zfsctrl, 1);
MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 332525)
@@ -1,2564 +1,2564 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
* All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/acl.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/mntent.h>
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/varargs.h>
#include <sys/policy.h>
#include <sys/atomic.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/sunddi.h>
#include <sys/dnlc.h>
#include <sys/dmu_objset.h>
#include <sys/spa_boot.h>
#include <sys/jail.h>
#include "zfs_comutil.h"
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
"File system owner can perform privileged operation on his file systems");
int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
"Debug level");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
"ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
"SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
"ZPL_VERSION");
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
struct ucred **credanonp, int *numsecflavors, int **secflavors);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_objset_close(zfsvfs_t *zfsvfs);
static void zfs_freevfs(vfs_t *vfsp);
struct vfsops zfs_vfsops = {
.vfs_mount = zfs_mount,
.vfs_unmount = zfs_umount,
.vfs_root = zfs_root,
.vfs_statfs = zfs_statfs,
.vfs_vget = zfs_vget,
.vfs_sync = zfs_sync,
.vfs_checkexp = zfs_checkexp,
.vfs_fhtovp = zfs_fhtovp,
};
VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
/*
* We need to keep a count of active fs's.
* This is necessary to prevent our module
* from being unloaded after a umount -f
*/
static uint32_t zfs_active_fs_count = 0;
/*ARGSUSED*/
static int
zfs_sync(vfs_t *vfsp, int waitfor)
{
/*
* Data integrity is job one. We don't want a compromised kernel
* writing to the storage pool, so we never sync during panic.
*/
if (panicstr)
return (0);
/*
* Ignore the system syncher. ZFS already commits async data
* at zfs_txg_timeout intervals.
*/
if (waitfor == MNT_LAZY)
return (0);
if (vfsp != NULL) {
/*
* Sync a specific filesystem.
*/
zfsvfs_t *zfsvfs = vfsp->vfs_data;
dsl_pool_t *dp;
int error;
error = vfs_stdsync(vfsp, waitfor);
if (error != 0)
return (error);
ZFS_ENTER(zfsvfs);
dp = dmu_objset_pool(zfsvfs->z_os);
/*
* If the system is shutting down, then skip any
* filesystems which may exist on a suspended pool.
*/
if (sys_shutdown && spa_suspended(dp->dp_spa)) {
ZFS_EXIT(zfsvfs);
return (0);
}
if (zfsvfs->z_log != NULL)
zil_commit(zfsvfs->z_log, 0);
ZFS_EXIT(zfsvfs);
} else {
/*
* Sync all ZFS filesystems. This is what happens when you
* run sync(1M). Unlike other filesystems, ZFS honors the
* request by waiting for all pools to commit all dirty data.
*/
spa_sync_allpools();
}
return (0);
}
#ifndef __FreeBSD_kernel__
static int
zfs_create_unique_device(dev_t *dev)
{
major_t new_major;
do {
ASSERT3U(zfs_minor, <=, MAXMIN32);
minor_t start = zfs_minor;
do {
mutex_enter(&zfs_dev_mtx);
if (zfs_minor >= MAXMIN32) {
/*
* If we're still using the real major
* keep out of /dev/zfs and /dev/zvol minor
* number space. If we're using a getudev()'ed
* major number, we can use all of its minors.
*/
if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
zfs_minor = ZFS_MIN_MINOR;
else
zfs_minor = 0;
} else {
zfs_minor++;
}
*dev = makedevice(zfs_major, zfs_minor);
mutex_exit(&zfs_dev_mtx);
} while (vfs_devismounted(*dev) && zfs_minor != start);
if (zfs_minor == start) {
/*
* We are using all ~262,000 minor numbers for the
* current major number. Create a new major number.
*/
if ((new_major = getudev()) == (major_t)-1) {
cmn_err(CE_WARN,
"zfs_mount: Can't get unique major "
"device number.");
return (-1);
}
mutex_enter(&zfs_dev_mtx);
zfs_major = new_major;
zfs_minor = 0;
mutex_exit(&zfs_dev_mtx);
} else {
break;
}
/* CONSTANTCONDITION */
} while (1);
return (0);
}
#endif /* !__FreeBSD_kernel__ */
static void
atime_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
if (newval == TRUE) {
zfsvfs->z_atime = TRUE;
zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
} else {
zfsvfs->z_atime = FALSE;
zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
}
}
static void
xattr_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
if (newval == TRUE) {
/* XXX locking on vfs_flag? */
#ifdef TODO
zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
#endif
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
} else {
/* XXX locking on vfs_flag? */
#ifdef TODO
zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
#endif
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
}
}
static void
blksz_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
ASSERT(ISP2(newval));
zfsvfs->z_max_blksz = newval;
zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
}
static void
readonly_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
if (newval) {
/* XXX locking on vfs_flag? */
zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
} else {
/* XXX locking on vfs_flag? */
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
}
}
static void
setuid_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
if (newval == FALSE) {
zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
} else {
zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
}
}
static void
exec_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
if (newval == FALSE) {
zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
} else {
zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
}
}
/*
* The nbmand mount option can be changed at mount time.
* We can't allow it to be toggled on live file systems or incorrect
* behavior may be seen from cifs clients
*
* This property isn't registered via dsl_prop_register(), but this callback
* will be called when a file system is first mounted
*/
static void
nbmand_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
if (newval == FALSE) {
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
} else {
vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
}
}
static void
snapdir_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
zfsvfs->z_show_ctldir = newval;
}
static void
vscan_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
zfsvfs->z_vscan = newval;
}
static void
acl_mode_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
zfsvfs->z_acl_mode = newval;
}
static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
zfsvfs->z_acl_inherit = newval;
}
static int
zfs_register_callbacks(vfs_t *vfsp)
{
struct dsl_dataset *ds = NULL;
objset_t *os = NULL;
zfsvfs_t *zfsvfs = NULL;
uint64_t nbmand;
boolean_t readonly = B_FALSE;
boolean_t do_readonly = B_FALSE;
boolean_t setuid = B_FALSE;
boolean_t do_setuid = B_FALSE;
boolean_t exec = B_FALSE;
boolean_t do_exec = B_FALSE;
#ifdef illumos
boolean_t devices = B_FALSE;
boolean_t do_devices = B_FALSE;
#endif
boolean_t xattr = B_FALSE;
boolean_t do_xattr = B_FALSE;
boolean_t atime = B_FALSE;
boolean_t do_atime = B_FALSE;
int error = 0;
ASSERT(vfsp);
zfsvfs = vfsp->vfs_data;
ASSERT(zfsvfs);
os = zfsvfs->z_os;
/*
* This function can be called for a snapshot when we update snapshot's
* mount point, which isn't really supported.
*/
if (dmu_objset_is_snapshot(os))
return (EOPNOTSUPP);
/*
* The act of registering our callbacks will destroy any mount
* options we may have. In order to enable temporary overrides
* of mount options, we stash away the current values and
* restore them after we register the callbacks.
*/
if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
!spa_writeable(dmu_objset_spa(os))) {
readonly = B_TRUE;
do_readonly = B_TRUE;
} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
readonly = B_FALSE;
do_readonly = B_TRUE;
}
if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
setuid = B_FALSE;
do_setuid = B_TRUE;
} else {
if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
setuid = B_FALSE;
do_setuid = B_TRUE;
} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
setuid = B_TRUE;
do_setuid = B_TRUE;
}
}
if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
exec = B_FALSE;
do_exec = B_TRUE;
} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
exec = B_TRUE;
do_exec = B_TRUE;
}
if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
xattr = B_FALSE;
do_xattr = B_TRUE;
} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
xattr = B_TRUE;
do_xattr = B_TRUE;
}
if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
atime = B_FALSE;
do_atime = B_TRUE;
} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
atime = B_TRUE;
do_atime = B_TRUE;
}
/*
* We need to enter pool configuration here, so that we can use
* dsl_prop_get_int_ds() to handle the special nbmand property below.
* dsl_prop_get_integer() can not be used, because it has to acquire
* spa_namespace_lock and we can not do that because we already hold
- * z_teardown_lock. The problem is that spa_config_sync() is called
+ * z_teardown_lock. The problem is that spa_write_cachefile() is called
* with spa_namespace_lock held and the function calls ZFS vnode
* operations to write the cache file and thus z_teardown_lock is
* acquired after spa_namespace_lock.
*/
ds = dmu_objset_ds(os);
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
/*
* nbmand is a special property. It can only be changed at
* mount time.
*
* This is weird, but it is documented to only be changeable
* at mount time.
*/
if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
nbmand = B_FALSE;
} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
nbmand = B_TRUE;
} else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
return (error);
}
/*
* Register property callbacks.
*
* It would probably be fine to just check for i/o error from
* the first prop_register(), but I guess I like to go
* overboard...
*/
error = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
#ifdef illumos
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
#endif
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (error)
goto unregister;
/*
* Invoke our callbacks to restore temporary mount options.
*/
if (do_readonly)
readonly_changed_cb(zfsvfs, readonly);
if (do_setuid)
setuid_changed_cb(zfsvfs, setuid);
if (do_exec)
exec_changed_cb(zfsvfs, exec);
if (do_xattr)
xattr_changed_cb(zfsvfs, xattr);
if (do_atime)
atime_changed_cb(zfsvfs, atime);
nbmand_changed_cb(zfsvfs, nbmand);
return (0);
unregister:
dsl_prop_unregister_all(ds, zfsvfs);
return (error);
}
static int
zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
uint64_t *userp, uint64_t *groupp)
{
/*
* Is it a valid type of object to track?
*/
if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
return (SET_ERROR(ENOENT));
/*
* If we have a NULL data pointer
* then assume the id's aren't changing and
* return EEXIST to the dmu to let it know to
* use the same ids
*/
if (data == NULL)
return (SET_ERROR(EEXIST));
if (bonustype == DMU_OT_ZNODE) {
znode_phys_t *znp = data;
*userp = znp->zp_uid;
*groupp = znp->zp_gid;
} else {
int hdrsize;
sa_hdr_phys_t *sap = data;
sa_hdr_phys_t sa = *sap;
boolean_t swap = B_FALSE;
ASSERT(bonustype == DMU_OT_SA);
if (sa.sa_magic == 0) {
/*
* This should only happen for newly created
* files that haven't had the znode data filled
* in yet.
*/
*userp = 0;
*groupp = 0;
return (0);
}
if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
sa.sa_magic = SA_MAGIC;
sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
swap = B_TRUE;
} else {
VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
}
hdrsize = sa_hdrsize(&sa);
VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
SA_UID_OFFSET));
*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
SA_GID_OFFSET));
if (swap) {
*userp = BSWAP_64(*userp);
*groupp = BSWAP_64(*groupp);
}
}
return (0);
}
static void
fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
char *domainbuf, int buflen, uid_t *ridp)
{
uint64_t fuid;
const char *domain;
fuid = zfs_strtonum(fuidstr, NULL);
domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
if (domain)
(void) strlcpy(domainbuf, domain, buflen);
else
domainbuf[0] = '\0';
*ridp = FUID_RID(fuid);
}
static uint64_t
zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
{
switch (type) {
case ZFS_PROP_USERUSED:
return (DMU_USERUSED_OBJECT);
case ZFS_PROP_GROUPUSED:
return (DMU_GROUPUSED_OBJECT);
case ZFS_PROP_USERQUOTA:
return (zfsvfs->z_userquota_obj);
case ZFS_PROP_GROUPQUOTA:
return (zfsvfs->z_groupquota_obj);
}
return (0);
}
int
zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
{
int error;
zap_cursor_t zc;
zap_attribute_t za;
zfs_useracct_t *buf = vbuf;
uint64_t obj;
if (!dmu_objset_userspace_present(zfsvfs->z_os))
return (SET_ERROR(ENOTSUP));
obj = zfs_userquota_prop_to_obj(zfsvfs, type);
if (obj == 0) {
*bufsizep = 0;
return (0);
}
for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
(error = zap_cursor_retrieve(&zc, &za)) == 0;
zap_cursor_advance(&zc)) {
if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
*bufsizep)
break;
fuidstr_to_sid(zfsvfs, za.za_name,
buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
buf->zu_space = za.za_first_integer;
buf++;
}
if (error == ENOENT)
error = 0;
ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
*cookiep = zap_cursor_serialize(&zc);
zap_cursor_fini(&zc);
return (error);
}
/*
* buf must be big enough (eg, 32 bytes)
*/
static int
id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
char *buf, boolean_t addok)
{
uint64_t fuid;
int domainid = 0;
if (domain && domain[0]) {
domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
if (domainid == -1)
return (SET_ERROR(ENOENT));
}
fuid = FUID_ENCODE(domainid, rid);
(void) sprintf(buf, "%llx", (longlong_t)fuid);
return (0);
}
int
zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t *valp)
{
char buf[32];
int err;
uint64_t obj;
*valp = 0;
if (!dmu_objset_userspace_present(zfsvfs->z_os))
return (SET_ERROR(ENOTSUP));
obj = zfs_userquota_prop_to_obj(zfsvfs, type);
if (obj == 0)
return (0);
err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
if (err)
return (err);
err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
if (err == ENOENT)
err = 0;
return (err);
}
int
zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t quota)
{
char buf[32];
int err;
dmu_tx_t *tx;
uint64_t *objp;
boolean_t fuid_dirtied;
if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
return (SET_ERROR(EINVAL));
if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
return (SET_ERROR(ENOTSUP));
objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
&zfsvfs->z_groupquota_obj;
err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
if (err)
return (err);
fuid_dirtied = zfsvfs->z_fuid_dirty;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
if (*objp == 0) {
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
zfs_userquota_prop_prefixes[type]);
}
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err) {
dmu_tx_abort(tx);
return (err);
}
mutex_enter(&zfsvfs->z_lock);
if (*objp == 0) {
*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
DMU_OT_NONE, 0, tx);
VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
}
mutex_exit(&zfsvfs->z_lock);
if (quota == 0) {
err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
if (err == ENOENT)
err = 0;
} else {
err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
}
ASSERT(err == 0);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
dmu_tx_commit(tx);
return (err);
}
boolean_t
zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
{
char buf[32];
uint64_t used, quota, usedobj, quotaobj;
int err;
usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
if (quotaobj == 0 || zfsvfs->z_replay)
return (B_FALSE);
(void) sprintf(buf, "%llx", (longlong_t)fuid);
err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
if (err != 0)
return (B_FALSE);
err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
if (err != 0)
return (B_FALSE);
return (used >= quota);
}
boolean_t
zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
{
uint64_t fuid;
uint64_t quotaobj;
quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
fuid = isgroup ? zp->z_gid : zp->z_uid;
if (quotaobj == 0 || zfsvfs->z_replay)
return (B_FALSE);
return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
}
/*
* Associate this zfsvfs with the given objset, which must be owned.
* This will cache a bunch of on-disk state from the objset in the
* zfsvfs.
*/
static int
zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
{
int error;
uint64_t val;
zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
zfsvfs->z_os = os;
error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
if (error != 0)
return (error);
if (zfsvfs->z_version >
zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
(void) printf("Can't mount a version %lld file system "
"on a version %lld pool\n. Pool must be upgraded to mount "
"this file system.", (u_longlong_t)zfsvfs->z_version,
(u_longlong_t)spa_version(dmu_objset_spa(os)));
return (SET_ERROR(ENOTSUP));
}
error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
if (error != 0)
return (error);
zfsvfs->z_norm = (int)val;
error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
if (error != 0)
return (error);
zfsvfs->z_utf8 = (val != 0);
error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
if (error != 0)
return (error);
zfsvfs->z_case = (uint_t)val;
/*
* Fold case on file systems that are always or sometimes case
* insensitive.
*/
if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
zfsvfs->z_case == ZFS_CASE_MIXED)
zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
uint64_t sa_obj = 0;
if (zfsvfs->z_use_sa) {
/* should either have both of these objects or none */
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
&sa_obj);
if (error != 0)
return (error);
}
error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
&zfsvfs->z_attr_table);
if (error != 0)
return (error);
if (zfsvfs->z_version >= ZPL_VERSION_SA)
sa_register_update_callback(os, zfs_sa_upgrade);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
&zfsvfs->z_root);
if (error != 0)
return (error);
ASSERT(zfsvfs->z_root != 0);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
&zfsvfs->z_unlinkedobj);
if (error != 0)
return (error);
error = zap_lookup(os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
8, 1, &zfsvfs->z_userquota_obj);
if (error == ENOENT)
zfsvfs->z_userquota_obj = 0;
else if (error != 0)
return (error);
error = zap_lookup(os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
8, 1, &zfsvfs->z_groupquota_obj);
if (error == ENOENT)
zfsvfs->z_groupquota_obj = 0;
else if (error != 0)
return (error);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
&zfsvfs->z_fuid_obj);
if (error == ENOENT)
zfsvfs->z_fuid_obj = 0;
else if (error != 0)
return (error);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
&zfsvfs->z_shares_dir);
if (error == ENOENT)
zfsvfs->z_shares_dir = 0;
else if (error != 0)
return (error);
/*
* Only use the name cache if we are looking for a
* name on a file system that does not require normalization
* or case folding. We can also look there if we happen to be
* on a non-normalizing, mixed sensitivity file system IF we
* are looking for the exact name (which is always the case on
* FreeBSD).
*/
zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
((zfsvfs->z_case == ZFS_CASE_MIXED) &&
!(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
return (0);
}
int
zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
{
objset_t *os;
zfsvfs_t *zfsvfs;
int error;
/*
* XXX: Fix struct statfs so this isn't necessary!
*
* The 'osname' is used as the filesystem's special node, which means
* it must fit in statfs.f_mntfromname, or else it can't be
* enumerated, so libzfs_mnttab_find() returns NULL, which causes
* 'zfs unmount' to think it's not mounted when it is.
*/
if (strlen(osname) >= MNAMELEN)
return (SET_ERROR(ENAMETOOLONG));
zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
/*
* We claim to always be readonly so we can open snapshots;
* other ZPL code will prevent us from writing to snapshots.
*/
error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
if (error != 0) {
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
}
error = zfsvfs_create_impl(zfvp, zfsvfs, os);
if (error != 0) {
dmu_objset_disown(os, zfsvfs);
}
return (error);
}
int
zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
{
int error;
zfsvfs->z_vfs = NULL;
zfsvfs->z_parent = zfsvfs;
mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
#ifdef DIAGNOSTIC
rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
#else
rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
#endif
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
error = zfsvfs_init(zfsvfs, os);
if (error != 0) {
*zfvp = NULL;
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
}
*zfvp = zfsvfs;
return (0);
}
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
int error;
error = zfs_register_callbacks(zfsvfs->z_vfs);
if (error)
return (error);
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
/*
* If we are not mounting (ie: online recv), then we don't
* have to worry about replaying the log as we blocked all
* operations out since we closed the ZIL.
*/
if (mounting) {
boolean_t readonly;
/*
* During replay we remove the read only flag to
* allow replays to succeed.
*/
readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
if (readonly != 0)
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
else
zfs_unlinked_drain(zfsvfs);
/*
* Parse and replay the intent log.
*
* Because of ziltest, this must be done after
* zfs_unlinked_drain(). (Further note: ziltest
* doesn't use readonly mounts, where
* zfs_unlinked_drain() isn't called.) This is because
* ziltest causes spa_sync() to think it's committed,
* but actually it is not, so the intent log contains
* many txg's worth of changes.
*
* In particular, if object N is in the unlinked set in
* the last txg to actually sync, then it could be
* actually freed in a later txg and then reallocated
* in a yet later txg. This would write a "create
* object N" record to the intent log. Normally, this
* would be fine because the spa_sync() would have
* written out the fact that object N is free, before
* we could write the "create object N" intent log
* record.
*
* But when we are in ziltest mode, we advance the "open
* txg" without actually spa_sync()-ing the changes to
* disk. So we would see that object N is still
* allocated and in the unlinked set, and there is an
* intent log record saying to allocate it.
*/
if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
if (zil_replay_disable) {
zil_destroy(zfsvfs->z_log, B_FALSE);
} else {
zfsvfs->z_replay = B_TRUE;
zil_replay(zfsvfs->z_os, zfsvfs,
zfs_replay_vector);
zfsvfs->z_replay = B_FALSE;
}
}
zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
}
/*
* Set the objset user_ptr to track its zfsvfs.
*/
mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
return (0);
}
extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
int i;
/*
* This is a barrier to prevent the filesystem from going away in
* zfs_znode_move() until we can safely ensure that the filesystem is
* not unmounted. We consider the filesystem valid before the barrier
* and invalid after the barrier.
*/
rw_enter(&zfsvfs_lock, RW_READER);
rw_exit(&zfsvfs_lock);
zfs_fuid_destroy(zfsvfs);
mutex_destroy(&zfsvfs->z_znodes_lock);
mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
rrm_destroy(&zfsvfs->z_teardown_lock);
rw_destroy(&zfsvfs->z_teardown_inactive_lock);
rw_destroy(&zfsvfs->z_fuid_lock);
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_destroy(&zfsvfs->z_hold_mtx[i]);
kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
if (zfsvfs->z_vfs) {
if (zfsvfs->z_use_fuids) {
vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
} else {
vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
}
}
zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
uint64_t recordsize, fsid_guid;
int error = 0;
zfsvfs_t *zfsvfs;
vnode_t *vp;
ASSERT(vfsp);
ASSERT(osname);
error = zfsvfs_create(osname, &zfsvfs);
if (error)
return (error);
zfsvfs->z_vfs = vfsp;
#ifdef illumos
/* Initialize the generic filesystem structure. */
vfsp->vfs_bcount = 0;
vfsp->vfs_data = NULL;
if (zfs_create_unique_device(&mount_dev) == -1) {
error = SET_ERROR(ENODEV);
goto out;
}
ASSERT(vfs_devismounted(mount_dev) == 0);
#endif
if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
NULL))
goto out;
zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
vfsp->vfs_data = zfsvfs;
vfsp->mnt_flag |= MNT_LOCAL;
vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */
/*
* The fsid is 64 bits, composed of an 8-bit fs type, which
* separates our fsid from any other filesystem types, and a
* 56-bit objset unique ID. The objset unique ID is unique to
* all objsets open on this system, provided by unique_create().
* The 8-bit fs type must be put in the low bits of fsid[1]
* because that's where other Solaris filesystems put it.
*/
fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
vfsp->vfs_fsid.val[0] = fsid_guid;
vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
vfsp->mnt_vfc->vfc_typenum & 0xFF;
/*
* Set features for file system.
*/
zfs_set_fuid_feature(zfsvfs);
if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
}
vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
uint64_t pval;
atime_changed_cb(zfsvfs, B_FALSE);
readonly_changed_cb(zfsvfs, B_TRUE);
if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
goto out;
xattr_changed_cb(zfsvfs, pval);
zfsvfs->z_issnap = B_TRUE;
zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
} else {
error = zfsvfs_setup(zfsvfs, B_TRUE);
}
vfs_mountedfrom(vfsp, osname);
if (!zfsvfs->z_issnap)
zfsctl_create(zfsvfs);
out:
if (error) {
dmu_objset_disown(zfsvfs->z_os, zfsvfs);
zfsvfs_free(zfsvfs);
} else {
atomic_inc_32(&zfs_active_fs_count);
}
return (error);
}
void
zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
{
objset_t *os = zfsvfs->z_os;
if (!dmu_objset_is_snapshot(os))
dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
}
#ifdef SECLABEL
/*
* Convert a decimal digit string to a uint64_t integer.
*/
static int
str_to_uint64(char *str, uint64_t *objnum)
{
uint64_t num = 0;
while (*str) {
if (*str < '0' || *str > '9')
return (SET_ERROR(EINVAL));
num = num*10 + *str++ - '0';
}
*objnum = num;
return (0);
}
/*
* The boot path passed from the boot loader is in the form of
* "rootpool-name/root-filesystem-object-number'. Convert this
* string to a dataset name: "rootpool-name/root-filesystem-name".
*/
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
char *slashp;
uint64_t objnum;
int error;
if (*bpath == 0 || *bpath == '/')
return (SET_ERROR(EINVAL));
(void) strcpy(outpath, bpath);
slashp = strchr(bpath, '/');
/* if no '/', just return the pool name */
if (slashp == NULL) {
return (0);
}
/* if not a number, just return the root dataset name */
if (str_to_uint64(slashp+1, &objnum)) {
return (0);
}
*slashp = '\0';
error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
*slashp = '/';
return (error);
}
/*
* Check that the hex label string is appropriate for the dataset being
* mounted into the global_zone proper.
*
* Return an error if the hex label string is not default or
* admin_low/admin_high. For admin_low labels, the corresponding
* dataset must be readonly.
*/
int
zfs_check_global_label(const char *dsname, const char *hexsl)
{
if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
return (0);
if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
return (0);
if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
/* must be readonly */
uint64_t rdonly;
if (dsl_prop_get_integer(dsname,
zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
return (SET_ERROR(EACCES));
return (rdonly ? 0 : EACCES);
}
return (SET_ERROR(EACCES));
}
/*
* Determine whether the mount is allowed according to MAC check.
* by comparing (where appropriate) label of the dataset against
* the label of the zone being mounted into. If the dataset has
* no label, create one.
*
* Returns 0 if access allowed, error otherwise (e.g. EACCES)
*/
static int
zfs_mount_label_policy(vfs_t *vfsp, char *osname)
{
int error, retv;
zone_t *mntzone = NULL;
ts_label_t *mnt_tsl;
bslabel_t *mnt_sl;
bslabel_t ds_sl;
char ds_hexsl[MAXNAMELEN];
retv = EACCES; /* assume the worst */
/*
* Start by getting the dataset label if it exists.
*/
error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1, sizeof (ds_hexsl), &ds_hexsl, NULL);
if (error)
return (SET_ERROR(EACCES));
/*
* If labeling is NOT enabled, then disallow the mount of datasets
* which have a non-default label already. No other label checks
* are needed.
*/
if (!is_system_labeled()) {
if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
return (0);
return (SET_ERROR(EACCES));
}
/*
* Get the label of the mountpoint. If mounting into the global
* zone (i.e. mountpoint is not within an active zone and the
* zoned property is off), the label must be default or
* admin_low/admin_high only; no other checks are needed.
*/
mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
if (mntzone->zone_id == GLOBAL_ZONEID) {
uint64_t zoned;
zone_rele(mntzone);
if (dsl_prop_get_integer(osname,
zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
return (SET_ERROR(EACCES));
if (!zoned)
return (zfs_check_global_label(osname, ds_hexsl));
else
/*
* This is the case of a zone dataset being mounted
* initially, before the zone has been fully created;
* allow this mount into global zone.
*/
return (0);
}
mnt_tsl = mntzone->zone_slabel;
ASSERT(mnt_tsl != NULL);
label_hold(mnt_tsl);
mnt_sl = label2bslabel(mnt_tsl);
if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
/*
* The dataset doesn't have a real label, so fabricate one.
*/
char *str = NULL;
if (l_to_str_internal(mnt_sl, &str) == 0 &&
dsl_prop_set_string(osname,
zfs_prop_to_name(ZFS_PROP_MLSLABEL),
ZPROP_SRC_LOCAL, str) == 0)
retv = 0;
if (str != NULL)
kmem_free(str, strlen(str) + 1);
} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
/*
* Now compare labels to complete the MAC check. If the
* labels are equal then allow access. If the mountpoint
* label dominates the dataset label, allow readonly access.
* Otherwise, access is denied.
*/
if (blequal(mnt_sl, &ds_sl))
retv = 0;
else if (bldominates(mnt_sl, &ds_sl)) {
vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
retv = 0;
}
}
label_rele(mnt_tsl);
zone_rele(mntzone);
return (retv);
}
#endif /* SECLABEL */
#ifdef OPENSOLARIS_MOUNTROOT
static int
zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
{
int error = 0;
static int zfsrootdone = 0;
zfsvfs_t *zfsvfs = NULL;
znode_t *zp = NULL;
vnode_t *vp = NULL;
char *zfs_bootfs;
char *zfs_devid;
ASSERT(vfsp);
/*
* The filesystem that we mount as root is defined in the
* boot property "zfs-bootfs" with a format of
* "poolname/root-dataset-objnum".
*/
if (why == ROOT_INIT) {
if (zfsrootdone++)
return (SET_ERROR(EBUSY));
/*
* the process of doing a spa_load will require the
* clock to be set before we could (for example) do
* something better by looking at the timestamp on
* an uberblock, so just set it to -1.
*/
clkset(-1);
if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
"bootfs name");
return (SET_ERROR(EINVAL));
}
zfs_devid = spa_get_bootprop("diskdevid");
error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
if (zfs_devid)
spa_free_bootprop(zfs_devid);
if (error) {
spa_free_bootprop(zfs_bootfs);
cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
error);
return (error);
}
if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
spa_free_bootprop(zfs_bootfs);
cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
error);
return (error);
}
spa_free_bootprop(zfs_bootfs);
if (error = vfs_lock(vfsp))
return (error);
if (error = zfs_domount(vfsp, rootfs.bo_name)) {
cmn_err(CE_NOTE, "zfs_domount: error %d", error);
goto out;
}
zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
ASSERT(zfsvfs);
if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
cmn_err(CE_NOTE, "zfs_zget: error %d", error);
goto out;
}
vp = ZTOV(zp);
mutex_enter(&vp->v_lock);
vp->v_flag |= VROOT;
mutex_exit(&vp->v_lock);
rootvp = vp;
/*
* Leave rootvp held. The root file system is never unmounted.
*/
vfs_add((struct vnode *)0, vfsp,
(vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
out:
vfs_unlock(vfsp);
return (error);
} else if (why == ROOT_REMOUNT) {
readonly_changed_cb(vfsp->vfs_data, B_FALSE);
vfsp->vfs_flag |= VFS_REMOUNT;
/* refresh mount options */
zfs_unregister_callbacks(vfsp->vfs_data);
return (zfs_register_callbacks(vfsp));
} else if (why == ROOT_UNMOUNT) {
zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
(void) zfs_sync(vfsp, 0, 0);
return (0);
}
/*
* if "why" is equal to anything else other than ROOT_INIT,
* ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
*/
return (SET_ERROR(ENOTSUP));
}
#endif /* OPENSOLARIS_MOUNTROOT */
static int
getpoolname(const char *osname, char *poolname)
{
char *p;
p = strchr(osname, '/');
if (p == NULL) {
if (strlen(osname) >= MAXNAMELEN)
return (ENAMETOOLONG);
(void) strcpy(poolname, osname);
} else {
if (p - osname >= MAXNAMELEN)
return (ENAMETOOLONG);
(void) strncpy(poolname, osname, p - osname);
poolname[p - osname] = '\0';
}
return (0);
}
/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp)
{
kthread_t *td = curthread;
vnode_t *mvp = vfsp->mnt_vnodecovered;
cred_t *cr = td->td_ucred;
char *osname;
int error = 0;
int canwrite;
#ifdef illumos
if (mvp->v_type != VDIR)
return (SET_ERROR(ENOTDIR));
mutex_enter(&mvp->v_lock);
if ((uap->flags & MS_REMOUNT) == 0 &&
(uap->flags & MS_OVERLAY) == 0 &&
(mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
mutex_exit(&mvp->v_lock);
return (SET_ERROR(EBUSY));
}
mutex_exit(&mvp->v_lock);
/*
* ZFS does not support passing unparsed data in via MS_DATA.
* Users should use the MS_OPTIONSTR interface; this means
* that all option parsing is already done and the options struct
* can be interrogated.
*/
if ((uap->flags & MS_DATA) && uap->datalen > 0)
return (SET_ERROR(EINVAL));
/*
* Get the objset name (the "special" mount argument).
*/
if (error = pn_get(uap->spec, fromspace, &spn))
return (error);
osname = spn.pn_path;
#else /* !illumos */
if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
return (SET_ERROR(EPERM));
if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
return (SET_ERROR(EINVAL));
/*
* If full-owner-access is enabled and delegated administration is
* turned on, we must set nosuid.
*/
if (zfs_super_owner &&
dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
secpolicy_fs_mount_clearopts(cr, vfsp);
}
#endif /* illumos */
/*
* Check for mount privilege?
*
* If we don't have privilege then see if
* we have local permission to allow it
*/
error = secpolicy_fs_mount(cr, mvp, vfsp);
if (error) {
if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
goto out;
if (!(vfsp->vfs_flag & MS_REMOUNT)) {
vattr_t vattr;
/*
* Make sure user is the owner of the mount point
* or has sufficient privileges.
*/
vattr.va_mask = AT_UID;
vn_lock(mvp, LK_SHARED | LK_RETRY);
if (VOP_GETATTR(mvp, &vattr, cr)) {
VOP_UNLOCK(mvp, 0);
goto out;
}
if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
VOP_UNLOCK(mvp, 0);
goto out;
}
VOP_UNLOCK(mvp, 0);
}
secpolicy_fs_mount_clearopts(cr, vfsp);
}
/*
* Refuse to mount a filesystem if we are in a local zone and the
* dataset is not visible.
*/
if (!INGLOBALZONE(curthread) &&
(!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
error = SET_ERROR(EPERM);
goto out;
}
#ifdef SECLABEL
error = zfs_mount_label_policy(vfsp, osname);
if (error)
goto out;
#endif
vfsp->vfs_flag |= MNT_NFS4ACLS;
/*
* When doing a remount, we simply refresh our temporary properties
* according to those options set in the current VFS options.
*/
if (vfsp->vfs_flag & MS_REMOUNT) {
zfsvfs_t *zfsvfs = vfsp->vfs_data;
/*
* Refresh mount options with z_teardown_lock blocking I/O while
* the filesystem is in an inconsistent state.
* The lock also serializes this code with filesystem
* manipulations between entry to zfs_suspend_fs() and return
* from zfs_resume_fs().
*/
rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
zfs_unregister_callbacks(zfsvfs);
error = zfs_register_callbacks(vfsp);
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
goto out;
}
/* Initial root mount: try hard to import the requested root pool. */
if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
(vfsp->vfs_flag & MNT_UPDATE) == 0) {
char pname[MAXNAMELEN];
error = getpoolname(osname, pname);
if (error == 0)
error = spa_import_rootpool(pname);
if (error)
goto out;
}
DROP_GIANT();
error = zfs_domount(vfsp, osname);
PICKUP_GIANT();
#ifdef illumos
/*
* Add an extra VFS_HOLD on our parent vfs so that it can't
* disappear due to a forced unmount.
*/
if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
VFS_HOLD(mvp->v_vfsp);
#endif
out:
return (error);
}
static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
{
zfsvfs_t *zfsvfs = vfsp->vfs_data;
uint64_t refdbytes, availbytes, usedobjs, availobjs;
statp->f_version = STATFS_VERSION;
ZFS_ENTER(zfsvfs);
dmu_objset_space(zfsvfs->z_os,
&refdbytes, &availbytes, &usedobjs, &availobjs);
/*
* The underlying storage pool actually uses multiple block sizes.
* We report the fragsize as the smallest block size we support,
* and we report our blocksize as the filesystem's maximum blocksize.
*/
statp->f_bsize = SPA_MINBLOCKSIZE;
statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
/*
* The following report "total" blocks of various kinds in the
* file system, but reported in terms of f_frsize - the
* "fragment" size.
*/
statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
statp->f_bfree = availbytes / statp->f_bsize;
statp->f_bavail = statp->f_bfree; /* no root reservation */
/*
* statvfs() should really be called statufs(), because it assumes
* static metadata. ZFS doesn't preallocate files, so the best
* we can do is report the max that could possibly fit in f_files,
* and that minus the number actually used in f_ffree.
* For f_ffree, report the smaller of the number of object available
* and the number of blocks (each object will take at least a block).
*/
statp->f_ffree = MIN(availobjs, statp->f_bfree);
statp->f_files = statp->f_ffree + usedobjs;
/*
* We're a zfs filesystem.
*/
(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
sizeof(statp->f_mntfromname));
strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
sizeof(statp->f_mntonname));
statp->f_namemax = MAXNAMELEN - 1;
ZFS_EXIT(zfsvfs);
return (0);
}
static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
zfsvfs_t *zfsvfs = vfsp->vfs_data;
znode_t *rootzp;
int error;
ZFS_ENTER(zfsvfs);
error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
if (error == 0)
*vpp = ZTOV(rootzp);
ZFS_EXIT(zfsvfs);
if (error == 0) {
error = vn_lock(*vpp, flags);
if (error != 0) {
VN_RELE(*vpp);
*vpp = NULL;
}
}
return (error);
}
/*
* Teardown the zfsvfs::z_os.
*
* Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
* and 'z_teardown_inactive_lock' held.
*/
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
znode_t *zp;
rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
if (!unmounting) {
/*
* We purge the parent filesystem's vfsp as the parent
* filesystem and all of its snapshots have their vnode's
* v_vfsp set to the parent's filesystem's vfsp. Note,
* 'z_parent' is self referential for non-snapshots.
*/
(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
#ifdef FREEBSD_NAMECACHE
cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
}
/*
* Close the zil. NB: Can't close the zil while zfs_inactive
* threads are blocked as zil_close can call zfs_inactive.
*/
if (zfsvfs->z_log) {
zil_close(zfsvfs->z_log);
zfsvfs->z_log = NULL;
}
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
/*
* If we are not unmounting (ie: online recv) and someone already
* unmounted this file system while we were doing the switcheroo,
* or a reopen of z_os failed then just bail out now.
*/
if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
return (SET_ERROR(EIO));
}
/*
* At this point there are no vops active, and any new vops will
* fail with EIO since we have z_teardown_lock for writer (only
* relavent for forced unmount).
*
* Release all holds on dbufs.
*/
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
zp = list_next(&zfsvfs->z_all_znodes, zp))
if (zp->z_sa_hdl) {
ASSERT(ZTOV(zp)->v_count >= 0);
zfs_znode_dmu_fini(zp);
}
mutex_exit(&zfsvfs->z_znodes_lock);
/*
* If we are unmounting, set the unmounted flag and let new vops
* unblock. zfs_inactive will have the unmounted behavior, and all
* other vops will fail with EIO.
*/
if (unmounting) {
zfsvfs->z_unmounted = B_TRUE;
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
}
/*
* z_os will be NULL if there was an error in attempting to reopen
* zfsvfs, so just return as the properties had already been
* unregistered and cached data had been evicted before.
*/
if (zfsvfs->z_os == NULL)
return (0);
/*
* Unregister properties.
*/
zfs_unregister_callbacks(zfsvfs);
/*
* Evict cached data
*/
if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
dmu_objset_evict_dbufs(zfsvfs->z_os);
return (0);
}
/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
kthread_t *td = curthread;
zfsvfs_t *zfsvfs = vfsp->vfs_data;
objset_t *os;
cred_t *cr = td->td_ucred;
int ret;
ret = secpolicy_fs_unmount(cr, vfsp);
if (ret) {
if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
ZFS_DELEG_PERM_MOUNT, cr))
return (ret);
}
/*
* We purge the parent filesystem's vfsp as the parent filesystem
* and all of its snapshots have their vnode's v_vfsp set to the
* parent's filesystem's vfsp. Note, 'z_parent' is self
* referential for non-snapshots.
*/
(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
/*
* Unmount any snapshots mounted under .zfs before unmounting the
* dataset itself.
*/
if (zfsvfs->z_ctldir != NULL) {
if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
return (ret);
}
if (fflag & MS_FORCE) {
/*
* Mark file system as unmounted before calling
* vflush(FORCECLOSE). This way we ensure no future vnops
* will be called and risk operating on DOOMED vnodes.
*/
rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
zfsvfs->z_unmounted = B_TRUE;
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
}
/*
* Flush all the files.
*/
ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
if (ret != 0)
return (ret);
#ifdef illumos
if (!(fflag & MS_FORCE)) {
/*
* Check the number of active vnodes in the file system.
* Our count is maintained in the vfs structure, but the
* number is off by 1 to indicate a hold on the vfs
* structure itself.
*
* The '.zfs' directory maintains a reference of its
* own, and any active references underneath are
* reflected in the vnode count.
*/
if (zfsvfs->z_ctldir == NULL) {
if (vfsp->vfs_count > 1)
return (SET_ERROR(EBUSY));
} else {
if (vfsp->vfs_count > 2 ||
zfsvfs->z_ctldir->v_count > 1)
return (SET_ERROR(EBUSY));
}
}
#endif
VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
os = zfsvfs->z_os;
/*
* z_os will be NULL if there was an error in
* attempting to reopen zfsvfs.
*/
if (os != NULL) {
/*
* Unset the objset user_ptr.
*/
mutex_enter(&os->os_user_ptr_lock);
dmu_objset_set_user(os, NULL);
mutex_exit(&os->os_user_ptr_lock);
/*
* Finally release the objset
*/
dmu_objset_disown(os, zfsvfs);
}
/*
* We can now safely destroy the '.zfs' directory node.
*/
if (zfsvfs->z_ctldir != NULL)
zfsctl_destroy(zfsvfs);
zfs_freevfs(vfsp);
return (0);
}
static int
zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
{
zfsvfs_t *zfsvfs = vfsp->vfs_data;
znode_t *zp;
int err;
/*
* zfs_zget() can't operate on virtual entries like .zfs/ or
* .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
* This will make NFS to switch to LOOKUP instead of using VGET.
*/
if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
(zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
return (EOPNOTSUPP);
ZFS_ENTER(zfsvfs);
err = zfs_zget(zfsvfs, ino, &zp);
if (err == 0 && zp->z_unlinked) {
vrele(ZTOV(zp));
err = EINVAL;
}
if (err == 0)
*vpp = ZTOV(zp);
ZFS_EXIT(zfsvfs);
if (err == 0)
err = vn_lock(*vpp, flags);
if (err != 0)
*vpp = NULL;
return (err);
}
static int
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
struct ucred **credanonp, int *numsecflavors, int **secflavors)
{
zfsvfs_t *zfsvfs = vfsp->vfs_data;
/*
* If this is regular file system vfsp is the same as
* zfsvfs->z_parent->z_vfs, but if it is snapshot,
* zfsvfs->z_parent->z_vfs represents parent file system
* which we have to use here, because only this file system
* has mnt_export configured.
*/
return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
credanonp, numsecflavors, secflavors));
}
CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
struct componentname cn;
zfsvfs_t *zfsvfs = vfsp->vfs_data;
znode_t *zp;
vnode_t *dvp;
uint64_t object = 0;
uint64_t fid_gen = 0;
uint64_t gen_mask;
uint64_t zp_gen;
int i, err;
*vpp = NULL;
ZFS_ENTER(zfsvfs);
/*
* On FreeBSD we can get snapshot's mount point or its parent file
* system mount point depending if snapshot is already mounted or not.
*/
if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
zfid_long_t *zlfid = (zfid_long_t *)fidp;
uint64_t objsetid = 0;
uint64_t setgen = 0;
for (i = 0; i < sizeof (zlfid->zf_setid); i++)
objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
ZFS_EXIT(zfsvfs);
err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
if (err)
return (SET_ERROR(EINVAL));
ZFS_ENTER(zfsvfs);
}
if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
zfid_short_t *zfid = (zfid_short_t *)fidp;
for (i = 0; i < sizeof (zfid->zf_object); i++)
object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
for (i = 0; i < sizeof (zfid->zf_gen); i++)
fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
} else {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
/*
* A zero fid_gen means we are in .zfs or the .zfs/snapshot
* directory tree. If the object == zfsvfs->z_shares_dir, then
* we are in the .zfs/shares directory tree.
*/
if ((fid_gen == 0 &&
(object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
(zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
ZFS_EXIT(zfsvfs);
VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
if (object == ZFSCTL_INO_SNAPDIR) {
cn.cn_nameptr = "snapshot";
cn.cn_namelen = strlen(cn.cn_nameptr);
cn.cn_nameiop = LOOKUP;
cn.cn_flags = ISLASTCN | LOCKLEAF;
cn.cn_lkflags = flags;
VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
vput(dvp);
} else if (object == zfsvfs->z_shares_dir) {
/*
* XXX This branch must not be taken,
* if it is, then the lookup below will
* explode.
*/
cn.cn_nameptr = "shares";
cn.cn_namelen = strlen(cn.cn_nameptr);
cn.cn_nameiop = LOOKUP;
cn.cn_flags = ISLASTCN;
cn.cn_lkflags = flags;
VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
vput(dvp);
} else {
*vpp = dvp;
}
return (err);
}
gen_mask = -1ULL >> (64 - 8 * i);
dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
if (err = zfs_zget(zfsvfs, object, &zp)) {
ZFS_EXIT(zfsvfs);
return (err);
}
(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
sizeof (uint64_t));
zp_gen = zp_gen & gen_mask;
if (zp_gen == 0)
zp_gen = 1;
if (zp->z_unlinked || zp_gen != fid_gen) {
dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
vrele(ZTOV(zp));
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
*vpp = ZTOV(zp);
ZFS_EXIT(zfsvfs);
err = vn_lock(*vpp, flags);
if (err == 0)
vnode_create_vobject(*vpp, zp->z_size, curthread);
else
*vpp = NULL;
return (err);
}
/*
* Block out VOPs and close zfsvfs_t::z_os
*
* Note, if successful, then we return with the 'z_teardown_lock' and
* 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
* dataset and objset intact so that they can be atomically handed off during
* a subsequent rollback or recv operation and the resume thereafter.
*/
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
int error;
if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
return (error);
return (0);
}
/*
* Rebuild SA and release VOPs. Note that ownership of the underlying dataset
* is an invariant across any of the operations that can be performed while the
* filesystem was suspended. Whether it succeeded or failed, the preconditions
* are the same: the relevant objset and associated dataset are owned by
* zfsvfs, held, and long held on entry.
*/
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
int err;
znode_t *zp;
ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
/*
* We already own this, so just update the objset_t, as the one we
* had before may have been evicted.
*/
objset_t *os;
VERIFY3P(ds->ds_owner, ==, zfsvfs);
VERIFY(dsl_dataset_long_held(ds));
VERIFY0(dmu_objset_from_ds(ds, &os));
err = zfsvfs_init(zfsvfs, os);
if (err != 0)
goto bail;
VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
zfs_set_fuid_feature(zfsvfs);
/*
* Attempt to re-establish all the active znodes with
* their dbufs. If a zfs_rezget() fails, then we'll let
* any potential callers discover that via ZFS_ENTER_VERIFY_VP
* when they try to use their znode.
*/
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp;
zp = list_next(&zfsvfs->z_all_znodes, zp)) {
(void) zfs_rezget(zp);
}
mutex_exit(&zfsvfs->z_znodes_lock);
bail:
/* release the VOPs */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
if (err) {
/*
* Since we couldn't setup the sa framework, try to force
* unmount this file system.
*/
if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
vfs_ref(zfsvfs->z_vfs);
(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
}
}
return (err);
}
static void
zfs_freevfs(vfs_t *vfsp)
{
zfsvfs_t *zfsvfs = vfsp->vfs_data;
#ifdef illumos
/*
* If this is a snapshot, we have an extra VFS_HOLD on our parent
* from zfs_mount(). Release it here. If we came through
* zfs_mountroot() instead, we didn't grab an extra hold, so
* skip the VFS_RELE for rootvfs.
*/
if (zfsvfs->z_issnap && (vfsp != rootvfs))
VFS_RELE(zfsvfs->z_parent->z_vfs);
#endif
zfsvfs_free(zfsvfs);
atomic_dec_32(&zfs_active_fs_count);
}
#ifdef __i386__
static int desiredvnodes_backup;
#endif
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
int newdesiredvnodes;
desiredvnodes_backup = desiredvnodes;
/*
* We calculate newdesiredvnodes the same way it is done in
* vntblinit(). If it is equal to desiredvnodes, it means that
* it wasn't tuned by the administrator and we can tune it down.
*/
newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
vm_kmem_size / (5 * (sizeof(struct vm_object) +
sizeof(struct vnode))));
if (newdesiredvnodes == desiredvnodes)
desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
desiredvnodes = desiredvnodes_backup;
#endif
}
void
zfs_init(void)
{
printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
/*
* Initialize .zfs directory structures
*/
zfsctl_init();
/*
* Initialize znode cache, vnode ops, etc...
*/
zfs_znode_init();
/*
* Reduce number of vnodes. Originally number of vnodes is calculated
* with UFS inode in mind. We reduce it here, because it's too big for
* ZFS/i386.
*/
zfs_vnodes_adjust();
dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
}
void
zfs_fini(void)
{
zfsctl_fini();
zfs_znode_fini();
zfs_vnodes_adjust_back();
}
int
zfs_busy(void)
{
return (zfs_active_fs_count != 0);
}
int
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
int error;
objset_t *os = zfsvfs->z_os;
dmu_tx_t *tx;
if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
return (SET_ERROR(EINVAL));
if (newvers < zfsvfs->z_version)
return (SET_ERROR(EINVAL));
if (zfs_spa_version_map(newvers) >
spa_version(dmu_objset_spa(zfsvfs->z_os)))
return (SET_ERROR(ENOTSUP));
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
ZFS_SA_ATTRS);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
}
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
return (error);
}
error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
8, 1, &newvers, tx);
if (error) {
dmu_tx_commit(tx);
return (error);
}
if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
uint64_t sa_obj;
ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
SPA_VERSION_SA);
sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
DMU_OT_NONE, 0, tx);
error = zap_add(os, MASTER_NODE_OBJ,
ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
ASSERT0(error);
VERIFY(0 == sa_set_sa_object(os, sa_obj));
sa_register_update_callback(os, zfs_sa_upgrade);
}
spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
"from %llu to %llu", zfsvfs->z_version, newvers);
dmu_tx_commit(tx);
zfsvfs->z_version = newvers;
zfs_set_fuid_feature(zfsvfs);
return (0);
}
/*
* Read a property stored within the master node.
*/
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
const char *pname;
int error = ENOENT;
/*
* Look up the file system's value for the property. For the
* version property, we look up a slightly different string.
*/
if (prop == ZFS_PROP_VERSION)
pname = ZPL_VERSION_STR;
else
pname = zfs_prop_to_name(prop);
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
}
if (error == ENOENT) {
/* No value set, use the default value */
switch (prop) {
case ZFS_PROP_VERSION:
*value = ZPL_VERSION;
break;
case ZFS_PROP_NORMALIZE:
case ZFS_PROP_UTF8ONLY:
*value = 0;
break;
case ZFS_PROP_CASE:
*value = ZFS_CASE_SENSITIVE;
break;
default:
return (error);
}
error = 0;
}
return (error);
}
/*
* Return true if the coresponding vfs's unmounted flag is set.
* Otherwise return false.
* If this function returns true we know VFS unmount has been initiated.
*/
boolean_t
zfs_get_vfs_flag_unmounted(objset_t *os)
{
zfsvfs_t *zfvp;
boolean_t unmounted = B_FALSE;
ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
mutex_enter(&os->os_user_ptr_lock);
zfvp = dmu_objset_get_user(os);
if (zfvp != NULL && zfvp->z_vfs != NULL &&
(zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
unmounted = B_TRUE;
mutex_exit(&os->os_user_ptr_lock);
return (unmounted);
}
#ifdef _KERNEL
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
char tmpbuf[MAXPATHLEN];
struct mount *mp;
char *fromname;
size_t oldlen;
oldlen = strlen(oldname);
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
fromname = mp->mnt_stat.f_mntfromname;
if (strcmp(fromname, oldname) == 0) {
(void)strlcpy(fromname, newname,
sizeof(mp->mnt_stat.f_mntfromname));
continue;
}
if (strncmp(fromname, oldname, oldlen) == 0 &&
(fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
(void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
newname, fromname + oldlen);
(void)strlcpy(fromname, tmpbuf,
sizeof(mp->mnt_stat.f_mntfromname));
continue;
}
}
mtx_unlock(&mountlist_mtx);
}
#endif
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 332525)
@@ -1,6060 +1,6072 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
*/
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/atomic.h>
#include <sys/namei.h>
#include <sys/mman.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sched.h>
#include <sys/acl.h>
#include <sys/vmmeter.h>
#include <vm/vm_param.h>
#include <sys/zil.h>
/*
* Programming rules.
*
* Each vnode op performs some logical unit of work. To do this, the ZPL must
* properly lock its in-core state, create a DMU transaction, do the work,
* record this work in the intent log (ZIL), commit the DMU transaction,
* and wait for the intent log to commit if it is a synchronous operation.
* Moreover, the vnode ops must work in both normal and log replay context.
* The ordering of events is important to avoid deadlocks and references
* to freed memory. The example below illustrates the following Big Rules:
*
* (1) A check must be made in each zfs thread for a mounted file system.
* This is done avoiding races using ZFS_ENTER(zfsvfs).
* A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
* must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
* can return EIO from the calling function.
*
* (2) VN_RELE() should always be the last thing except for zil_commit()
* (if necessary) and ZFS_EXIT(). This is for 3 reasons:
* First, if it's the last reference, the vnode/znode
* can be freed, so the zp may point to freed memory. Second, the last
* reference will call zfs_zinactive(), which may induce a lot of work --
* pushing cached pages (which acquires range locks) and syncing out
* cached atime changes. Third, zfs_zinactive() may require a new tx,
* which could deadlock the system if you were already holding one.
* If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
*
* (3) All range locks must be grabbed before calling dmu_tx_assign(),
* as they can span dmu_tx_assign() calls.
*
* (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
* dmu_tx_assign(). This is critical because we don't want to block
* while holding locks.
*
* If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
* reduces lock contention and CPU usage when we must wait (note that if
* throughput is constrained by the storage, nearly every transaction
* must wait).
*
* Note, in particular, that if a lock is sometimes acquired before
* the tx assigns, and sometimes after (e.g. z_lock), then failing
* to use a non-blocking assign can deadlock the system. The scenario:
*
* Thread A has grabbed a lock before calling dmu_tx_assign().
* Thread B is in an already-assigned tx, and blocks for this lock.
* Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
* forever, because the previous txg can't quiesce until B's tx commits.
*
* If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
* then drop all locks, call dmu_tx_wait(), and try again. On subsequent
* calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
* to indicate that this operation has already called dmu_tx_wait().
* This will ensure that we don't retry forever, waiting a short bit
* each time.
*
* (5) If the operation succeeded, generate the intent log entry for it
* before dropping locks. This ensures that the ordering of events
* in the intent log matches the order in which they actually occurred.
* During ZIL replay the zfs_log_* functions will update the sequence
* number to indicate the zil transaction has replayed.
*
* (6) At the end of each vnode op, the DMU tx must always commit,
* regardless of whether there were any errors.
*
* (7) After dropping all locks, invoke zil_commit(zilog, foid)
* to ensure that synchronous semantics are provided when necessary.
*
* In general, this is how things should be ordered in each vnode op:
*
* ZFS_ENTER(zfsvfs); // exit if unmounted
* top:
* zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
* rw_enter(...); // grab any other locks you need
* tx = dmu_tx_create(...); // get DMU tx
* dmu_tx_hold_*(); // hold each object you might modify
* error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
* if (error) {
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
* VN_RELE(...); // release held vnodes
* if (error == ERESTART) {
* waited = B_TRUE;
* dmu_tx_wait(tx);
* dmu_tx_abort(tx);
* goto top;
* }
* dmu_tx_abort(tx); // abort DMU tx
* ZFS_EXIT(zfsvfs); // finished in zfs
* return (error); // really out of space
* }
* error = do_real_work(); // do whatever this VOP does
* if (error == 0)
* zfs_log_*(...); // on success, make ZIL entry
* dmu_tx_commit(tx); // commit DMU tx -- error or not
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
* VN_RELE(...); // release held vnodes
* zil_commit(zilog, foid); // synchronous when necessary
* ZFS_EXIT(zfsvfs); // finished in zfs
* return (error); // done, report error
*/
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(*vpp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
((flag & FAPPEND) == 0)) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
ZTOV(zp)->v_type == VREG &&
!(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
if (fs_vscan(*vpp, cr, 0) != 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EACCES));
}
}
/* Keep a count of the synchronous opens in the znode */
if (flag & (FSYNC | FDSYNC))
atomic_inc_32(&zp->z_sync_cnt);
ZFS_EXIT(zfsvfs);
return (0);
}
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
/*
* Clean up any locks held by this process on the vp.
*/
cleanlocks(vp, ddi_get_pid(), 0);
cleanshares(vp, ddi_get_pid());
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
/* Decrement the synchronous opens in the znode */
if ((flag & (FSYNC | FDSYNC)) && (count == 1))
atomic_dec_32(&zp->z_sync_cnt);
if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
ZTOV(zp)->v_type == VREG &&
!(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
VERIFY(fs_vscan(vp, cr, 1) == 0);
ZFS_EXIT(zfsvfs);
return (0);
}
/*
* Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
* data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
*/
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
znode_t *zp = VTOZ(vp);
uint64_t noff = (uint64_t)*off; /* new offset */
uint64_t file_sz;
int error;
boolean_t hole;
file_sz = zp->z_size;
if (noff >= file_sz) {
return (SET_ERROR(ENXIO));
}
if (cmd == _FIO_SEEK_HOLE)
hole = B_TRUE;
else
hole = B_FALSE;
error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
if (error == ESRCH)
return (SET_ERROR(ENXIO));
/*
* We could find a hole that begins after the logical end-of-file,
* because dmu_offset_next() only works on whole blocks. If the
* EOF falls mid-block, then indicate that the "virtual hole"
* at the end of the file begins at the logical EOF, rather than
* at the end of the last block.
*/
if (noff > file_sz) {
ASSERT(hole);
noff = file_sz;
}
if (noff < *off)
return (error);
*off = noff;
return (error);
}
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
int *rvalp, caller_context_t *ct)
{
offset_t off;
offset_t ndata;
dmu_object_info_t doi;
int error;
zfsvfs_t *zfsvfs;
znode_t *zp;
switch (com) {
case _FIOFFS:
{
return (0);
/*
* The following two ioctls are used by bfu. Faking out,
* necessary to avoid bfu errors.
*/
}
case _FIOGDIO:
case _FIOSDIO:
{
return (0);
}
case _FIO_SEEK_DATA:
case _FIO_SEEK_HOLE:
{
#ifdef illumos
if (ddi_copyin((void *)data, &off, sizeof (off), flag))
return (SET_ERROR(EFAULT));
#else
off = *(offset_t *)data;
#endif
zp = VTOZ(vp);
zfsvfs = zp->z_zfsvfs;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
/* offset parameter is in/out */
error = zfs_holey(vp, com, &off);
ZFS_EXIT(zfsvfs);
if (error)
return (error);
#ifdef illumos
if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
return (SET_ERROR(EFAULT));
#else
*(offset_t *)data = off;
#endif
return (0);
}
#ifdef illumos
case _FIO_COUNT_FILLED:
{
/*
* _FIO_COUNT_FILLED adds a new ioctl command which
* exposes the number of filled blocks in a
* ZFS object.
*/
zp = VTOZ(vp);
zfsvfs = zp->z_zfsvfs;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
/*
* Wait for all dirty blocks for this object
* to get synced out to disk, and the DMU info
* updated.
*/
error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
if (error) {
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Retrieve fill count from DMU object.
*/
error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
if (error) {
ZFS_EXIT(zfsvfs);
return (error);
}
ndata = doi.doi_fill_count;
ZFS_EXIT(zfsvfs);
if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
return (SET_ERROR(EFAULT));
return (0);
}
#endif
}
return (SET_ERROR(ENOTTY));
}
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
vm_object_t obj;
vm_page_t pp;
int64_t end;
/*
* At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
* aligned boundaries, if the range is not aligned. As a result a
* DEV_BSIZE subrange with partially dirty data may get marked as clean.
* It may happen that all DEV_BSIZE subranges are marked clean and thus
* the whole page would be considred clean despite have some dirty data.
* For this reason we should shrink the range to DEV_BSIZE aligned
* boundaries before calling vm_page_clear_dirty.
*/
end = rounddown2(off + nbytes, DEV_BSIZE);
off = roundup2(off, DEV_BSIZE);
nbytes = end - off;
obj = vp->v_object;
zfs_vmobject_assert_wlocked(obj);
for (;;) {
if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
pp->valid) {
if (vm_page_xbusied(pp)) {
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
* likely to reclaim it.
*/
vm_page_reference(pp);
vm_page_lock(pp);
zfs_vmobject_wunlock(obj);
vm_page_busy_sleep(pp, "zfsmwb", true);
zfs_vmobject_wlock(obj);
continue;
}
vm_page_sbusy(pp);
} else if (pp != NULL) {
ASSERT(!pp->valid);
pp = NULL;
}
if (pp != NULL) {
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_object_pip_add(obj, 1);
pmap_remove_write(pp);
if (nbytes != 0)
vm_page_clear_dirty(pp, off, nbytes);
}
break;
}
return (pp);
}
static void
page_unbusy(vm_page_t pp)
{
vm_page_sunbusy(pp);
vm_object_pip_subtract(pp->object, 1);
}
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
vm_object_t obj;
vm_page_t pp;
obj = vp->v_object;
zfs_vmobject_assert_wlocked(obj);
for (;;) {
if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
pp->valid) {
if (vm_page_xbusied(pp)) {
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
* likely to reclaim it.
*/
vm_page_reference(pp);
vm_page_lock(pp);
zfs_vmobject_wunlock(obj);
vm_page_busy_sleep(pp, "zfsmwb", true);
zfs_vmobject_wlock(obj);
continue;
}
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_page_lock(pp);
vm_page_hold(pp);
vm_page_unlock(pp);
} else
pp = NULL;
break;
}
return (pp);
}
static void
page_unhold(vm_page_t pp)
{
vm_page_lock(pp);
vm_page_unhold(pp);
vm_page_unlock(pp);
}
/*
* When a file is memory mapped, we must keep the IO data synchronized
* between the DMU cache and the memory mapped pages. What this means:
*
* On Write: If we find a memory mapped page, we write to *both*
* the page and the dmu buffer.
*/
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
int segflg, dmu_tx_t *tx)
{
vm_object_t obj;
struct sf_buf *sf;
caddr_t va;
int off;
ASSERT(segflg != UIO_NOCOPY);
ASSERT(vp->v_mount != NULL);
obj = vp->v_object;
ASSERT(obj != NULL);
off = start & PAGEOFFSET;
zfs_vmobject_wlock(obj);
for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
vm_page_t pp;
int nbytes = imin(PAGESIZE - off, len);
if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
zfs_vmobject_wunlock(obj);
va = zfs_map_page(pp, &sf);
(void) dmu_read(os, oid, start+off, nbytes,
va+off, DMU_READ_PREFETCH);;
zfs_unmap_page(sf);
zfs_vmobject_wlock(obj);
page_unbusy(pp);
}
len -= nbytes;
off = 0;
}
vm_object_pip_wakeupn(obj, 0);
zfs_vmobject_wunlock(obj);
}
/*
* Read with UIO_NOCOPY flag means that sendfile(2) requests
* ZFS to populate a range of page cache pages with data.
*
* NOTE: this function could be optimized to pre-allocate
* all pages in advance, drain exclusive busy on all of them,
* map them into contiguous KVA region and populate them
* in one single dmu_read() call.
*/
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
znode_t *zp = VTOZ(vp);
objset_t *os = zp->z_zfsvfs->z_os;
struct sf_buf *sf;
vm_object_t obj;
vm_page_t pp;
int64_t start;
caddr_t va;
int len = nbytes;
int off;
int error = 0;
ASSERT(uio->uio_segflg == UIO_NOCOPY);
ASSERT(vp->v_mount != NULL);
obj = vp->v_object;
ASSERT(obj != NULL);
ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
zfs_vmobject_wlock(obj);
for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
int bytes = MIN(PAGESIZE, len);
pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
if (pp->valid == 0) {
zfs_vmobject_wunlock(obj);
va = zfs_map_page(pp, &sf);
error = dmu_read(os, zp->z_id, start, bytes, va,
DMU_READ_PREFETCH);
if (bytes != PAGESIZE && error == 0)
bzero(va + bytes, PAGESIZE - bytes);
zfs_unmap_page(sf);
zfs_vmobject_wlock(obj);
vm_page_sunbusy(pp);
vm_page_lock(pp);
if (error) {
if (pp->wire_count == 0 && pp->valid == 0 &&
!vm_page_busied(pp))
vm_page_free(pp);
} else {
pp->valid = VM_PAGE_BITS_ALL;
vm_page_activate(pp);
}
vm_page_unlock(pp);
} else {
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_page_sunbusy(pp);
}
if (error)
break;
uio->uio_resid -= bytes;
uio->uio_offset += bytes;
len -= bytes;
}
zfs_vmobject_wunlock(obj);
return (error);
}
/*
* When a file is memory mapped, we must keep the IO data synchronized
* between the DMU cache and the memory mapped pages. What this means:
*
* On Read: We "read" preferentially from memory mapped pages,
* else we default from the dmu buffer.
*
* NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
* the file is memory mapped.
*/
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
znode_t *zp = VTOZ(vp);
vm_object_t obj;
int64_t start;
caddr_t va;
int len = nbytes;
int off;
int error = 0;
ASSERT(vp->v_mount != NULL);
obj = vp->v_object;
ASSERT(obj != NULL);
start = uio->uio_loffset;
off = start & PAGEOFFSET;
zfs_vmobject_wlock(obj);
for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
vm_page_t pp;
uint64_t bytes = MIN(PAGESIZE - off, len);
if (pp = page_hold(vp, start)) {
struct sf_buf *sf;
caddr_t va;
zfs_vmobject_wunlock(obj);
va = zfs_map_page(pp, &sf);
#ifdef illumos
error = uiomove(va + off, bytes, UIO_READ, uio);
#else
error = vn_io_fault_uiomove(va + off, bytes, uio);
#endif
zfs_unmap_page(sf);
zfs_vmobject_wlock(obj);
page_unhold(pp);
} else {
zfs_vmobject_wunlock(obj);
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, bytes);
zfs_vmobject_wlock(obj);
}
len -= bytes;
off = 0;
if (error)
break;
}
zfs_vmobject_wunlock(obj);
return (error);
}
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
/*
* Read bytes from specified file into supplied buffer.
*
* IN: vp - vnode of file to be read from.
* uio - structure supplying read location, range info,
* and return buffer.
* ioflag - SYNC flags; used to provide FRSYNC semantics.
* cr - credentials of caller.
* ct - caller context
*
* OUT: uio - updated offset and range, buffer filled.
*
* RETURN: 0 on success, error code on failure.
*
* Side Effects:
* vp - atime updated if byte count > 0
*/
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ssize_t n, nbytes;
int error = 0;
rl_t *rl;
xuio_t *xuio = NULL;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
if (zp->z_pflags & ZFS_AV_QUARANTINED) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EACCES));
}
/*
* Validate file offset
*/
if (uio->uio_loffset < (offset_t)0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
/*
* Fasttrack empty reads
*/
if (uio->uio_resid == 0) {
ZFS_EXIT(zfsvfs);
return (0);
}
/*
* Check for mandatory locks
*/
if (MANDMODE(zp->z_mode)) {
if (error = chklock(vp, FREAD,
uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
ZFS_EXIT(zfsvfs);
return (error);
}
}
/*
* If we're in FRSYNC mode, sync out this znode before reading it.
*/
if (zfsvfs->z_log &&
(ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
zil_commit(zfsvfs->z_log, zp->z_id);
/*
* Lock the range against changes.
*/
rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
/*
* If we are reading past end-of-file we can skip
* to the end; but we might still need to set atime.
*/
if (uio->uio_loffset >= zp->z_size) {
error = 0;
goto out;
}
ASSERT(uio->uio_loffset < zp->z_size);
n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
#ifdef illumos
if ((uio->uio_extflg == UIO_XUIO) &&
(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
int nblk;
int blksz = zp->z_blksz;
uint64_t offset = uio->uio_loffset;
xuio = (xuio_t *)uio;
if ((ISP2(blksz))) {
nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
blksz)) / blksz;
} else {
ASSERT(offset + n <= blksz);
nblk = 1;
}
(void) dmu_xuio_init(xuio, nblk);
if (vn_has_cached_data(vp)) {
/*
* For simplicity, we always allocate a full buffer
* even if we only expect to read a portion of a block.
*/
while (--nblk >= 0) {
(void) dmu_xuio_add(xuio,
dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
blksz), 0, blksz);
}
}
}
#endif /* illumos */
while (n > 0) {
nbytes = MIN(n, zfs_read_chunk_size -
P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
#ifdef __FreeBSD__
if (uio->uio_segflg == UIO_NOCOPY)
error = mappedread_sf(vp, nbytes, uio);
else
#endif /* __FreeBSD__ */
if (vn_has_cached_data(vp)) {
error = mappedread(vp, nbytes, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes);
}
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = SET_ERROR(EIO);
break;
}
n -= nbytes;
}
out:
zfs_range_unlock(rl);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Write the bytes to a file.
*
* IN: vp - vnode of file to be written to.
* uio - structure supplying write location, range info,
* and data buffer.
* ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
* set if in append mode.
* cr - credentials of caller.
* ct - caller context (NFS/CIFS fem monitor only)
*
* OUT: uio - updated offset and range.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - ctime|mtime updated if byte count > 0
*/
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
rlim64_t limit = MAXOFFSET_T;
ssize_t start_resid = uio->uio_resid;
ssize_t tx_bytes;
uint64_t end_size;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog;
offset_t woff;
ssize_t n, nbytes;
rl_t *rl;
int max_blksz = zfsvfs->z_max_blksz;
int error = 0;
arc_buf_t *abuf;
iovec_t *aiov = NULL;
xuio_t *xuio = NULL;
int i_iov = 0;
int iovcnt = uio->uio_iovcnt;
iovec_t *iovp = uio->uio_iov;
int write_eof;
int count = 0;
sa_bulk_attr_t bulk[4];
uint64_t mtime[2], ctime[2];
/*
* Fasttrack empty write
*/
n = start_resid;
if (n == 0)
return (0);
if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
limit = MAXOFFSET_T;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
&zp->z_size, 8);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
&zp->z_pflags, 8);
/*
* In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
* callers might not be able to detect properly that we are read-only,
* so check it explicitly here.
*/
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EROFS));
}
/*
* If immutable or not appending then return EPERM.
* Intentionally allow ZFS_READONLY through here.
* See zfs_zaccess_common()
*/
if ((zp->z_pflags & ZFS_IMMUTABLE) ||
((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
(uio->uio_loffset < zp->z_size))) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
zilog = zfsvfs->z_log;
/*
* Validate file offset
*/
woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
if (woff < 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
/*
* Check for mandatory locks before calling zfs_range_lock()
* in order to prevent a deadlock with locks set via fcntl().
*/
if (MANDMODE((mode_t)zp->z_mode) &&
(error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
#ifdef illumos
/*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
* Skip this if uio contains loaned arc_buf.
*/
if ((uio->uio_extflg == UIO_XUIO) &&
(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
xuio = (xuio_t *)uio;
else
uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
/*
* If in append mode, set the io offset pointer to eof.
*/
if (ioflag & FAPPEND) {
/*
* Obtain an appending range lock to guarantee file append
* semantics. We reset the write offset once we have the lock.
*/
rl = zfs_range_lock(zp, 0, n, RL_APPEND);
woff = rl->r_off;
if (rl->r_len == UINT64_MAX) {
/*
* We overlocked the file because this write will cause
* the file block size to increase.
* Note that zp_size cannot change with this lock held.
*/
woff = zp->z_size;
}
uio->uio_loffset = woff;
} else {
/*
* Note that if the file block size will change as a result of
* this write, then this range lock will lock the entire file
* so that we can re-write the block safely.
*/
rl = zfs_range_lock(zp, woff, n, RL_WRITER);
}
if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (EFBIG);
}
if (woff >= limit) {
zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EFBIG));
}
if ((woff + n) > limit || woff > (limit - n))
n = limit - woff;
/* Will this write extend the file length? */
write_eof = (woff + n > zp->z_size);
end_size = MAX(zp->z_size, woff + n);
/*
* Write the file in reasonable size chunks. Each chunk is written
* in a separate transaction; this keeps the intent log records small
* and allows us to do more fine-grained space accounting.
*/
while (n > 0) {
abuf = NULL;
woff = uio->uio_loffset;
if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
if (abuf != NULL)
dmu_return_arcbuf(abuf);
error = SET_ERROR(EDQUOT);
break;
}
if (xuio && abuf == NULL) {
ASSERT(i_iov < iovcnt);
aiov = &iovp[i_iov];
abuf = dmu_xuio_arcbuf(xuio, i_iov);
dmu_xuio_clear(xuio, i_iov);
DTRACE_PROBE3(zfs_cp_write, int, i_iov,
iovec_t *, aiov, arc_buf_t *, abuf);
ASSERT((aiov->iov_base == abuf->b_data) ||
((char *)aiov->iov_base - (char *)abuf->b_data +
aiov->iov_len == arc_buf_size(abuf)));
i_iov++;
} else if (abuf == NULL && n >= max_blksz &&
woff >= zp->z_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
/*
* This write covers a full block. "Borrow" a buffer
* from the dmu so that we can fill it before we enter
* a transaction. This avoids the possibility of
* holding up the transaction if the data copy hangs
* up on a pagefault (e.g., from an NFS server mapping).
*/
size_t cbytes;
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
max_blksz);
ASSERT(abuf != NULL);
ASSERT(arc_buf_size(abuf) == max_blksz);
if (error = uiocopy(abuf->b_data, max_blksz,
UIO_WRITE, uio, &cbytes)) {
dmu_return_arcbuf(abuf);
break;
}
ASSERT(cbytes == max_blksz);
}
/*
* Start a transaction.
*/
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
if (abuf != NULL)
dmu_return_arcbuf(abuf);
break;
}
/*
* If zfs_range_lock() over-locked we grow the blocksize
* and then reduce the lock range. This will only happen
* on the first iteration since zfs_range_reduce() will
* shrink down r_len to the appropriate size.
*/
if (rl->r_len == UINT64_MAX) {
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
/*
* File's blocksize is already larger than the
* "recordsize" property. Only let it grow to
* the next power of 2.
*/
ASSERT(!ISP2(zp->z_blksz));
new_blksz = MIN(end_size,
1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
zfs_grow_blocksize(zp, new_blksz, tx);
zfs_range_reduce(rl, woff, n);
}
/*
* XXX - should we really limit each write to z_max_blksz?
* Perhaps we should use SPA_MAXBLOCKSIZE chunks?
*/
nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
if (woff + nbytes > zp->z_size)
vnode_pager_setsize(vp, woff + nbytes);
if (abuf == NULL) {
tx_bytes = uio->uio_resid;
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes, tx);
tx_bytes -= uio->uio_resid;
} else {
tx_bytes = nbytes;
ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
/*
* If this is not a full block write, but we are
* extending the file past EOF and this data starts
* block-aligned, use assign_arcbuf(). Otherwise,
* write via dmu_write().
*/
if (tx_bytes < max_blksz && (!write_eof ||
aiov->iov_base != abuf->b_data)) {
ASSERT(xuio);
dmu_write(zfsvfs->z_os, zp->z_id, woff,
aiov->iov_len, aiov->iov_base, tx);
dmu_return_arcbuf(abuf);
xuio_stat_wbuf_copied();
} else {
ASSERT(xuio || tx_bytes == max_blksz);
dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
woff, abuf, tx);
}
ASSERT(tx_bytes <= uio->uio_resid);
uioskip(uio, tx_bytes);
}
if (tx_bytes && vn_has_cached_data(vp)) {
update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
zp->z_id, uio->uio_segflg, tx);
}
/*
* If we made no progress, we're done. If we made even
* partial progress, update the znode and ZIL accordingly.
*/
if (tx_bytes == 0) {
(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
(void *)&zp->z_size, sizeof (uint64_t), tx);
dmu_tx_commit(tx);
ASSERT(error != 0);
break;
}
/*
* Clear Set-UID/Set-GID bits on successful write if not
* privileged and at least one of the excute bits is set.
*
* It would be nice to to this after all writes have
* been done, but that would still expose the ISUID/ISGID
* to another app after the partial write is committed.
*
* Note: we don't call zfs_fuid_map_id() here because
* user 0 is not an ephemeral uid.
*/
mutex_enter(&zp->z_acl_lock);
if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
(S_IXUSR >> 6))) != 0 &&
(zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
secpolicy_vnode_setid_retain(vp, cr,
(zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
uint64_t newmode;
zp->z_mode &= ~(S_ISUID | S_ISGID);
newmode = zp->z_mode;
(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
(void *)&newmode, sizeof (uint64_t), tx);
}
mutex_exit(&zp->z_acl_lock);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
/*
* Update the file size (zp_size) if it has changed;
* account for possible concurrent updates.
*/
while ((end_size = zp->z_size) < uio->uio_loffset) {
(void) atomic_cas_64(&zp->z_size, end_size,
uio->uio_loffset);
#ifdef illumos
ASSERT(error == 0);
#else
ASSERT(error == 0 || error == EFAULT);
#endif
}
/*
* If we are replaying and eof is non zero then force
* the file size to the specified eof. Note, there's no
* concurrency during replay.
*/
if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
zp->z_size = zfsvfs->z_replay_eof;
if (error == 0)
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
else
(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
dmu_tx_commit(tx);
if (error != 0)
break;
ASSERT(tx_bytes == nbytes);
n -= nbytes;
#ifdef illumos
if (!xuio && n > 0)
uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
}
zfs_range_unlock(rl);
/*
* If we're in replay mode, or we made no progress, return error.
* Otherwise, it's at least a partial write, so it's successful.
*/
if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
ZFS_EXIT(zfsvfs);
return (error);
}
#ifdef __FreeBSD__
/*
* EFAULT means that at least one page of the source buffer was not
* available. VFS will re-try remaining I/O upon this error.
*/
if (error == EFAULT) {
ZFS_EXIT(zfsvfs);
return (error);
}
#endif
if (ioflag & (FSYNC | FDSYNC) ||
zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, zp->z_id);
ZFS_EXIT(zfsvfs);
return (0);
}
void
zfs_get_done(zgd_t *zgd, int error)
{
znode_t *zp = zgd->zgd_private;
objset_t *os = zp->z_zfsvfs->z_os;
if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);
zfs_range_unlock(zgd->zgd_rl);
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
if (error == 0 && zgd->zgd_bp)
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
kmem_free(zgd, sizeof (zgd_t));
}
#ifdef DEBUG
static int zil_fault_io = 0;
#endif
/*
* Get data to generate a TX_WRITE intent log record.
*/
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
dmu_buf_t *db;
zgd_t *zgd;
int error = 0;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
/*
* Nothing to do if the file has been removed
*/
if (zfs_zget(zfsvfs, object, &zp) != 0)
return (SET_ERROR(ENOENT));
if (zp->z_unlinked) {
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
VN_RELE_ASYNC(ZTOV(zp),
dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
return (SET_ERROR(ENOENT));
}
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
zgd->zgd_private = zp;
/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
* log record (immediate); for large writes it's cheaper to
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
} else {
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
* that no one can change the data. We need to re-check
* blocksize after we get the lock in case it's changed!
*/
for (;;) {
uint64_t blkoff;
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
zgd->zgd_rl = zfs_range_lock(zp, offset, size,
RL_READER);
if (zp->z_blksz == size)
break;
offset += blkoff;
zfs_range_unlock(zgd->zgd_rl);
}
/* test for truncation needs to be done while range locked */
if (lr->lr_offset >= zp->z_size)
error = SET_ERROR(ENOENT);
#ifdef DEBUG
if (zil_fault_io) {
error = SET_ERROR(EIO);
zil_fault_io = 0;
}
#endif
if (error == 0)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
zgd->zgd_db = db;
zgd->zgd_bp = bp;
ASSERT(db->db_offset == offset);
ASSERT(db->db_size == size);
error = dmu_sync(zio, lr->lr_common.lrc_txg,
zfs_get_done, zgd);
ASSERT(error || lr->lr_length <= size);
/*
* On success, we need to wait for the write I/O
* initiated by dmu_sync() to complete before we can
* release this dbuf. We will finish everything up
* in the zfs_get_done() callback.
*/
if (error == 0)
return (0);
if (error == EALREADY) {
lr->lr_common.lrc_txtype = TX_WRITE2;
+ /*
+ * TX_WRITE2 relies on the data previously
+ * written by the TX_WRITE that caused
+ * EALREADY. We zero out the BP because
+ * it is the old, currently-on-disk BP,
+ * so there's no need to zio_flush() its
+ * vdevs (flushing would needlesly hurt
+ * performance, and doesn't work on
+ * indirect vdevs).
+ */
+ zgd->zgd_bp = NULL;
+ BP_ZERO(bp);
error = 0;
}
}
}
zfs_get_done(zgd, error);
return (error);
}
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
if (flag & V_ACE_MASK)
error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
else
error = zfs_zaccess_rwx(zp, mode, flag, cr);
ZFS_EXIT(zfsvfs);
return (error);
}
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
int error;
*vpp = arg;
error = vn_lock(*vpp, lkflags);
if (error != 0)
vrele(*vpp);
return (error);
}
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
znode_t *zdp = VTOZ(dvp);
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
int error;
int ltype;
ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
if ((zdp->z_pflags & ZFS_XATTR) == 0)
VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif
if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
ASSERT3P(dvp, ==, vp);
vref(dvp);
ltype = lkflags & LK_TYPE_MASK;
if (ltype != VOP_ISLOCKED(dvp)) {
if (ltype == LK_EXCLUSIVE)
vn_lock(dvp, LK_UPGRADE | LK_RETRY);
else /* if (ltype == LK_SHARED) */
vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
/*
* Relock for the "." case could leave us with
* reclaimed vnode.
*/
if (dvp->v_iflag & VI_DOOMED) {
vrele(dvp);
return (SET_ERROR(ENOENT));
}
}
return (0);
} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
/*
* Note that in this case, dvp is the child vnode, and we
* are looking up the parent vnode - exactly reverse from
* normal operation. Unlocking dvp requires some rather
* tricky unlock/relock dance to prevent mp from being freed;
* use vn_vget_ino_gen() which takes care of all that.
*
* XXX Note that there is a time window when both vnodes are
* unlocked. It is possible, although highly unlikely, that
* during that window the parent-child relationship between
* the vnodes may change, for example, get reversed.
* In that case we would have a wrong lock order for the vnodes.
* All other filesystems seem to ignore this problem, so we
* do the same here.
* A potential solution could be implemented as follows:
* - using LK_NOWAIT when locking the second vnode and retrying
* if necessary
* - checking that the parent-child relationship still holds
* after locking both vnodes and retrying if it doesn't
*/
error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
return (error);
} else {
error = vn_lock(vp, lkflags);
if (error != 0)
vrele(vp);
return (error);
}
}
/*
* Lookup an entry in a directory, or an extended attribute directory.
* If it exists, return a held vnode reference for it.
*
* IN: dvp - vnode of directory to search.
* nm - name of entry to lookup.
* pnp - full pathname to lookup [UNUSED].
* flags - LOOKUP_XATTR set if looking for an attribute.
* rdir - root directory vnode [UNUSED].
* cr - credentials of caller.
* ct - caller context
*
* OUT: vpp - vnode of located entry, NULL if not found.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* NA
*/
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
int nameiop, cred_t *cr, kthread_t *td, int flags)
{
znode_t *zdp = VTOZ(dvp);
znode_t *zp;
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
int error = 0;
/*
* Fast path lookup, however we must skip DNLC lookup
* for case folding or normalizing lookups because the
* DNLC code only stores the passed in name. This means
* creating 'a' and removing 'A' on a case insensitive
* file system would work, but DNLC still thinks 'a'
* exists and won't let you create it again on the next
* pass through fast path.
*/
if (!(flags & LOOKUP_XATTR)) {
if (dvp->v_type != VDIR) {
return (SET_ERROR(ENOTDIR));
} else if (zdp->z_sa_hdl == NULL) {
return (SET_ERROR(EIO));
}
}
DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zdp);
*vpp = NULL;
if (flags & LOOKUP_XATTR) {
#ifdef TODO
/*
* If the xattr property is off, refuse the lookup request.
*/
if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
#endif
/*
* We don't allow recursive attributes..
* Maybe someday we will.
*/
if (zdp->z_pflags & ZFS_XATTR) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Do we have permission to get into attribute directory?
*/
if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
B_FALSE, cr)) {
vrele(*vpp);
*vpp = NULL;
}
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Check accessibility of directory.
*/
if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
ZFS_EXIT(zfsvfs);
return (error);
}
if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
/*
* First handle the special cases.
*/
if ((cnp->cn_flags & ISDOTDOT) != 0) {
/*
* If we are a snapshot mounted under .zfs, return
* the vp for the snapshot directory.
*/
if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
struct componentname cn;
vnode_t *zfsctl_vp;
int ltype;
ZFS_EXIT(zfsvfs);
ltype = VOP_ISLOCKED(dvp);
VOP_UNLOCK(dvp, 0);
error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
&zfsctl_vp);
if (error == 0) {
cn.cn_nameptr = "snapshot";
cn.cn_namelen = strlen(cn.cn_nameptr);
cn.cn_nameiop = cnp->cn_nameiop;
cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
cn.cn_lkflags = cnp->cn_lkflags;
error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
vput(zfsctl_vp);
}
vn_lock(dvp, ltype | LK_RETRY);
return (error);
}
}
if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
ZFS_EXIT(zfsvfs);
if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
return (SET_ERROR(ENOTSUP));
error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
return (error);
}
/*
* The loop is retry the lookup if the parent-child relationship
* changes during the dot-dot locking complexities.
*/
for (;;) {
uint64_t parent;
error = zfs_dirlook(zdp, nm, &zp);
if (error == 0)
*vpp = ZTOV(zp);
ZFS_EXIT(zfsvfs);
if (error != 0)
break;
error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
if (error != 0) {
/*
* If we've got a locking error, then the vnode
* got reclaimed because of a force unmount.
* We never enter doomed vnodes into the name cache.
*/
*vpp = NULL;
return (error);
}
if ((cnp->cn_flags & ISDOTDOT) == 0)
break;
ZFS_ENTER(zfsvfs);
if (zdp->z_sa_hdl == NULL) {
error = SET_ERROR(EIO);
} else {
error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
&parent, sizeof (parent));
}
if (error != 0) {
ZFS_EXIT(zfsvfs);
vput(ZTOV(zp));
break;
}
if (zp->z_id == parent) {
ZFS_EXIT(zfsvfs);
break;
}
vput(ZTOV(zp));
}
out:
if (error != 0)
*vpp = NULL;
/* Translate errors and add SAVENAME when needed. */
if (cnp->cn_flags & ISLASTCN) {
switch (nameiop) {
case CREATE:
case RENAME:
if (error == ENOENT) {
error = EJUSTRETURN;
cnp->cn_flags |= SAVENAME;
break;
}
/* FALLTHROUGH */
case DELETE:
if (error == 0)
cnp->cn_flags |= SAVENAME;
break;
}
}
/* Insert name into cache (as non-existent) if appropriate. */
if (zfsvfs->z_use_namecache &&
error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
cache_enter(dvp, NULL, cnp);
/* Insert name into cache if appropriate. */
if (zfsvfs->z_use_namecache &&
error == 0 && (cnp->cn_flags & MAKEENTRY)) {
if (!(cnp->cn_flags & ISLASTCN) ||
(nameiop != DELETE && nameiop != RENAME)) {
cache_enter(dvp, *vpp, cnp);
}
}
return (error);
}
/*
* Attempt to create a new entry in a directory. If the entry
* already exists, truncate the file if permissible, else return
* an error. Return the vp of the created or trunc'd file.
*
* IN: dvp - vnode of directory to put new file entry in.
* name - name of new file entry.
* vap - attributes of new file.
* excl - flag indicating exclusive or non-exclusive mode.
* mode - mode to open file with.
* cr - credentials of caller.
* flag - large file flag [UNUSED].
* ct - caller context
* vsecp - ACL to be set
*
* OUT: vpp - vnode of created or trunc'd entry.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated if new entry created
* vp - ctime|mtime always, atime if new
*/
/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
vnode_t **vpp, cred_t *cr, kthread_t *td)
{
znode_t *zp, *dzp = VTOZ(dvp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
objset_t *os;
dmu_tx_t *tx;
int error;
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
void *vsecp = NULL;
int flag = 0;
uint64_t txtype;
/*
* If we have an ephemeral id, ACL, or XVATTR then
* make sure file system is at proper version
*/
ksid = crgetsid(cr, KSID_OWNER);
if (ksid)
uid = ksid_getid(ksid);
else
uid = crgetuid(cr);
if (zfsvfs->z_use_fuids == B_FALSE &&
(vsecp || (vap->va_mask & AT_XVATTR) ||
IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (SET_ERROR(EINVAL));
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
os = zfsvfs->z_os;
zilog = zfsvfs->z_log;
if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
if (vap->va_mask & AT_XVATTR) {
if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
crgetuid(cr), cr, vap->va_type)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
}
*vpp = NULL;
if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
vap->va_mode &= ~S_ISVTX;
error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
if (error) {
ZFS_EXIT(zfsvfs);
return (error);
}
ASSERT3P(zp, ==, NULL);
/*
* Create a new file object and update the directory
* to reference it.
*/
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
goto out;
}
/*
* We only support the creation of regular files in
* extended attribute directories.
*/
if ((dzp->z_pflags & ZFS_XATTR) &&
(vap->va_type != VREG)) {
error = SET_ERROR(EINVAL);
goto out;
}
if ((error = zfs_acl_ids_create(dzp, 0, vap,
cr, vsecp, &acl_ids)) != 0)
goto out;
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
error = SET_ERROR(EDQUOT);
goto out;
}
getnewvnode_reserve(1);
tx = dmu_tx_create(os);
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
if (!zfsvfs->z_use_sa &&
acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, acl_ids.z_aclp->z_acl_bytes);
}
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
zfs_log_create(zilog, tx, txtype, dzp, zp, name,
vsecp, acl_ids.z_fuidp, vap);
zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
getnewvnode_drop_reserve();
out:
if (error == 0) {
*vpp = ZTOV(zp);
}
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Remove an entry from a directory.
*
* IN: dvp - vnode of directory to remove entry from.
* name - name of entry to remove.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime
* vp - ctime (if nlink > 0)
*/
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
znode_t *dzp = VTOZ(dvp);
znode_t *zp = VTOZ(vp);
znode_t *xzp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t acl_obj, xattr_obj;
uint64_t obj = 0;
dmu_tx_t *tx;
boolean_t unlinked, toobig = FALSE;
uint64_t txtype;
int error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
ZFS_VERIFY_ZP(zp);
zilog = zfsvfs->z_log;
zp = VTOZ(vp);
xattr_obj = 0;
xzp = NULL;
if (error = zfs_zaccess_delete(dzp, zp, cr)) {
goto out;
}
/*
* Need to use rmdir for removing directories.
*/
if (vp->v_type == VDIR) {
error = SET_ERROR(EPERM);
goto out;
}
vnevent_remove(vp, dvp, name, ct);
obj = zp->z_id;
/* are there any extended attributes? */
error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
&xattr_obj, sizeof (xattr_obj));
if (error == 0 && xattr_obj) {
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
ASSERT0(error);
}
/*
* We may delete the znode now, or we may put it in the unlinked set;
* it depends on whether we're the last link, and on whether there are
* other holds on the vnode. So we dmu_tx_hold() the right things to
* allow for either case.
*/
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
if (xzp) {
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
/* charge as an update -- would be nice not to charge at all */
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
/*
* Mark this transaction as typically resulting in a net free of space
*/
dmu_tx_mark_netfree(tx);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Remove the directory entry.
*/
error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
if (error) {
dmu_tx_commit(tx);
goto out;
}
if (unlinked) {
zfs_unlinked_add(zp, tx);
vp->v_vflag |= VV_NOSYNC;
}
txtype = TX_REMOVE;
zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
dmu_tx_commit(tx);
out:
if (xzp)
vrele(ZTOV(xzp));
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Create a new directory and insert it into dvp using the name
* provided. Return a pointer to the inserted directory.
*
* IN: dvp - vnode of directory to add subdir to.
* dirname - name of new directory.
* vap - attributes of new directory.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
* vsecp - ACL to be set
*
* OUT: vpp - vnode of created directory.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated
* vp - ctime|mtime|atime updated
*/
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
znode_t *zp, *dzp = VTOZ(dvp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t txtype;
dmu_tx_t *tx;
int error;
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
ASSERT(vap->va_type == VDIR);
/*
* If we have an ephemeral id, ACL, or XVATTR then
* make sure file system is at proper version
*/
ksid = crgetsid(cr, KSID_OWNER);
if (ksid)
uid = ksid_getid(ksid);
else
uid = crgetuid(cr);
if (zfsvfs->z_use_fuids == B_FALSE &&
((vap->va_mask & AT_XVATTR) ||
IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (SET_ERROR(EINVAL));
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
zilog = zfsvfs->z_log;
if (dzp->z_pflags & ZFS_XATTR) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
if (zfsvfs->z_utf8 && u8_validate(dirname,
strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
if (vap->va_mask & AT_XVATTR) {
if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
crgetuid(cr), cr, vap->va_type)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
}
if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
NULL, &acl_ids)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* First make sure the new directory doesn't exist.
*
* Existence is checked first to make sure we don't return
* EACCES instead of EEXIST which can cause some applications
* to fail.
*/
*vpp = NULL;
if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
return (error);
}
ASSERT3P(zp, ==, NULL);
if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EDQUOT));
}
/*
* Add a new entry to the directory.
*/
getnewvnode_reserve(1);
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
acl_ids.z_aclp->z_acl_bytes);
}
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Create new node.
*/
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
/*
* Now put new name in parent dir.
*/
(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
*vpp = ZTOV(zp);
txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
acl_ids.z_fuidp, vap);
zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
getnewvnode_drop_reserve();
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (0);
}
/*
* Remove a directory subdir entry. If the current working
* directory is the same as the subdir to be removed, the
* remove will fail.
*
* IN: dvp - vnode of directory to remove from.
* name - name of directory to be removed.
* cwd - vnode of current working directory.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated
*/
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
znode_t *dzp = VTOZ(dvp);
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
dmu_tx_t *tx;
int error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
ZFS_VERIFY_ZP(zp);
zilog = zfsvfs->z_log;
if (error = zfs_zaccess_delete(dzp, zp, cr)) {
goto out;
}
if (vp->v_type != VDIR) {
error = SET_ERROR(ENOTDIR);
goto out;
}
vnevent_rmdir(vp, dvp, name, ct);
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
dmu_tx_mark_netfree(tx);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
cache_purge(dvp);
error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
if (error == 0) {
uint64_t txtype = TX_RMDIR;
zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
}
dmu_tx_commit(tx);
cache_purge(vp);
out:
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Read as many directory entries as will fit into the provided
* buffer from the given directory cursor position (specified in
* the uio structure).
*
* IN: vp - vnode of directory to read.
* uio - structure supplying read location, range info,
* and return buffer.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* OUT: uio - updated offset and range, buffer filled.
* eofp - set to true if end-of-file detected.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - atime updated
*
* Note that the low 4 bits of the cookie returned by zap is always zero.
* This allows us to use the low range for "special" directory entries:
* We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
* we use the offset 2 for the '.zfs' directory.
*/
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
znode_t *zp = VTOZ(vp);
iovec_t *iovp;
edirent_t *eodp;
dirent64_t *odp;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os;
caddr_t outbuf;
size_t bufsize;
zap_cursor_t zc;
zap_attribute_t zap;
uint_t bytes_wanted;
uint64_t offset; /* must be unsigned; checks for < 1 */
uint64_t parent;
int local_eof;
int outcount;
int error;
uint8_t prefetch;
boolean_t check_sysattrs;
uint8_t type;
int ncooks;
u_long *cooks = NULL;
int flags = 0;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
&parent, sizeof (parent))) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* If we are not given an eof variable,
* use a local one.
*/
if (eofp == NULL)
eofp = &local_eof;
/*
* Check for valid iov_len.
*/
if (uio->uio_iov->iov_len <= 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
/*
* Quit if directory has been removed (posix)
*/
if ((*eofp = zp->z_unlinked) != 0) {
ZFS_EXIT(zfsvfs);
return (0);
}
error = 0;
os = zfsvfs->z_os;
offset = uio->uio_loffset;
prefetch = zp->z_zn_prefetch;
/*
* Initialize the iterator cursor.
*/
if (offset <= 3) {
/*
* Start iteration from the beginning of the directory.
*/
zap_cursor_init(&zc, os, zp->z_id);
} else {
/*
* The offset is a serialized cursor.
*/
zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
}
/*
* Get space to change directory entries into fs independent format.
*/
iovp = uio->uio_iov;
bytes_wanted = iovp->iov_len;
if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
bufsize = bytes_wanted;
outbuf = kmem_alloc(bufsize, KM_SLEEP);
odp = (struct dirent64 *)outbuf;
} else {
bufsize = bytes_wanted;
outbuf = NULL;
odp = (struct dirent64 *)iovp->iov_base;
}
eodp = (struct edirent *)odp;
if (ncookies != NULL) {
/*
* Minimum entry size is dirent size and 1 byte for a file name.
*/
ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
*cookies = cooks;
*ncookies = ncooks;
}
/*
* If this VFS supports the system attribute view interface; and
* we're looking at an extended attribute directory; and we care
* about normalization conflicts on this vfs; then we must check
* for normalization conflicts with the sysattr name space.
*/
#ifdef TODO
check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
(vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
(flags & V_RDDIR_ENTFLAGS);
#else
check_sysattrs = 0;
#endif
/*
* Transform to file-system independent format
*/
outcount = 0;
while (outcount < bytes_wanted) {
ino64_t objnum;
ushort_t reclen;
off64_t *next = NULL;
/*
* Special case `.', `..', and `.zfs'.
*/
if (offset == 0) {
(void) strcpy(zap.za_name, ".");
zap.za_normalization_conflict = 0;
objnum = zp->z_id;
type = DT_DIR;
} else if (offset == 1) {
(void) strcpy(zap.za_name, "..");
zap.za_normalization_conflict = 0;
objnum = parent;
type = DT_DIR;
} else if (offset == 2 && zfs_show_ctldir(zp)) {
(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
zap.za_normalization_conflict = 0;
objnum = ZFSCTL_INO_ROOT;
type = DT_DIR;
} else {
/*
* Grab next entry.
*/
if (error = zap_cursor_retrieve(&zc, &zap)) {
if ((*eofp = (error == ENOENT)) != 0)
break;
else
goto update;
}
if (zap.za_integer_length != 8 ||
zap.za_num_integers != 1) {
cmn_err(CE_WARN, "zap_readdir: bad directory "
"entry, obj = %lld, offset = %lld\n",
(u_longlong_t)zp->z_id,
(u_longlong_t)offset);
error = SET_ERROR(ENXIO);
goto update;
}
objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
/*
* MacOS X can extract the object type here such as:
* uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
*/
type = ZFS_DIRENT_TYPE(zap.za_first_integer);
if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
zap.za_normalization_conflict =
xattr_sysattr_casechk(zap.za_name);
#else
panic("%s:%u: TODO", __func__, __LINE__);
#endif
}
}
if (flags & V_RDDIR_ACCFILTER) {
/*
* If we have no access at all, don't include
* this entry in the returned information
*/
znode_t *ezp;
if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
goto skip_entry;
if (!zfs_has_access(ezp, cr)) {
vrele(ZTOV(ezp));
goto skip_entry;
}
vrele(ZTOV(ezp));
}
if (flags & V_RDDIR_ENTFLAGS)
reclen = EDIRENT_RECLEN(strlen(zap.za_name));
else
reclen = DIRENT64_RECLEN(strlen(zap.za_name));
/*
* Will this entry fit in the buffer?
*/
if (outcount + reclen > bufsize) {
/*
* Did we manage to fit anything in the buffer?
*/
if (!outcount) {
error = SET_ERROR(EINVAL);
goto update;
}
break;
}
if (flags & V_RDDIR_ENTFLAGS) {
/*
* Add extended flag entry:
*/
eodp->ed_ino = objnum;
eodp->ed_reclen = reclen;
/* NOTE: ed_off is the offset for the *next* entry */
next = &(eodp->ed_off);
eodp->ed_eflags = zap.za_normalization_conflict ?
ED_CASE_CONFLICT : 0;
(void) strncpy(eodp->ed_name, zap.za_name,
EDIRENT_NAMELEN(reclen));
eodp = (edirent_t *)((intptr_t)eodp + reclen);
} else {
/*
* Add normal entry:
*/
odp->d_ino = objnum;
odp->d_reclen = reclen;
odp->d_namlen = strlen(zap.za_name);
(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
odp->d_type = type;
odp = (dirent64_t *)((intptr_t)odp + reclen);
}
outcount += reclen;
ASSERT(outcount <= bufsize);
/* Prefetch znode */
if (prefetch)
dmu_prefetch(os, objnum, 0, 0, 0,
ZIO_PRIORITY_SYNC_READ);
skip_entry:
/*
* Move to the next entry, fill in the previous offset.
*/
if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
zap_cursor_advance(&zc);
offset = zap_cursor_serialize(&zc);
} else {
offset += 1;
}
if (cooks != NULL) {
*cooks++ = offset;
ncooks--;
KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
}
}
zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
/* Subtract unused cookies */
if (ncookies != NULL)
*ncookies -= ncooks;
if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
iovp->iov_base += outcount;
iovp->iov_len -= outcount;
uio->uio_resid -= outcount;
} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
/*
* Reset the pointer.
*/
offset = uio->uio_loffset;
}
update:
zap_cursor_fini(&zc);
if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
kmem_free(outbuf, bufsize);
if (error == ENOENT)
error = 0;
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
uio->uio_loffset = offset;
ZFS_EXIT(zfsvfs);
if (error != 0 && cookies != NULL) {
free(*cookies, M_TEMP);
*cookies = NULL;
*ncookies = 0;
}
return (error);
}
ulong_t zfs_fsync_sync_cnt = 4;
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
zil_commit(zfsvfs->z_log, zp->z_id);
ZFS_EXIT(zfsvfs);
}
return (0);
}
/*
* Get the requested file attributes and place them in the provided
* vattr structure.
*
* IN: vp - vnode of file.
* vap - va_mask identifies requested attributes.
* If AT_XVATTR set, then optional attrs are requested
* flags - ATTR_NOACLCHECK (CIFS server context)
* cr - credentials of caller.
* ct - caller context
*
* OUT: vap - attribute values.
*
* RETURN: 0 (always succeeds).
*/
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error = 0;
uint32_t blksize;
u_longlong_t nblocks;
uint64_t links;
uint64_t mtime[2], ctime[2], crtime[2], rdev;
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap = NULL;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
sa_bulk_attr_t bulk[4];
int count = 0;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
if (vp->v_type == VBLK || vp->v_type == VCHR)
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
&rdev, 8);
if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
* Also, if we are the owner don't bother, since owner should
* always be allowed to read basic attributes of file.
*/
if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
(vap->va_uid != crgetuid(cr))) {
if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
skipaclchk, cr)) {
ZFS_EXIT(zfsvfs);
return (error);
}
}
/*
* Return all attributes. It's cheaper to provide the answer
* than to determine whether we were asked the question.
*/
vap->va_type = IFTOVT(zp->z_mode);
vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
#else
vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
#endif
vap->va_nodeid = zp->z_id;
if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
links = zp->z_links + 1;
else
links = zp->z_links;
vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */
vap->va_size = zp->z_size;
#ifdef illumos
vap->va_rdev = vp->v_rdev;
#else
if (vp->v_type == VBLK || vp->v_type == VCHR)
vap->va_rdev = zfs_cmpldev(rdev);
#endif
vap->va_seq = zp->z_seq;
vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
vap->va_filerev = zp->z_seq;
/*
* Add in any requested optional attributes and the create time.
* Also set the corresponding bits in the returned attribute bitmap.
*/
if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
xoap->xoa_archive =
((zp->z_pflags & ZFS_ARCHIVE) != 0);
XVA_SET_RTN(xvap, XAT_ARCHIVE);
}
if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
xoap->xoa_readonly =
((zp->z_pflags & ZFS_READONLY) != 0);
XVA_SET_RTN(xvap, XAT_READONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
xoap->xoa_system =
((zp->z_pflags & ZFS_SYSTEM) != 0);
XVA_SET_RTN(xvap, XAT_SYSTEM);
}
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
xoap->xoa_hidden =
((zp->z_pflags & ZFS_HIDDEN) != 0);
XVA_SET_RTN(xvap, XAT_HIDDEN);
}
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
xoap->xoa_nounlink =
((zp->z_pflags & ZFS_NOUNLINK) != 0);
XVA_SET_RTN(xvap, XAT_NOUNLINK);
}
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
xoap->xoa_immutable =
((zp->z_pflags & ZFS_IMMUTABLE) != 0);
XVA_SET_RTN(xvap, XAT_IMMUTABLE);
}
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
xoap->xoa_appendonly =
((zp->z_pflags & ZFS_APPENDONLY) != 0);
XVA_SET_RTN(xvap, XAT_APPENDONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
xoap->xoa_nodump =
((zp->z_pflags & ZFS_NODUMP) != 0);
XVA_SET_RTN(xvap, XAT_NODUMP);
}
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
xoap->xoa_opaque =
((zp->z_pflags & ZFS_OPAQUE) != 0);
XVA_SET_RTN(xvap, XAT_OPAQUE);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
xoap->xoa_av_quarantined =
((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
xoap->xoa_av_modified =
((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
vp->v_type == VREG) {
zfs_sa_get_scanstamp(zp, xvap);
}
if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
XVA_SET_RTN(xvap, XAT_REPARSE);
}
if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
xoap->xoa_generation = zp->z_gen;
XVA_SET_RTN(xvap, XAT_GEN);
}
if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
xoap->xoa_offline =
((zp->z_pflags & ZFS_OFFLINE) != 0);
XVA_SET_RTN(xvap, XAT_OFFLINE);
}
if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
xoap->xoa_sparse =
((zp->z_pflags & ZFS_SPARSE) != 0);
XVA_SET_RTN(xvap, XAT_SPARSE);
}
}
ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
ZFS_TIME_DECODE(&vap->va_mtime, mtime);
ZFS_TIME_DECODE(&vap->va_ctime, ctime);
ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
vap->va_blksize = blksize;
vap->va_bytes = nblocks << 9; /* nblocks * 512 */
if (zp->z_blksz == 0) {
/*
* Block size hasn't been set; suggest maximal I/O transfers.
*/
vap->va_blksize = zfsvfs->z_max_blksz;
}
ZFS_EXIT(zfsvfs);
return (0);
}
/*
* Set the file attributes to the values contained in the
* vattr structure.
*
* IN: vp - vnode of file to be modified.
* vap - new attribute values.
* If AT_XVATTR set, then optional attrs are being set
* flags - ATTR_UTIME set if non-default time values provided.
* - ATTR_NOACLCHECK (CIFS context only).
* cr - credentials of caller.
* ct - caller context
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - ctime updated, mtime updated if size changed.
*/
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog;
dmu_tx_t *tx;
vattr_t oldva;
xvattr_t tmpxvattr;
uint_t mask = vap->va_mask;
uint_t saved_mask = 0;
uint64_t saved_mode;
int trim_mask = 0;
uint64_t new_mode;
uint64_t new_uid, new_gid;
uint64_t xattr_obj;
uint64_t mtime[2], ctime[2];
znode_t *attrzp;
int need_policy = FALSE;
int err, err2;
zfs_fuid_info_t *fuidp = NULL;
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap;
zfs_acl_t *aclp;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
boolean_t fuid_dirtied = B_FALSE;
sa_bulk_attr_t bulk[7], xattr_bulk[7];
int count = 0, xattr_count = 0;
if (mask == 0)
return (0);
if (mask & AT_NOSET)
return (SET_ERROR(EINVAL));
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
zilog = zfsvfs->z_log;
/*
* Make sure that if we have ephemeral uid/gid or xvattr specified
* that file system is at proper version level
*/
if (zfsvfs->z_use_fuids == B_FALSE &&
(((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
(mask & AT_XVATTR))) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
if (mask & AT_SIZE && vp->v_type == VDIR) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EISDIR));
}
if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
/*
* If this is an xvattr_t, then get a pointer to the structure of
* optional attributes. If this is NULL, then we have a vattr_t.
*/
xoap = xva_getxoptattr(xvap);
xva_init(&tmpxvattr);
/*
* Immutable files can only alter immutable bit and atime
*/
if ((zp->z_pflags & ZFS_IMMUTABLE) &&
((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
/*
* Note: ZFS_READONLY is handled in zfs_zaccess_common.
*/
/*
* Verify timestamps doesn't overflow 32 bits.
* ZFS can handle large timestamps, but 32bit syscalls can't
* handle times greater than 2039. This check should be removed
* once large timestamps are fully supported.
*/
if (mask & (AT_ATIME | AT_MTIME)) {
if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EOVERFLOW));
}
}
if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EOVERFLOW));
}
attrzp = NULL;
aclp = NULL;
/* Can this be moved to before the top label? */
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EROFS));
}
/*
* First validate permissions
*/
if (mask & AT_SIZE) {
/*
* XXX - Note, we are not providing any open
* mode flags here (like FNDELAY), so we may
* block if there are locks present... this
* should be addressed in openat().
*/
/* XXX - would it be OK to generate a log record here? */
err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
if (err) {
ZFS_EXIT(zfsvfs);
return (err);
}
}
if (mask & (AT_ATIME|AT_MTIME) ||
((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
XVA_ISSET_REQ(xvap, XAT_READONLY) ||
XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
skipaclchk, cr);
}
if (mask & (AT_UID|AT_GID)) {
int idmask = (mask & (AT_UID|AT_GID));
int take_owner;
int take_group;
/*
* NOTE: even if a new mode is being set,
* we may clear S_ISUID/S_ISGID bits.
*/
if (!(mask & AT_MODE))
vap->va_mode = zp->z_mode;
/*
* Take ownership or chgrp to group we are a member of
*/
take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
take_group = (mask & AT_GID) &&
zfs_groupmember(zfsvfs, vap->va_gid, cr);
/*
* If both AT_UID and AT_GID are set then take_owner and
* take_group must both be set in order to allow taking
* ownership.
*
* Otherwise, send the check through secpolicy_vnode_setattr()
*
*/
if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
((idmask == AT_UID) && take_owner) ||
((idmask == AT_GID) && take_group)) {
if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
skipaclchk, cr) == 0) {
/*
* Remove setuid/setgid for non-privileged users
*/
secpolicy_setid_clear(vap, vp, cr);
trim_mask = (mask & (AT_UID|AT_GID));
} else {
need_policy = TRUE;
}
} else {
need_policy = TRUE;
}
}
oldva.va_mode = zp->z_mode;
zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
if (mask & AT_XVATTR) {
/*
* Update xvattr mask to include only those attributes
* that are actually changing.
*
* the bits will be restored prior to actually setting
* the attributes so the caller thinks they were set.
*/
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
if (xoap->xoa_appendonly !=
((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_APPENDONLY);
XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
}
}
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
if (xoap->xoa_nounlink !=
((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_NOUNLINK);
XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
}
}
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
if (xoap->xoa_immutable !=
((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
}
}
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
if (xoap->xoa_nodump !=
((zp->z_pflags & ZFS_NODUMP) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_NODUMP);
XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
}
}
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
if (xoap->xoa_av_modified !=
((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
}
}
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
if ((vp->v_type != VREG &&
xoap->xoa_av_quarantined) ||
xoap->xoa_av_quarantined !=
((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
}
}
if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
if (need_policy == FALSE &&
(XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
need_policy = TRUE;
}
}
if (mask & AT_MODE) {
if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
err = secpolicy_setid_setsticky_clear(vp, vap,
&oldva, cr);
if (err) {
ZFS_EXIT(zfsvfs);
return (err);
}
trim_mask |= AT_MODE;
} else {
need_policy = TRUE;
}
}
if (need_policy) {
/*
* If trim_mask is set then take ownership
* has been granted or write_acl is present and user
* has the ability to modify mode. In that case remove
* UID|GID and or MODE from mask so that
* secpolicy_vnode_setattr() doesn't revoke it.
*/
if (trim_mask) {
saved_mask = vap->va_mask;
vap->va_mask &= ~trim_mask;
if (trim_mask & AT_MODE) {
/*
* Save the mode, as secpolicy_vnode_setattr()
* will overwrite it with ova.va_mode.
*/
saved_mode = vap->va_mode;
}
}
err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
(int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
if (err) {
ZFS_EXIT(zfsvfs);
return (err);
}
if (trim_mask) {
vap->va_mask |= saved_mask;
if (trim_mask & AT_MODE) {
/*
* Recover the mode after
* secpolicy_vnode_setattr().
*/
vap->va_mode = saved_mode;
}
}
}
/*
* secpolicy_vnode_setattr, or take ownership may have
* changed va_mask
*/
mask = vap->va_mask;
if ((mask & (AT_UID | AT_GID))) {
err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
&xattr_obj, sizeof (xattr_obj));
if (err == 0 && xattr_obj) {
err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
if (err == 0) {
err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
if (err != 0)
vrele(ZTOV(attrzp));
}
if (err)
goto out2;
}
if (mask & AT_UID) {
new_uid = zfs_fuid_create(zfsvfs,
(uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
if (new_uid != zp->z_uid &&
zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
if (attrzp)
vput(ZTOV(attrzp));
err = SET_ERROR(EDQUOT);
goto out2;
}
}
if (mask & AT_GID) {
new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
cr, ZFS_GROUP, &fuidp);
if (new_gid != zp->z_gid &&
zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
if (attrzp)
vput(ZTOV(attrzp));
err = SET_ERROR(EDQUOT);
goto out2;
}
}
}
tx = dmu_tx_create(zfsvfs->z_os);
if (mask & AT_MODE) {
uint64_t pmode = zp->z_mode;
uint64_t acl_obj;
new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
!(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
err = SET_ERROR(EPERM);
goto out;
}
if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
goto out;
if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
/*
* Are we upgrading ACL from old V0 format
* to V1 format?
*/
if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
zfs_znode_acl_version(zp) ==
ZFS_ACL_VERSION_INITIAL) {
dmu_tx_hold_free(tx, acl_obj, 0,
DMU_OBJECT_END);
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, aclp->z_acl_bytes);
} else {
dmu_tx_hold_write(tx, acl_obj, 0,
aclp->z_acl_bytes);
}
} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, aclp->z_acl_bytes);
}
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
} else {
if ((mask & AT_XVATTR) &&
XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
else
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
}
if (attrzp) {
dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
}
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
zfs_sa_upgrade_txholds(tx, zp);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err)
goto out;
count = 0;
/*
* Set each attribute requested.
* We group settings according to the locks they need to acquire.
*
* Note: you cannot set ctime directly, although it will be
* updated as a side-effect of calling this function.
*/
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_enter(&zp->z_acl_lock);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
&zp->z_pflags, sizeof (zp->z_pflags));
if (attrzp) {
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_enter(&attrzp->z_acl_lock);
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
sizeof (attrzp->z_pflags));
}
if (mask & (AT_UID|AT_GID)) {
if (mask & AT_UID) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
&new_uid, sizeof (new_uid));
zp->z_uid = new_uid;
if (attrzp) {
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_UID(zfsvfs), NULL, &new_uid,
sizeof (new_uid));
attrzp->z_uid = new_uid;
}
}
if (mask & AT_GID) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
NULL, &new_gid, sizeof (new_gid));
zp->z_gid = new_gid;
if (attrzp) {
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_GID(zfsvfs), NULL, &new_gid,
sizeof (new_gid));
attrzp->z_gid = new_gid;
}
}
if (!(mask & AT_MODE)) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
NULL, &new_mode, sizeof (new_mode));
new_mode = zp->z_mode;
}
err = zfs_acl_chown_setattr(zp);
ASSERT(err == 0);
if (attrzp) {
err = zfs_acl_chown_setattr(attrzp);
ASSERT(err == 0);
}
}
if (mask & AT_MODE) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
&new_mode, sizeof (new_mode));
zp->z_mode = new_mode;
ASSERT3U((uintptr_t)aclp, !=, 0);
err = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT0(err);
if (zp->z_acl_cached)
zfs_acl_free(zp->z_acl_cached);
zp->z_acl_cached = aclp;
aclp = NULL;
}
if (mask & AT_ATIME) {
ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
&zp->z_atime, sizeof (zp->z_atime));
}
if (mask & AT_MTIME) {
ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
mtime, sizeof (mtime));
}
/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
if (mask & AT_SIZE && !(mask & AT_MTIME)) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
NULL, mtime, sizeof (mtime));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
&ctime, sizeof (ctime));
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
} else if (mask != 0) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
&ctime, sizeof (ctime));
zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
B_TRUE);
if (attrzp) {
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_CTIME(zfsvfs), NULL,
&ctime, sizeof (ctime));
zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
mtime, ctime, B_TRUE);
}
}
/*
* Do this after setting timestamps to prevent timestamp
* update from toggling bit
*/
if (xoap && (mask & AT_XVATTR)) {
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
xoap->xoa_createtime = vap->va_birthtime;
/*
* restore trimmed off masks
* so that return masks can be set for caller.
*/
if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
XVA_SET_REQ(xvap, XAT_APPENDONLY);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
XVA_SET_REQ(xvap, XAT_NOUNLINK);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
XVA_SET_REQ(xvap, XAT_IMMUTABLE);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
XVA_SET_REQ(xvap, XAT_NODUMP);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
ASSERT(vp->v_type == VREG);
zfs_xvattr_set(zp, xvap, tx);
}
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
if (mask != 0)
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_exit(&zp->z_acl_lock);
if (attrzp) {
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_exit(&attrzp->z_acl_lock);
}
out:
if (err == 0 && attrzp) {
err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
xattr_count, tx);
ASSERT(err2 == 0);
}
if (attrzp)
vput(ZTOV(attrzp));
if (aclp)
zfs_acl_free(aclp);
if (fuidp) {
zfs_fuid_info_free(fuidp);
fuidp = NULL;
}
if (err) {
dmu_tx_abort(tx);
} else {
err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
dmu_tx_commit(tx);
}
out2:
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (err);
}
/*
* We acquire all but fdvp locks using non-blocking acquisitions. If we
* fail to acquire any lock in the path we will drop all held locks,
* acquire the new lock in a blocking fashion, and then release it and
* restart the rename. This acquire/release step ensures that we do not
* spin on a lock waiting for release. On error release all vnode locks
* and decrement references the way tmpfs_rename() would do.
*/
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
struct vnode *tdvp, struct vnode **tvpp,
const struct componentname *scnp, const struct componentname *tcnp)
{
zfsvfs_t *zfsvfs;
struct vnode *nvp, *svp, *tvp;
znode_t *sdzp, *tdzp, *szp, *tzp;
const char *snm = scnp->cn_nameptr;
const char *tnm = tcnp->cn_nameptr;
int error;
VOP_UNLOCK(tdvp, 0);
if (*tvpp != NULL && *tvpp != tdvp)
VOP_UNLOCK(*tvpp, 0);
relock:
error = vn_lock(sdvp, LK_EXCLUSIVE);
if (error)
goto out;
sdzp = VTOZ(sdvp);
error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
if (error != 0) {
VOP_UNLOCK(sdvp, 0);
if (error != EBUSY)
goto out;
error = vn_lock(tdvp, LK_EXCLUSIVE);
if (error)
goto out;
VOP_UNLOCK(tdvp, 0);
goto relock;
}
tdzp = VTOZ(tdvp);
/*
* Before using sdzp and tdzp we must ensure that they are live.
* As a porting legacy from illumos we have two things to worry
* about. One is typical for FreeBSD and it is that the vnode is
* not reclaimed (doomed). The other is that the znode is live.
* The current code can invalidate the znode without acquiring the
* corresponding vnode lock if the object represented by the znode
* and vnode is no longer valid after a rollback or receive operation.
* z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
* that protects the znodes from the invalidation.
*/
zfsvfs = sdzp->z_zfsvfs;
ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
ZFS_ENTER(zfsvfs);
/*
* We can not use ZFS_VERIFY_ZP() here because it could directly return
* bypassing the cleanup code in the case of an error.
*/
if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
ZFS_EXIT(zfsvfs);
VOP_UNLOCK(sdvp, 0);
VOP_UNLOCK(tdvp, 0);
error = SET_ERROR(EIO);
goto out;
}
/*
* Re-resolve svp to be certain it still exists and fetch the
* correct vnode.
*/
error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
if (error != 0) {
/* Source entry invalid or not there. */
ZFS_EXIT(zfsvfs);
VOP_UNLOCK(sdvp, 0);
VOP_UNLOCK(tdvp, 0);
if ((scnp->cn_flags & ISDOTDOT) != 0 ||
(scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
error = SET_ERROR(EINVAL);
goto out;
}
svp = ZTOV(szp);
/*
* Re-resolve tvp, if it disappeared we just carry on.
*/
error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
if (error != 0) {
ZFS_EXIT(zfsvfs);
VOP_UNLOCK(sdvp, 0);
VOP_UNLOCK(tdvp, 0);
vrele(svp);
if ((tcnp->cn_flags & ISDOTDOT) != 0)
error = SET_ERROR(EINVAL);
goto out;
}
if (tzp != NULL)
tvp = ZTOV(tzp);
else
tvp = NULL;
/*
* At present the vnode locks must be acquired before z_teardown_lock,
* although it would be more logical to use the opposite order.
*/
ZFS_EXIT(zfsvfs);
/*
* Now try acquire locks on svp and tvp.
*/
nvp = svp;
error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
if (error != 0) {
VOP_UNLOCK(sdvp, 0);
VOP_UNLOCK(tdvp, 0);
if (tvp != NULL)
vrele(tvp);
if (error != EBUSY) {
vrele(nvp);
goto out;
}
error = vn_lock(nvp, LK_EXCLUSIVE);
if (error != 0) {
vrele(nvp);
goto out;
}
VOP_UNLOCK(nvp, 0);
/*
* Concurrent rename race.
* XXX ?
*/
if (nvp == tdvp) {
vrele(nvp);
error = SET_ERROR(EINVAL);
goto out;
}
vrele(*svpp);
*svpp = nvp;
goto relock;
}
vrele(*svpp);
*svpp = nvp;
if (*tvpp != NULL)
vrele(*tvpp);
*tvpp = NULL;
if (tvp != NULL) {
nvp = tvp;
error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
if (error != 0) {
VOP_UNLOCK(sdvp, 0);
VOP_UNLOCK(tdvp, 0);
VOP_UNLOCK(*svpp, 0);
if (error != EBUSY) {
vrele(nvp);
goto out;
}
error = vn_lock(nvp, LK_EXCLUSIVE);
if (error != 0) {
vrele(nvp);
goto out;
}
vput(nvp);
goto relock;
}
*tvpp = nvp;
}
return (0);
out:
return (error);
}
/*
* Note that we must use VRELE_ASYNC in this function as it walks
* up the directory tree and vrele may need to acquire an exclusive
* lock if a last reference to a vnode is dropped.
*/
static int
zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
{
zfsvfs_t *zfsvfs;
znode_t *zp, *zp1;
uint64_t parent;
int error;
zfsvfs = tdzp->z_zfsvfs;
if (tdzp == szp)
return (SET_ERROR(EINVAL));
if (tdzp == sdzp)
return (0);
if (tdzp->z_id == zfsvfs->z_root)
return (0);
zp = tdzp;
for (;;) {
ASSERT(!zp->z_unlinked);
if ((error = sa_lookup(zp->z_sa_hdl,
SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
break;
if (parent == szp->z_id) {
error = SET_ERROR(EINVAL);
break;
}
if (parent == zfsvfs->z_root)
break;
if (parent == sdzp->z_id)
break;
error = zfs_zget(zfsvfs, parent, &zp1);
if (error != 0)
break;
if (zp != tdzp)
VN_RELE_ASYNC(ZTOV(zp),
dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
zp = zp1;
}
if (error == ENOTDIR)
panic("checkpath: .. not a directory\n");
if (zp != tdzp)
VN_RELE_ASYNC(ZTOV(zp),
dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
return (error);
}
/*
* Move an entry from the provided source directory to the target
* directory. Change the entry name as indicated.
*
* IN: sdvp - Source directory containing the "old entry".
* snm - Old entry name.
* tdvp - Target directory to contain the "new entry".
* tnm - New entry name.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* sdvp,tdvp - ctime|mtime updated
*/
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
cred_t *cr)
{
zfsvfs_t *zfsvfs;
znode_t *sdzp, *tdzp, *szp, *tzp;
zilog_t *zilog = NULL;
dmu_tx_t *tx;
char *snm = scnp->cn_nameptr;
char *tnm = tcnp->cn_nameptr;
int error = 0;
/* Reject renames across filesystems. */
if ((*svpp)->v_mount != tdvp->v_mount ||
((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
error = SET_ERROR(EXDEV);
goto out;
}
if (zfsctl_is_node(tdvp)) {
error = SET_ERROR(EXDEV);
goto out;
}
/*
* Lock all four vnodes to ensure safety and semantics of renaming.
*/
error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
if (error != 0) {
/* no vnodes are locked in the case of error here */
return (error);
}
tdzp = VTOZ(tdvp);
sdzp = VTOZ(sdvp);
zfsvfs = tdzp->z_zfsvfs;
zilog = zfsvfs->z_log;
/*
* After we re-enter ZFS_ENTER() we will have to revalidate all
* znodes involved.
*/
ZFS_ENTER(zfsvfs);
if (zfsvfs->z_utf8 && u8_validate(tnm,
strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
error = SET_ERROR(EILSEQ);
goto unlockout;
}
/* If source and target are the same file, there is nothing to do. */
if ((*svpp) == (*tvpp)) {
error = 0;
goto unlockout;
}
if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
(*tvpp)->v_mountedhere != NULL)) {
error = SET_ERROR(EXDEV);
goto unlockout;
}
/*
* We can not use ZFS_VERIFY_ZP() here because it could directly return
* bypassing the cleanup code in the case of an error.
*/
if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
error = SET_ERROR(EIO);
goto unlockout;
}
szp = VTOZ(*svpp);
tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
error = SET_ERROR(EIO);
goto unlockout;
}
/*
* This is to prevent the creation of links into attribute space
* by renaming a linked file into/outof an attribute directory.
* See the comment in zfs_link() for why this is considered bad.
*/
if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
error = SET_ERROR(EINVAL);
goto unlockout;
}
/*
* Must have write access at the source to remove the old entry
* and write access at the target to create the new entry.
* Note that if target and source are the same, this can be
* done in a single check.
*/
if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
goto unlockout;
if ((*svpp)->v_type == VDIR) {
/*
* Avoid ".", "..", and aliases of "." for obvious reasons.
*/
if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
sdzp == szp ||
(scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
error = EINVAL;
goto unlockout;
}
/*
* Check to make sure rename is valid.
* Can't do a move like this: /usr/a/b to /usr/a/b/c/d
*/
if (error = zfs_rename_check(szp, sdzp, tdzp))
goto unlockout;
}
/*
* Does target exist?
*/
if (tzp) {
/*
* Source and target must be the same type.
*/
if ((*svpp)->v_type == VDIR) {
if ((*tvpp)->v_type != VDIR) {
error = SET_ERROR(ENOTDIR);
goto unlockout;
} else {
cache_purge(tdvp);
if (sdvp != tdvp)
cache_purge(sdvp);
}
} else {
if ((*tvpp)->v_type == VDIR) {
error = SET_ERROR(EISDIR);
goto unlockout;
}
}
}
vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
if (tzp)
vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
/*
* notify the target directory if it is not the same
* as source directory.
*/
if (tdvp != sdvp) {
vnevent_rename_dest_dir(tdvp, ct);
}
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
if (sdzp != tdzp) {
dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, tdzp);
}
if (tzp) {
dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, tzp);
}
zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
goto unlockout;
}
if (tzp) /* Attempt to remove the existing target */
error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
if (error == 0) {
error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
if (error == 0) {
szp->z_pflags |= ZFS_AV_MODIFIED;
error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
(void *)&szp->z_pflags, sizeof (uint64_t), tx);
ASSERT0(error);
error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
NULL);
if (error == 0) {
zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
snm, tdzp, tnm, szp);
/*
* Update path information for the target vnode
*/
vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
} else {
/*
* At this point, we have successfully created
* the target name, but have failed to remove
* the source name. Since the create was done
* with the ZRENAMING flag, there are
* complications; for one, the link count is
* wrong. The easiest way to deal with this
* is to remove the newly created target, and
* return the original error. This must
* succeed; fortunately, it is very unlikely to
* fail, since we just created it.
*/
VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
ZRENAMING, NULL), ==, 0);
}
}
if (error == 0) {
cache_purge(*svpp);
if (*tvpp != NULL)
cache_purge(*tvpp);
cache_purge_negative(tdvp);
}
}
dmu_tx_commit(tx);
unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
ZFS_EXIT(zfsvfs);
VOP_UNLOCK(*svpp, 0);
VOP_UNLOCK(sdvp, 0);
out: /* original two vnodes are locked */
if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
if (*tvpp != NULL)
VOP_UNLOCK(*tvpp, 0);
if (tdvp != *tvpp)
VOP_UNLOCK(tdvp, 0);
return (error);
}
/*
* Insert the indicated symbolic reference entry into the directory.
*
* IN: dvp - Directory to contain new symbolic link.
* link - Name for new symlink entry.
* vap - Attributes of new entry.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated
*/
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
cred_t *cr, kthread_t *td)
{
znode_t *zp, *dzp = VTOZ(dvp);
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t len = strlen(link);
int error;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
uint64_t txtype = TX_SYMLINK;
int flags = 0;
ASSERT(vap->va_type == VLNK);
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
zilog = zfsvfs->z_log;
if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
if (len > MAXPATHLEN) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(ENAMETOOLONG));
}
if ((error = zfs_acl_ids_create(dzp, 0,
vap, cr, NULL, &acl_ids)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Attempt to lock directory; fail if entry already exists.
*/
error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
if (error) {
zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
return (error);
}
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EDQUOT));
}
getnewvnode_reserve(1);
tx = dmu_tx_create(zfsvfs->z_os);
fuid_dirtied = zfsvfs->z_fuid_dirty;
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE + len);
dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
acl_ids.z_aclp->z_acl_bytes);
}
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Create a new object for the symlink.
* for version 4 ZPL datsets the symlink will be an SA attribute
*/
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
if (zp->z_is_sa)
error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
link, len, tx);
else
zfs_sa_symlink(zp, link, len, tx);
zp->z_size = len;
(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
&zp->z_size, sizeof (zp->z_size), tx);
/*
* Insert the new object into the directory.
*/
(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
*vpp = ZTOV(zp);
zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
getnewvnode_drop_reserve();
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Return, in the buffer contained in the provided uio structure,
* the symbolic path referred to by vp.
*
* IN: vp - vnode of symbolic link.
* uio - structure to contain the link path.
* cr - credentials of caller.
* ct - caller context
*
* OUT: uio - structure containing the link path.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - atime updated
*/
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
if (zp->z_is_sa)
error = sa_lookup_uio(zp->z_sa_hdl,
SA_ZPL_SYMLINK(zfsvfs), uio);
else
error = zfs_sa_readlink(zp, uio);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Insert a new entry into directory tdvp referencing svp.
*
* IN: tdvp - Directory to contain new entry.
* svp - vnode of new entry.
* name - name of new entry.
* cr - credentials of caller.
* ct - caller context
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* tdvp - ctime|mtime updated
* svp - ctime updated
*/
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
caller_context_t *ct, int flags)
{
znode_t *dzp = VTOZ(tdvp);
znode_t *tzp, *szp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
dmu_tx_t *tx;
int error;
uint64_t parent;
uid_t owner;
ASSERT(tdvp->v_type == VDIR);
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
zilog = zfsvfs->z_log;
/*
* POSIX dictates that we return EPERM here.
* Better choices include ENOTSUP or EISDIR.
*/
if (svp->v_type == VDIR) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
szp = VTOZ(svp);
ZFS_VERIFY_ZP(szp);
if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
/* Prevent links to .zfs/shares files */
if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
&parent, sizeof (uint64_t))) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
if (parent == zfsvfs->z_shares_dir) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
if (zfsvfs->z_utf8 && u8_validate(name,
strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
/*
* We do not support links between attributes and non-attributes
* because of the potential security risk of creating links
* into "normal" file space in order to circumvent restrictions
* imposed in attribute space.
*/
if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Attempt to lock directory; fail if entry already exists.
*/
error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
if (error) {
ZFS_EXIT(zfsvfs);
return (error);
}
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
zfs_sa_upgrade_txholds(tx, szp);
zfs_sa_upgrade_txholds(tx, dzp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
error = zfs_link_create(dzp, name, szp, tx, 0);
if (error == 0) {
uint64_t txtype = TX_LINK;
zfs_log_link(zilog, tx, txtype, dzp, szp, name);
}
dmu_tx_commit(tx);
if (error == 0) {
vnevent_link(svp, ct);
}
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
}
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
if (zp->z_sa_hdl == NULL) {
/*
* The fs has been unmounted, or we did a
* suspend/resume and this file no longer exists.
*/
rw_exit(&zfsvfs->z_teardown_inactive_lock);
vrecycle(vp);
return;
}
if (zp->z_unlinked) {
/*
* Fast path to recycle a vnode of a removed file.
*/
rw_exit(&zfsvfs->z_teardown_inactive_lock);
vrecycle(vp);
return;
}
if (zp->z_atime_dirty && zp->z_unlinked == 0) {
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
(void *)&zp->z_atime, sizeof (zp->z_atime), tx);
zp->z_atime_dirty = 0;
dmu_tx_commit(tx);
}
}
rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
uint32_t gen;
uint64_t gen64;
uint64_t object = zp->z_id;
zfid_short_t *zfid;
int size, i, error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
&gen64, sizeof (uint64_t))) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
gen = (uint32_t)gen64;
size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
#ifdef illumos
if (fidp->fid_len < size) {
fidp->fid_len = size;
ZFS_EXIT(zfsvfs);
return (SET_ERROR(ENOSPC));
}
#else
fidp->fid_len = size;
#endif
zfid = (zfid_short_t *)fidp;
zfid->zf_len = size;
for (i = 0; i < sizeof (zfid->zf_object); i++)
zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
/* Must have a non-zero generation number to distinguish from .zfs */
if (gen == 0)
gen = 1;
for (i = 0; i < sizeof (zfid->zf_gen); i++)
zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
if (size == LONG_FID_LEN) {
uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
zfid_long_t *zlfid;
zlfid = (zfid_long_t *)fidp;
for (i = 0; i < sizeof (zlfid->zf_setid); i++)
zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
/* XXX - this should be the generation number for the objset */
for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
zlfid->zf_setgen[i] = 0;
}
ZFS_EXIT(zfsvfs);
return (0);
}
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
caller_context_t *ct)
{
znode_t *zp, *xzp;
zfsvfs_t *zfsvfs;
int error;
switch (cmd) {
case _PC_LINK_MAX:
*valp = INT_MAX;
return (0);
case _PC_FILESIZEBITS:
*valp = 64;
return (0);
#ifdef illumos
case _PC_XATTR_EXISTS:
zp = VTOZ(vp);
zfsvfs = zp->z_zfsvfs;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
*valp = 0;
error = zfs_dirent_lookup(zp, "", &xzp,
ZXATTR | ZEXISTS | ZSHARED);
if (error == 0) {
if (!zfs_dirempty(xzp))
*valp = 1;
vrele(ZTOV(xzp));
} else if (error == ENOENT) {
/*
* If there aren't extended attributes, it's the
* same as having zero of them.
*/
error = 0;
}
ZFS_EXIT(zfsvfs);
return (error);
case _PC_SATTR_ENABLED:
case _PC_SATTR_EXISTS:
*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
(vp->v_type == VREG || vp->v_type == VDIR);
return (0);
case _PC_ACCESS_FILTERING:
*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
vp->v_type == VDIR;
return (0);
case _PC_ACL_ENABLED:
*valp = _ACL_ACE_ENABLED;
return (0);
#endif /* illumos */
case _PC_MIN_HOLE_SIZE:
*valp = (int)SPA_MINBLOCKSIZE;
return (0);
#ifdef illumos
case _PC_TIMESTAMP_RESOLUTION:
/* nanosecond timestamp resolution */
*valp = 1L;
return (0);
#endif
case _PC_ACL_EXTENDED:
*valp = 0;
return (0);
case _PC_ACL_NFS4:
*valp = 1;
return (0);
case _PC_ACL_PATH_MAX:
*valp = ACL_MAX_ENTRIES;
return (0);
default:
return (EOPNOTSUPP);
}
}
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
error = zfs_getacl(zp, vsecp, skipaclchk, cr);
ZFS_EXIT(zfsvfs);
return (error);
}
/*ARGSUSED*/
int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
zilog_t *zilog = zfsvfs->z_log;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
error = zfs_setacl(zp, vsecp, skipaclchk, cr);
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
}
static int
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
int *rahead)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os = zp->z_zfsvfs->z_os;
rl_t *rl;
vm_object_t object;
off_t start, end, obj_size;
uint_t blksz;
int pgsin_b, pgsin_a;
int error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
start = IDX_TO_OFF(ma[0]->pindex);
end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
/*
* Lock a range covering all required and optional pages.
* Note that we need to handle the case of the block size growing.
*/
for (;;) {
blksz = zp->z_blksz;
rl = zfs_range_lock(zp, rounddown(start, blksz),
roundup(end, blksz) - rounddown(start, blksz), RL_READER);
if (blksz == zp->z_blksz)
break;
zfs_range_unlock(rl);
}
object = ma[0]->object;
zfs_vmobject_wlock(object);
obj_size = object->un_pager.vnp.vnp_size;
zfs_vmobject_wunlock(object);
if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (zfs_vm_pagerret_bad);
}
pgsin_b = 0;
if (rbehind != NULL) {
pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
pgsin_b = MIN(*rbehind, pgsin_b);
}
pgsin_a = 0;
if (rahead != NULL) {
pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
pgsin_a = MIN(*rahead, pgsin_a);
}
/*
* NB: we need to pass the exact byte size of the data that we expect
* to read after accounting for the file size. This is required because
* ZFS will panic if we request DMU to read beyond the end of the last
* allocated block.
*/
error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
MIN(end, obj_size) - (end - PAGE_SIZE));
zfs_range_unlock(rl);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
ZFS_EXIT(zfsvfs);
if (error != 0)
return (zfs_vm_pagerret_error);
PCPU_INC(cnt.v_vnodein);
PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a);
if (rbehind != NULL)
*rbehind = pgsin_b;
if (rahead != NULL)
*rahead = pgsin_a;
return (zfs_vm_pagerret_ok);
}
static int
zfs_freebsd_getpages(ap)
struct vop_getpages_args /* {
struct vnode *a_vp;
vm_page_t *a_m;
int a_count;
int *a_rbehind;
int *a_rahead;
} */ *ap;
{
return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
ap->a_rahead));
}
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
int *rtvals)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
rl_t *rl;
dmu_tx_t *tx;
struct sf_buf *sf;
vm_object_t object;
vm_page_t m;
caddr_t va;
size_t tocopy;
size_t lo_len;
vm_ooffset_t lo_off;
vm_ooffset_t off;
uint_t blksz;
int ncount;
int pcount;
int err;
int i;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
object = vp->v_object;
pcount = btoc(len);
ncount = pcount;
KASSERT(ma[0]->object == object, ("mismatching object"));
KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
for (i = 0; i < pcount; i++)
rtvals[i] = zfs_vm_pagerret_error;
off = IDX_TO_OFF(ma[0]->pindex);
blksz = zp->z_blksz;
lo_off = rounddown(off, blksz);
lo_len = roundup(len + (off - lo_off), blksz);
rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
zfs_vmobject_wlock(object);
if (len + off > object->un_pager.vnp.vnp_size) {
if (object->un_pager.vnp.vnp_size > off) {
int pgoff;
len = object->un_pager.vnp.vnp_size - off;
ncount = btoc(len);
if ((pgoff = (int)len & PAGE_MASK) != 0) {
/*
* If the object is locked and the following
* conditions hold, then the page's dirty
* field cannot be concurrently changed by a
* pmap operation.
*/
m = ma[ncount - 1];
vm_page_assert_sbusied(m);
KASSERT(!pmap_page_is_write_mapped(m),
("zfs_putpages: page %p is not read-only", m));
vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
pgoff);
}
} else {
len = 0;
ncount = 0;
}
if (ncount < pcount) {
for (i = ncount; i < pcount; i++) {
rtvals[i] = zfs_vm_pagerret_bad;
}
}
}
zfs_vmobject_wunlock(object);
if (ncount == 0)
goto out;
if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
goto out;
}
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_write(tx, zp->z_id, off, len);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
goto out;
}
if (zp->z_blksz < PAGE_SIZE) {
for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
va = zfs_map_page(ma[i], &sf);
dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
zfs_unmap_page(sf);
}
} else {
err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
}
if (err == 0) {
uint64_t mtime[2], ctime[2];
sa_bulk_attr_t bulk[3];
int count = 0;
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
&mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
&ctime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
&zp->z_pflags, 8);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
ASSERT0(err);
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
zfs_vmobject_wlock(object);
for (i = 0; i < ncount; i++) {
rtvals[i] = zfs_vm_pagerret_ok;
vm_page_undirty(ma[i]);
}
zfs_vmobject_wunlock(object);
PCPU_INC(cnt.v_vnodeout);
PCPU_ADD(cnt.v_vnodepgsout, ncount);
}
dmu_tx_commit(tx);
out:
zfs_range_unlock(rl);
if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zfsvfs->z_log, zp->z_id);
ZFS_EXIT(zfsvfs);
return (rtvals[0]);
}
int
zfs_freebsd_putpages(ap)
struct vop_putpages_args /* {
struct vnode *a_vp;
vm_page_t *a_m;
int a_count;
int a_sync;
int *a_rtvals;
} */ *ap;
{
return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
ap->a_rtvals));
}
static int
zfs_freebsd_bmap(ap)
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct bufobj **a_bop;
daddr_t *a_bnp;
int *a_runp;
int *a_runb;
} */ *ap;
{
if (ap->a_bop != NULL)
*ap->a_bop = &ap->a_vp->v_bufobj;
if (ap->a_bnp != NULL)
*ap->a_bnp = ap->a_bn;
if (ap->a_runp != NULL)
*ap->a_runp = 0;
if (ap->a_runb != NULL)
*ap->a_runb = 0;
return (0);
}
static int
zfs_freebsd_open(ap)
struct vop_open_args /* {
struct vnode *a_vp;
int a_mode;
struct ucred *a_cred;
struct thread *a_td;
} */ *ap;
{
vnode_t *vp = ap->a_vp;
znode_t *zp = VTOZ(vp);
int error;
error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
if (error == 0)
vnode_create_vobject(vp, zp->z_size, ap->a_td);
return (error);
}
static int
zfs_freebsd_close(ap)
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
struct ucred *a_cred;
struct thread *a_td;
} */ *ap;
{
return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
}
static int
zfs_freebsd_ioctl(ap)
struct vop_ioctl_args /* {
struct vnode *a_vp;
u_long a_command;
caddr_t a_data;
int a_fflag;
struct ucred *cred;
struct thread *td;
} */ *ap;
{
return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
ap->a_fflag, ap->a_cred, NULL, NULL));
}
static int
ioflags(int ioflags)
{
int flags = 0;
if (ioflags & IO_APPEND)
flags |= FAPPEND;
if (ioflags & IO_NDELAY)
flags |= FNONBLOCK;
if (ioflags & IO_SYNC)
flags |= (FSYNC | FDSYNC | FRSYNC);
return (flags);
}
static int
zfs_freebsd_read(ap)
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
struct ucred *a_cred;
} */ *ap;
{
return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
ap->a_cred, NULL));
}
static int
zfs_freebsd_write(ap)
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
struct ucred *a_cred;
} */ *ap;
{
return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
ap->a_cred, NULL));
}
static int
zfs_freebsd_access(ap)
struct vop_access_args /* {
struct vnode *a_vp;
accmode_t a_accmode;
struct ucred *a_cred;
struct thread *a_td;
} */ *ap;
{
vnode_t *vp = ap->a_vp;
znode_t *zp = VTOZ(vp);
accmode_t accmode;
int error = 0;
/*
* ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
*/
accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
if (accmode != 0)
error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
/*
* VADMIN has to be handled by vaccess().
*/
if (error == 0) {
accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
if (accmode != 0) {
error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
zp->z_gid, accmode, ap->a_cred, NULL);
}
}
/*
* For VEXEC, ensure that at least one execute bit is set for
* non-directories.
*/
if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
(zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
error = EACCES;
}
return (error);
}
static int
zfs_freebsd_lookup(ap)
struct vop_lookup_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap;
{
struct componentname *cnp = ap->a_cnp;
char nm[NAME_MAX + 1];
ASSERT(cnp->cn_namelen < sizeof(nm));
strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
cnp->cn_cred, cnp->cn_thread, 0));
}
static int
zfs_cache_lookup(ap)
struct vop_lookup_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap;
{
zfsvfs_t *zfsvfs;
zfsvfs = ap->a_dvp->v_mount->mnt_data;
if (zfsvfs->z_use_namecache)
return (vfs_cache_lookup(ap));
else
return (zfs_freebsd_lookup(ap));
}
static int
zfs_freebsd_create(ap)
struct vop_create_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
struct vattr *a_vap;
} */ *ap;
{
zfsvfs_t *zfsvfs;
struct componentname *cnp = ap->a_cnp;
vattr_t *vap = ap->a_vap;
int error, mode;
ASSERT(cnp->cn_flags & SAVENAME);
vattr_init_mask(vap);
mode = vap->va_mode & ALLPERMS;
zfsvfs = ap->a_dvp->v_mount->mnt_data;
error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
if (zfsvfs->z_use_namecache &&
error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
return (error);
}
static int
zfs_freebsd_remove(ap)
struct vop_remove_args /* {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
} */ *ap;
{
ASSERT(ap->a_cnp->cn_flags & SAVENAME);
return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
ap->a_cnp->cn_cred));
}
static int
zfs_freebsd_mkdir(ap)
struct vop_mkdir_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
struct vattr *a_vap;
} */ *ap;
{
vattr_t *vap = ap->a_vap;
ASSERT(ap->a_cnp->cn_flags & SAVENAME);
vattr_init_mask(vap);
return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
ap->a_cnp->cn_cred));
}
static int
zfs_freebsd_rmdir(ap)
struct vop_rmdir_args /* {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
} */ *ap;
{
struct componentname *cnp = ap->a_cnp;
ASSERT(cnp->cn_flags & SAVENAME);
return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
}
static int
zfs_freebsd_readdir(ap)
struct vop_readdir_args /* {
struct vnode *a_vp;
struct uio *a_uio;
struct ucred *a_cred;
int *a_eofflag;
int *a_ncookies;
u_long **a_cookies;
} */ *ap;
{
return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
ap->a_ncookies, ap->a_cookies));
}
static int
zfs_freebsd_fsync(ap)
struct vop_fsync_args /* {
struct vnode *a_vp;
int a_waitfor;
struct thread *a_td;
} */ *ap;
{
vop_stdfsync(ap);
return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
}
static int
zfs_freebsd_getattr(ap)
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
} */ *ap;
{
vattr_t *vap = ap->a_vap;
xvattr_t xvap;
u_long fflags = 0;
int error;
xva_init(&xvap);
xvap.xva_vattr = *vap;
xvap.xva_vattr.va_mask |= AT_XVATTR;
/* Convert chflags into ZFS-type flags. */
/* XXX: what about SF_SETTABLE?. */
XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
XVA_SET_REQ(&xvap, XAT_APPENDONLY);
XVA_SET_REQ(&xvap, XAT_NOUNLINK);
XVA_SET_REQ(&xvap, XAT_NODUMP);
XVA_SET_REQ(&xvap, XAT_READONLY);
XVA_SET_REQ(&xvap, XAT_ARCHIVE);
XVA_SET_REQ(&xvap, XAT_SYSTEM);
XVA_SET_REQ(&xvap, XAT_HIDDEN);
XVA_SET_REQ(&xvap, XAT_REPARSE);
XVA_SET_REQ(&xvap, XAT_OFFLINE);
XVA_SET_REQ(&xvap, XAT_SPARSE);
error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
if (error != 0)
return (error);
/* Convert ZFS xattr into chflags. */
#define FLAG_CHECK(fflag, xflag, xfield) do { \
if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
fflags |= (fflag); \
} while (0)
FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
xvap.xva_xoptattrs.xoa_immutable);
FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
xvap.xva_xoptattrs.xoa_appendonly);
FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
xvap.xva_xoptattrs.xoa_nounlink);
FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
xvap.xva_xoptattrs.xoa_archive);
FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
xvap.xva_xoptattrs.xoa_nodump);
FLAG_CHECK(UF_READONLY, XAT_READONLY,
xvap.xva_xoptattrs.xoa_readonly);
FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
xvap.xva_xoptattrs.xoa_system);
FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
xvap.xva_xoptattrs.xoa_hidden);
FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
xvap.xva_xoptattrs.xoa_reparse);
FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
xvap.xva_xoptattrs.xoa_offline);
FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
xvap.xva_xoptattrs.xoa_sparse);
#undef FLAG_CHECK
*vap = xvap.xva_vattr;
vap->va_flags = fflags;
return (0);
}
static int
zfs_freebsd_setattr(ap)
struct vop_setattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
} */ *ap;
{
vnode_t *vp = ap->a_vp;
vattr_t *vap = ap->a_vap;
cred_t *cred = ap->a_cred;
xvattr_t xvap;
u_long fflags;
uint64_t zflags;
vattr_init_mask(vap);
vap->va_mask &= ~AT_NOSET;
xva_init(&xvap);
xvap.xva_vattr = *vap;
zflags = VTOZ(vp)->z_pflags;
if (vap->va_flags != VNOVAL) {
zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
int error;
if (zfsvfs->z_use_fuids == B_FALSE)
return (EOPNOTSUPP);
fflags = vap->va_flags;
/*
* XXX KDM
* We need to figure out whether it makes sense to allow
* UF_REPARSE through, since we don't really have other
* facilities to handle reparse points and zfs_setattr()
* doesn't currently allow setting that attribute anyway.
*/
if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
UF_OFFLINE|UF_SPARSE)) != 0)
return (EOPNOTSUPP);
/*
* Unprivileged processes are not permitted to unset system
* flags, or modify flags if any system flags are set.
* Privileged non-jail processes may not modify system flags
* if securelevel > 0 and any existing system flags are set.
* Privileged jail processes behave like privileged non-jail
* processes if the security.jail.chflags_allowed sysctl is
* is non-zero; otherwise, they behave like unprivileged
* processes.
*/
if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
if (zflags &
(ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
error = securelevel_gt(cred, 0);
if (error != 0)
return (error);
}
} else {
/*
* Callers may only modify the file flags on objects they
* have VADMIN rights for.
*/
if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
return (error);
if (zflags &
(ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
return (EPERM);
}
if (fflags &
(SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
return (EPERM);
}
}
#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
if (((fflags & (fflag)) && !(zflags & (zflag))) || \
((zflags & (zflag)) && !(fflags & (fflag)))) { \
XVA_SET_REQ(&xvap, (xflag)); \
(xfield) = ((fflags & (fflag)) != 0); \
} \
} while (0)
/* Convert chflags into ZFS-type flags. */
/* XXX: what about SF_SETTABLE?. */
FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
xvap.xva_xoptattrs.xoa_immutable);
FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
xvap.xva_xoptattrs.xoa_appendonly);
FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
xvap.xva_xoptattrs.xoa_nounlink);
FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
xvap.xva_xoptattrs.xoa_archive);
FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
xvap.xva_xoptattrs.xoa_nodump);
FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
xvap.xva_xoptattrs.xoa_readonly);
FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
xvap.xva_xoptattrs.xoa_system);
FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
xvap.xva_xoptattrs.xoa_hidden);
FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
xvap.xva_xoptattrs.xoa_hidden);
FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
xvap.xva_xoptattrs.xoa_offline);
FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
xvap.xva_xoptattrs.xoa_sparse);
#undef FLAG_CHANGE
}
if (vap->va_birthtime.tv_sec != VNOVAL) {
xvap.xva_vattr.va_mask |= AT_XVATTR;
XVA_SET_REQ(&xvap, XAT_CREATETIME);
}
return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
}
static int
zfs_freebsd_rename(ap)
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap;
{
vnode_t *fdvp = ap->a_fdvp;
vnode_t *fvp = ap->a_fvp;
vnode_t *tdvp = ap->a_tdvp;
vnode_t *tvp = ap->a_tvp;
int error;
ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
ap->a_tcnp, ap->a_fcnp->cn_cred);
vrele(fdvp);
vrele(fvp);
vrele(tdvp);
if (tvp != NULL)
vrele(tvp);
return (error);
}
static int
zfs_freebsd_symlink(ap)
struct vop_symlink_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
struct vattr *a_vap;
char *a_target;
} */ *ap;
{
struct componentname *cnp = ap->a_cnp;
vattr_t *vap = ap->a_vap;
ASSERT(cnp->cn_flags & SAVENAME);
vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
vattr_init_mask(vap);
return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
ap->a_target, cnp->cn_cred, cnp->cn_thread));
}
static int
zfs_freebsd_readlink(ap)
struct vop_readlink_args /* {
struct vnode *a_vp;
struct uio *a_uio;
struct ucred *a_cred;
} */ *ap;
{
return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
}
static int
zfs_freebsd_link(ap)
struct vop_link_args /* {
struct vnode *a_tdvp;
struct vnode *a_vp;
struct componentname *a_cnp;
} */ *ap;
{
struct componentname *cnp = ap->a_cnp;
vnode_t *vp = ap->a_vp;
vnode_t *tdvp = ap->a_tdvp;
if (tdvp->v_mount != vp->v_mount)
return (EXDEV);
ASSERT(cnp->cn_flags & SAVENAME);
return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
}
static int
zfs_freebsd_inactive(ap)
struct vop_inactive_args /* {
struct vnode *a_vp;
struct thread *a_td;
} */ *ap;
{
vnode_t *vp = ap->a_vp;
zfs_inactive(vp, ap->a_td->td_ucred, NULL);
return (0);
}
static int
zfs_freebsd_reclaim(ap)
struct vop_reclaim_args /* {
struct vnode *a_vp;
struct thread *a_td;
} */ *ap;
{
vnode_t *vp = ap->a_vp;
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ASSERT(zp != NULL);
/* Destroy the vm object and flush associated pages. */
vnode_destroy_vobject(vp);
/*
* z_teardown_inactive_lock protects from a race with
* zfs_znode_dmu_fini in zfsvfs_teardown during
* force unmount.
*/
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
if (zp->z_sa_hdl == NULL)
zfs_znode_free(zp);
else
zfs_zinactive(zp);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
vp->v_data = NULL;
return (0);
}
static int
zfs_freebsd_fid(ap)
struct vop_fid_args /* {
struct vnode *a_vp;
struct fid *a_fid;
} */ *ap;
{
return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
}
static int
zfs_freebsd_pathconf(ap)
struct vop_pathconf_args /* {
struct vnode *a_vp;
int a_name;
register_t *a_retval;
} */ *ap;
{
ulong_t val;
int error;
error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
if (error == 0) {
*ap->a_retval = val;
return (error);
}
if (error != EOPNOTSUPP)
return (error);
switch (ap->a_name) {
case _PC_NAME_MAX:
*ap->a_retval = NAME_MAX;
return (0);
case _PC_PIPE_BUF:
if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
*ap->a_retval = PIPE_BUF;
return (0);
}
return (EINVAL);
default:
return (vop_stdpathconf(ap));
}
}
/*
* FreeBSD's extended attributes namespace defines file name prefix for ZFS'
* extended attribute name:
*
* NAMESPACE PREFIX
* system freebsd:system:
* user (none, can be used to access ZFS fsattr(5) attributes
* created on Solaris)
*/
static int
zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
size_t size)
{
const char *namespace, *prefix, *suffix;
/* We don't allow '/' character in attribute name. */
if (strchr(name, '/') != NULL)
return (EINVAL);
/* We don't allow attribute names that start with "freebsd:" string. */
if (strncmp(name, "freebsd:", 8) == 0)
return (EINVAL);
bzero(attrname, size);
switch (attrnamespace) {
case EXTATTR_NAMESPACE_USER:
#if 0
prefix = "freebsd:";
namespace = EXTATTR_NAMESPACE_USER_STRING;
suffix = ":";
#else
/*
* This is the default namespace by which we can access all
* attributes created on Solaris.
*/
prefix = namespace = suffix = "";
#endif
break;
case EXTATTR_NAMESPACE_SYSTEM:
prefix = "freebsd:";
namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
suffix = ":";
break;
case EXTATTR_NAMESPACE_EMPTY:
default:
return (EINVAL);
}
if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
name) >= size) {
return (ENAMETOOLONG);
}
return (0);
}
/*
* Vnode operating to retrieve a named extended attribute.
*/
static int
zfs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
struct thread *td = ap->a_td;
struct nameidata nd;
char attrname[255];
struct vattr va;
vnode_t *xvp = NULL, *vp;
int error, flags;
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VREAD);
if (error != 0)
return (error);
error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
sizeof(attrname));
if (error != 0)
return (error);
ZFS_ENTER(zfsvfs);
error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
LOOKUP_XATTR);
if (error != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
flags = FREAD;
NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
xvp, td);
error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error != 0) {
ZFS_EXIT(zfsvfs);
if (error == ENOENT)
error = ENOATTR;
return (error);
}
if (ap->a_size != NULL) {
error = VOP_GETATTR(vp, &va, ap->a_cred);
if (error == 0)
*ap->a_size = (size_t)va.va_size;
} else if (ap->a_uio != NULL)
error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
VOP_UNLOCK(vp, 0);
vn_close(vp, flags, ap->a_cred, td);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Vnode operation to remove a named attribute.
*/
int
zfs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
struct thread *td = ap->a_td;
struct nameidata nd;
char attrname[255];
struct vattr va;
vnode_t *xvp = NULL, *vp;
int error, flags;
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VWRITE);
if (error != 0)
return (error);
error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
sizeof(attrname));
if (error != 0)
return (error);
ZFS_ENTER(zfsvfs);
error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
LOOKUP_XATTR);
if (error != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
UIO_SYSSPACE, attrname, xvp, td);
error = namei(&nd);
vp = nd.ni_vp;
if (error != 0) {
ZFS_EXIT(zfsvfs);
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error == ENOENT)
error = ENOATTR;
return (error);
}
error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
if (vp == nd.ni_dvp)
vrele(vp);
else
vput(vp);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Vnode operation to set a named attribute.
*/
static int
zfs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
struct thread *td = ap->a_td;
struct nameidata nd;
char attrname[255];
struct vattr va;
vnode_t *xvp = NULL, *vp;
int error, flags;
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VWRITE);
if (error != 0)
return (error);
error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
sizeof(attrname));
if (error != 0)
return (error);
ZFS_ENTER(zfsvfs);
error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
LOOKUP_XATTR | CREATE_XATTR_DIR);
if (error != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
flags = FFLAGS(O_WRONLY | O_CREAT);
NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
xvp, td);
error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
VATTR_NULL(&va);
va.va_size = 0;
error = VOP_SETATTR(vp, &va, ap->a_cred);
if (error == 0)
VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
VOP_UNLOCK(vp, 0);
vn_close(vp, flags, ap->a_cred, td);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Vnode operation to retrieve extended attributes on a vnode.
*/
static int
zfs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
struct thread *td = ap->a_td;
struct nameidata nd;
char attrprefix[16];
u_char dirbuf[sizeof(struct dirent)];
struct dirent *dp;
struct iovec aiov;
struct uio auio, *uio = ap->a_uio;
size_t *sizep = ap->a_size;
size_t plen;
vnode_t *xvp = NULL, *vp;
int done, error, eof, pos;
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VREAD);
if (error != 0)
return (error);
error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
sizeof(attrprefix));
if (error != 0)
return (error);
plen = strlen(attrprefix);
ZFS_ENTER(zfsvfs);
if (sizep != NULL)
*sizep = 0;
error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
LOOKUP_XATTR);
if (error != 0) {
ZFS_EXIT(zfsvfs);
/*
* ENOATTR means that the EA directory does not yet exist,
* i.e. there are no extended attributes there.
*/
if (error == ENOATTR)
error = 0;
return (error);
}
NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
UIO_SYSSPACE, ".", xvp, td);
error = namei(&nd);
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_rw = UIO_READ;
auio.uio_offset = 0;
do {
u_char nlen;
aiov.iov_base = (void *)dirbuf;
aiov.iov_len = sizeof(dirbuf);
auio.uio_resid = sizeof(dirbuf);
error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
done = sizeof(dirbuf) - auio.uio_resid;
if (error != 0)
break;
for (pos = 0; pos < done;) {
dp = (struct dirent *)(dirbuf + pos);
pos += dp->d_reclen;
/*
* XXX: Temporarily we also accept DT_UNKNOWN, as this
* is what we get when attribute was created on Solaris.
*/
if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
continue;
if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
continue;
else if (strncmp(dp->d_name, attrprefix, plen) != 0)
continue;
nlen = dp->d_namlen - plen;
if (sizep != NULL)
*sizep += 1 + nlen;
else if (uio != NULL) {
/*
* Format of extattr name entry is one byte for
* length and the rest for name.
*/
error = uiomove(&nlen, 1, uio->uio_rw, uio);
if (error == 0) {
error = uiomove(dp->d_name + plen, nlen,
uio->uio_rw, uio);
}
if (error != 0)
break;
}
}
} while (!eof && error == 0);
vput(vp);
ZFS_EXIT(zfsvfs);
return (error);
}
int
zfs_freebsd_getacl(ap)
struct vop_getacl_args /* {
struct vnode *vp;
acl_type_t type;
struct acl *aclp;
struct ucred *cred;
struct thread *td;
} */ *ap;
{
int error;
vsecattr_t vsecattr;
if (ap->a_type != ACL_TYPE_NFS4)
return (EINVAL);
vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
return (error);
error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
if (vsecattr.vsa_aclentp != NULL)
kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
return (error);
}
int
zfs_freebsd_setacl(ap)
struct vop_setacl_args /* {
struct vnode *vp;
acl_type_t type;
struct acl *aclp;
struct ucred *cred;
struct thread *td;
} */ *ap;
{
int error;
vsecattr_t vsecattr;
int aclbsize; /* size of acl list in bytes */
aclent_t *aaclp;
if (ap->a_type != ACL_TYPE_NFS4)
return (EINVAL);
if (ap->a_aclp == NULL)
return (EINVAL);
if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
return (EINVAL);
/*
* With NFSv4 ACLs, chmod(2) may need to add additional entries,
* splitting every entry into two and appending "canonical six"
* entries at the end. Don't allow for setting an ACL that would
* cause chmod(2) to run out of ACL entries.
*/
if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
return (ENOSPC);
error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
if (error != 0)
return (error);
vsecattr.vsa_mask = VSA_ACE;
aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
aaclp = vsecattr.vsa_aclentp;
vsecattr.vsa_aclentsz = aclbsize;
aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
kmem_free(aaclp, aclbsize);
return (error);
}
int
zfs_freebsd_aclcheck(ap)
struct vop_aclcheck_args /* {
struct vnode *vp;
acl_type_t type;
struct acl *aclp;
struct ucred *cred;
struct thread *td;
} */ *ap;
{
return (EOPNOTSUPP);
}
static int
zfs_vptocnp(struct vop_vptocnp_args *ap)
{
vnode_t *covered_vp;
vnode_t *vp = ap->a_vp;;
zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
znode_t *zp = VTOZ(vp);
int ltype;
int error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
/*
* If we are a snapshot mounted under .zfs, run the operation
* on the covered vnode.
*/
if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
char name[MAXNAMLEN + 1];
znode_t *dzp;
size_t len;
error = zfs_znode_parent_and_name(zp, &dzp, name);
if (error == 0) {
len = strlen(name);
if (*ap->a_buflen < len)
error = SET_ERROR(ENOMEM);
}
if (error == 0) {
*ap->a_buflen -= len;
bcopy(name, ap->a_buf + *ap->a_buflen, len);
*ap->a_vpp = ZTOV(dzp);
}
ZFS_EXIT(zfsvfs);
return (error);
}
ZFS_EXIT(zfsvfs);
covered_vp = vp->v_mount->mnt_vnodecovered;
vhold(covered_vp);
ltype = VOP_ISLOCKED(vp);
VOP_UNLOCK(vp, 0);
error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
if (error == 0) {
error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
ap->a_buf, ap->a_buflen);
vput(covered_vp);
}
vn_lock(vp, ltype | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) != 0)
error = SET_ERROR(ENOENT);
return (error);
}
#ifdef DIAGNOSTIC
static int
zfs_lock(ap)
struct vop_lock1_args /* {
struct vnode *a_vp;
int a_flags;
char *file;
int line;
} */ *ap;
{
vnode_t *vp;
znode_t *zp;
int err;
err = vop_stdlock(ap);
if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
vp = ap->a_vp;
zp = vp->v_data;
if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
}
return (err);
}
#endif
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
struct vop_vector zfs_vnodeops = {
.vop_default = &default_vnodeops,
.vop_inactive = zfs_freebsd_inactive,
.vop_reclaim = zfs_freebsd_reclaim,
.vop_access = zfs_freebsd_access,
.vop_lookup = zfs_cache_lookup,
.vop_cachedlookup = zfs_freebsd_lookup,
.vop_getattr = zfs_freebsd_getattr,
.vop_setattr = zfs_freebsd_setattr,
.vop_create = zfs_freebsd_create,
.vop_mknod = zfs_freebsd_create,
.vop_mkdir = zfs_freebsd_mkdir,
.vop_readdir = zfs_freebsd_readdir,
.vop_fsync = zfs_freebsd_fsync,
.vop_open = zfs_freebsd_open,
.vop_close = zfs_freebsd_close,
.vop_rmdir = zfs_freebsd_rmdir,
.vop_ioctl = zfs_freebsd_ioctl,
.vop_link = zfs_freebsd_link,
.vop_symlink = zfs_freebsd_symlink,
.vop_readlink = zfs_freebsd_readlink,
.vop_read = zfs_freebsd_read,
.vop_write = zfs_freebsd_write,
.vop_remove = zfs_freebsd_remove,
.vop_rename = zfs_freebsd_rename,
.vop_pathconf = zfs_freebsd_pathconf,
.vop_bmap = zfs_freebsd_bmap,
.vop_fid = zfs_freebsd_fid,
.vop_getextattr = zfs_getextattr,
.vop_deleteextattr = zfs_deleteextattr,
.vop_setextattr = zfs_setextattr,
.vop_listextattr = zfs_listextattr,
.vop_getacl = zfs_freebsd_getacl,
.vop_setacl = zfs_freebsd_setacl,
.vop_aclcheck = zfs_freebsd_aclcheck,
.vop_getpages = zfs_freebsd_getpages,
.vop_putpages = zfs_freebsd_putpages,
.vop_vptocnp = zfs_vptocnp,
#ifdef DIAGNOSTIC
.vop_lock1 = zfs_lock,
#endif
};
struct vop_vector zfs_fifoops = {
.vop_default = &fifo_specops,
.vop_fsync = zfs_freebsd_fsync,
.vop_access = zfs_freebsd_access,
.vop_getattr = zfs_freebsd_getattr,
.vop_inactive = zfs_freebsd_inactive,
.vop_read = VOP_PANIC,
.vop_reclaim = zfs_freebsd_reclaim,
.vop_setattr = zfs_freebsd_setattr,
.vop_write = VOP_PANIC,
.vop_pathconf = zfs_freebsd_pathconf,
.vop_fid = zfs_freebsd_fid,
.vop_getacl = zfs_freebsd_getacl,
.vop_setacl = zfs_freebsd_setacl,
.vop_aclcheck = zfs_freebsd_aclcheck,
};
/*
* special share hidden files vnode operations template
*/
struct vop_vector zfs_shareops = {
.vop_default = &default_vnodeops,
.vop_access = zfs_freebsd_access,
.vop_inactive = zfs_freebsd_inactive,
.vop_reclaim = zfs_freebsd_reclaim,
.vop_fid = zfs_freebsd_fid,
.vop_pathconf = zfs_freebsd_pathconf,
};
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 332525)
@@ -1,3249 +1,3249 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/abd.h>
/*
* The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
* calls that change the file system. Each itx has enough information to
* be able to replay them after a system crash, power loss, or
* equivalent failure mode. These are stored in memory until either:
*
* 1. they are committed to the pool by the DMU transaction group
* (txg), at which point they can be discarded; or
* 2. they are committed to the on-disk ZIL for the dataset being
* modified (e.g. due to an fsync, O_DSYNC, or other synchronous
* requirement).
*
* In the event of a crash or power loss, the itxs contained by each
* dataset's on-disk ZIL will be replayed when that dataset is first
* instantianted (e.g. if the dataset is a normal fileystem, when it is
* first mounted).
*
* As hinted at above, there is one ZIL per dataset (both the in-memory
* representation, and the on-disk representation). The on-disk format
* consists of 3 parts:
*
* - a single, per-dataset, ZIL header; which points to a chain of
* - zero or more ZIL blocks; each of which contains
* - zero or more ZIL records
*
* A ZIL record holds the information necessary to replay a single
* system call transaction. A ZIL block can hold many ZIL records, and
* the blocks are chained together, similarly to a singly linked list.
*
* Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
* block in the chain, and the ZIL header points to the first block in
* the chain.
*
* Note, there is not a fixed place in the pool to hold these ZIL
* blocks; they are dynamically allocated and freed as needed from the
* blocks available on the pool, though they can be preferentially
* allocated from a dedicated "log" vdev.
*/
/*
* This controls the amount of time that a ZIL block (lwb) will remain
* "open" when it isn't "full", and it has a thread waiting for it to be
* committed to stable storage. Please refer to the zil_commit_waiter()
* function (and the comments within it) for more details.
*/
int zfs_commit_timeout_pct = 5;
/*
* Disable intent logging replay. This global ZIL switch affects all pools.
*/
int zil_replay_disable = 0;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN,
&zil_replay_disable, 0, "Disable intent logging replay");
/*
* Tunable parameter for debugging or performance analysis. Setting
* zfs_nocacheflush will cause corruption on power loss if a volatile
* out-of-order write cache is enabled.
*/
boolean_t zfs_nocacheflush = B_FALSE;
SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
&zfs_nocacheflush, 0, "Disable cache flush");
boolean_t zfs_trim_enabled = B_TRUE;
SYSCTL_DECL(_vfs_zfs_trim);
SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
"Enable ZFS TRIM");
/*
* Limit SLOG write size per commit executed with synchronous priority.
* Any writes above that will be executed with lower (asynchronous) priority
* to limit potential SLOG device abuse by single active ZIL writer.
*/
uint64_t zil_slog_bulk = 768 * 1024;
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN,
&zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority");
static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache;
#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
static int
zil_bp_compare(const void *x1, const void *x2)
{
const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
return (-1);
if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
return (1);
if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
return (-1);
if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
return (1);
return (0);
}
static void
zil_bp_tree_init(zilog_t *zilog)
{
avl_create(&zilog->zl_bp_tree, zil_bp_compare,
sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}
static void
zil_bp_tree_fini(zilog_t *zilog)
{
avl_tree_t *t = &zilog->zl_bp_tree;
zil_bp_node_t *zn;
void *cookie = NULL;
while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
kmem_free(zn, sizeof (zil_bp_node_t));
avl_destroy(t);
}
int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
avl_tree_t *t = &zilog->zl_bp_tree;
const dva_t *dva;
zil_bp_node_t *zn;
avl_index_t where;
if (BP_IS_EMBEDDED(bp))
return (0);
dva = BP_IDENTITY(bp);
if (avl_find(t, dva, &where) != NULL)
return (SET_ERROR(EEXIST));
zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
zn->zn_dva = *dva;
avl_insert(t, zn, where);
return (0);
}
static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
return ((zil_header_t *)zilog->zl_header);
}
static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
zio_cksum_t *zc = &bp->blk_cksum;
zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}
/*
* Read a log block and make sure it's valid.
*/
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
char **end)
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf = NULL;
zbookmark_phys_t zb;
int error;
if (zilog->zl_header->zh_claim_txg == 0)
zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
zio_flags |= ZIO_FLAG_SPECULATIVE;
SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
zio_cksum_t cksum = bp->blk_cksum;
/*
* Validate the checksummed log block.
*
* Sequence numbers should be... sequential. The checksum
* verifier for the next block should be bp's checksum plus 1.
*
* Also check the log chain linkage and size used.
*/
cksum.zc_word[ZIL_ZC_SEQ]++;
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = abuf->b_data;
char *lr = (char *)(zilc + 1);
uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
error = SET_ERROR(ECKSUM);
} else {
ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, len);
*end = (char *)dst + len;
*nbp = zilc->zc_next_blk;
}
} else {
char *lr = abuf->b_data;
uint64_t size = BP_GET_LSIZE(bp);
zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
ASSERT3U(zilc->zc_nused, <=,
SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, zilc->zc_nused);
*end = (char *)dst + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
}
}
arc_buf_destroy(abuf, &abuf);
}
return (error);
}
/*
* Read a TX_WRITE log data block.
*/
static int
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
const blkptr_t *bp = &lr->lr_blkptr;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf = NULL;
zbookmark_phys_t zb;
int error;
if (BP_IS_HOLE(bp)) {
if (wbuf != NULL)
bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
return (0);
}
if (zilog->zl_header->zh_claim_txg == 0)
zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
if (wbuf != NULL)
bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
arc_buf_destroy(abuf, &abuf);
}
return (error);
}
/*
* Parse the intent log, and call parse_func for each valid record within.
*/
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
const zil_header_t *zh = zilog->zl_header;
boolean_t claimed = !!zh->zh_claim_txg;
uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
uint64_t max_blk_seq = 0;
uint64_t max_lr_seq = 0;
uint64_t blk_count = 0;
uint64_t lr_count = 0;
blkptr_t blk, next_blk;
char *lrbuf, *lrp;
int error = 0;
/*
* Old logs didn't record the maximum zh_claim_lr_seq.
*/
if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
claim_lr_seq = UINT64_MAX;
/*
* Starting at the block pointed to by zh_log we read the log chain.
* For each block in the chain we strongly check that block to
* ensure its validity. We stop when an invalid block is found.
* For each block pointer in the chain we call parse_blk_func().
* For each record in each valid block we call parse_lr_func().
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
int reclen;
char *end;
if (blk_seq > claim_blk_seq)
break;
if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
break;
ASSERT3U(max_blk_seq, <, blk_seq);
max_blk_seq = blk_seq;
blk_count++;
if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
break;
error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
if (error != 0)
break;
for (lrp = lrbuf; lrp < end; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
if (lr->lrc_seq > claim_lr_seq)
goto done;
if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
goto done;
ASSERT3U(max_lr_seq, <, lr->lrc_seq);
max_lr_seq = lr->lrc_seq;
lr_count++;
}
}
done:
zilog->zl_parse_error = error;
zilog->zl_parse_blk_seq = max_blk_seq;
zilog->zl_parse_lr_seq = max_lr_seq;
zilog->zl_parse_blk_count = blk_count;
zilog->zl_parse_lr_count = lr_count;
ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
(max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
zil_bp_tree_fini(zilog);
zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
/*
* Claim log block if not already committed and not already claimed.
* If tx == NULL, just verify that the block is claimable.
*/
if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
zil_bp_tree_add(zilog, bp) != 0)
return (0);
return (zio_wait(zio_claim(NULL, zilog->zl_spa,
tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}
static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
lr_write_t *lr = (lr_write_t *)lrc;
int error;
if (lrc->lrc_txtype != TX_WRITE)
return (0);
/*
* If the block is not readable, don't claim it. This can happen
* in normal operation when a log block is written to disk before
* some of the dmu_sync() blocks it points to. In this case, the
* transaction cannot have been committed to anyone (we would have
* waited for all writes to be stable first), so it is semantically
* correct to declare this the end of the log.
*/
if (lr->lr_blkptr.blk_birth >= first_txg &&
(error = zil_read_log_data(zilog, lr, NULL)) != 0)
return (error);
return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}
/* ARGSUSED */
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
return (0);
}
static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
/*
* If we previously claimed it, we need to free it.
*/
if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
!BP_IS_HOLE(bp))
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
return (0);
}
static int
zil_lwb_vdev_compare(const void *x1, const void *x2)
{
const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
if (v1 < v2)
return (-1);
if (v1 > v2)
return (1);
return (0);
}
static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
{
lwb_t *lwb;
lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
lwb->lwb_zilog = zilog;
lwb->lwb_blk = *bp;
lwb->lwb_slog = slog;
lwb->lwb_state = LWB_STATE_CLOSED;
lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
lwb->lwb_max_txg = txg;
lwb->lwb_write_zio = NULL;
lwb->lwb_root_zio = NULL;
lwb->lwb_tx = NULL;
lwb->lwb_issued_timestamp = 0;
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
lwb->lwb_nused = sizeof (zil_chain_t);
lwb->lwb_sz = BP_GET_LSIZE(bp);
} else {
lwb->lwb_nused = 0;
lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
}
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
mutex_exit(&zilog->zl_lock);
ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
VERIFY(list_is_empty(&lwb->lwb_waiters));
return (lwb);
}
static void
zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
{
ASSERT(MUTEX_HELD(&zilog->zl_lock));
ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
VERIFY(list_is_empty(&lwb->lwb_waiters));
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
ASSERT3P(lwb->lwb_write_zio, ==, NULL);
ASSERT3P(lwb->lwb_root_zio, ==, NULL);
ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
lwb->lwb_state == LWB_STATE_DONE);
/*
* Clear the zilog's field to indicate this lwb is no longer
* valid, and prevent use-after-free errors.
*/
if (zilog->zl_last_lwb_opened == lwb)
zilog->zl_last_lwb_opened = NULL;
kmem_cache_free(zil_lwb_cache, lwb);
}
/*
* Called when we create in-memory log transactions so that we know
* to cleanup the itxs at the end of spa_sync().
*/
void
zilog_dirty(zilog_t *zilog, uint64_t txg)
{
dsl_pool_t *dp = zilog->zl_dmu_pool;
dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
ASSERT(spa_writeable(zilog->zl_spa));
if (ds->ds_is_snapshot)
panic("dirtying snapshot!");
if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
/* up the hold count until we can be written out */
dmu_buf_add_ref(ds->ds_dbuf, zilog);
zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
}
}
/*
* Determine if the zil is dirty in the specified txg. Callers wanting to
* ensure that the dirty state does not change must hold the itxg_lock for
* the specified txg. Holding the lock will ensure that the zil cannot be
* dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
* state.
*/
boolean_t
zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
{
dsl_pool_t *dp = zilog->zl_dmu_pool;
if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
return (B_TRUE);
return (B_FALSE);
}
/*
* Determine if the zil is dirty. The zil is considered dirty if it has
* any pending itx records that have not been cleaned by zil_clean().
*/
boolean_t
zilog_is_dirty(zilog_t *zilog)
{
dsl_pool_t *dp = zilog->zl_dmu_pool;
for (int t = 0; t < TXG_SIZE; t++) {
if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Create an on-disk intent log.
*/
static lwb_t *
zil_create(zilog_t *zilog)
{
const zil_header_t *zh = zilog->zl_header;
lwb_t *lwb = NULL;
uint64_t txg = 0;
dmu_tx_t *tx = NULL;
blkptr_t blk;
int error = 0;
boolean_t slog = FALSE;
/*
* Wait for any previous destroy to complete.
*/
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
ASSERT(zh->zh_claim_txg == 0);
ASSERT(zh->zh_replay_seq == 0);
blk = zh->zh_log;
/*
* Allocate an initial log block if:
* - there isn't one already
* - the existing block is the wrong endianess
*/
if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
tx = dmu_tx_create(zilog->zl_os);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
if (!BP_IS_HOLE(&blk)) {
zio_free_zil(zilog->zl_spa, txg, &blk);
BP_ZERO(&blk);
}
error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
ZIL_MIN_BLKSZ, &slog);
if (error == 0)
zil_init_log_chain(zilog, &blk);
}
/*
* Allocate a log write block (lwb) for the first log block.
*/
if (error == 0)
lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
/*
* If we just allocated the first log block, commit our transaction
* and wait for zil_sync() to stuff the block poiner into zh_log.
* (zh is part of the MOS, so we cannot modify it in open context.)
*/
if (tx != NULL) {
dmu_tx_commit(tx);
txg_wait_synced(zilog->zl_dmu_pool, txg);
}
ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
return (lwb);
}
/*
* In one tx, free all log blocks and clear the log header. If keep_first
* is set, then we're replaying a log with no content. We want to keep the
* first block, however, so that the first synchronous transaction doesn't
* require a txg_wait_synced() in zil_create(). We don't need to
* txg_wait_synced() here either when keep_first is set, because both
* zil_create() and zil_destroy() will wait for any in-progress destroys
* to complete.
*/
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
const zil_header_t *zh = zilog->zl_header;
lwb_t *lwb;
dmu_tx_t *tx;
uint64_t txg;
/*
* Wait for any previous destroy to complete.
*/
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
zilog->zl_old_header = *zh; /* debugging aid */
if (BP_IS_HOLE(&zh->zh_log))
return;
tx = dmu_tx_create(zilog->zl_os);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
mutex_enter(&zilog->zl_lock);
ASSERT3U(zilog->zl_destroy_txg, <, txg);
zilog->zl_destroy_txg = txg;
zilog->zl_keep_first = keep_first;
if (!list_is_empty(&zilog->zl_lwb_list)) {
ASSERT(zh->zh_claim_txg == 0);
VERIFY(!keep_first);
while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
list_remove(&zilog->zl_lwb_list, lwb);
if (lwb->lwb_buf != NULL)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
zil_free_lwb(zilog, lwb);
}
} else if (!keep_first) {
zil_destroy_sync(zilog, tx);
}
mutex_exit(&zilog->zl_lock);
dmu_tx_commit(tx);
}
void
zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
{
ASSERT(list_is_empty(&zilog->zl_lwb_list));
(void) zil_parse(zilog, zil_free_log_block,
zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
}
int
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
dmu_tx_t *tx = txarg;
uint64_t first_txg = dmu_tx_get_txg(tx);
zilog_t *zilog;
zil_header_t *zh;
objset_t *os;
int error;
error = dmu_objset_own_obj(dp, ds->ds_object,
DMU_OST_ANY, B_FALSE, FTAG, &os);
if (error != 0) {
/*
* EBUSY indicates that the objset is inconsistent, in which
* case it can not have a ZIL.
*/
if (error != EBUSY) {
cmn_err(CE_WARN, "can't open objset for %llu, error %u",
(unsigned long long)ds->ds_object, error);
}
return (0);
}
zilog = dmu_objset_zil(os);
zh = zil_header_in_syncing_context(zilog);
if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
if (!BP_IS_HOLE(&zh->zh_log))
zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
BP_ZERO(&zh->zh_log);
dsl_dataset_dirty(dmu_objset_ds(os), tx);
dmu_objset_disown(os, FTAG);
return (0);
}
/*
* Claim all log blocks if we haven't already done so, and remember
* the highest claimed sequence number. This ensures that if we can
* read only part of the log now (e.g. due to a missing device),
* but we can read the entire log later, we will not try to replay
* or destroy beyond the last block we successfully claimed.
*/
ASSERT3U(zh->zh_claim_txg, <=, first_txg);
if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
(void) zil_parse(zilog, zil_claim_log_block,
zil_claim_log_record, tx, first_txg);
zh->zh_claim_txg = first_txg;
zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
zh->zh_flags |= ZIL_REPLAY_NEEDED;
zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
dsl_dataset_dirty(dmu_objset_ds(os), tx);
}
ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
dmu_objset_disown(os, FTAG);
return (0);
}
/*
* Check the log by walking the log chain.
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
/* ARGSUSED */
int
zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
zilog_t *zilog;
objset_t *os;
blkptr_t *bp;
int error;
ASSERT(tx == NULL);
error = dmu_objset_from_ds(ds, &os);
if (error != 0) {
cmn_err(CE_WARN, "can't open objset %llu, error %d",
(unsigned long long)ds->ds_object, error);
return (0);
}
zilog = dmu_objset_zil(os);
bp = (blkptr_t *)&zilog->zl_header->zh_log;
/*
* Check the first block and determine if it's on a log device
* which may have been removed or faulted prior to loading this
* pool. If so, there's no point in checking the rest of the log
* as its content should have already been synced to the pool.
*/
if (!BP_IS_HOLE(bp)) {
vdev_t *vd;
boolean_t valid = B_TRUE;
spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
if (vd->vdev_islog && vdev_is_dead(vd))
valid = vdev_log_state_valid(vd);
spa_config_exit(os->os_spa, SCL_STATE, FTAG);
if (!valid)
return (0);
}
/*
* Because tx == NULL, zil_claim_log_block() will not actually claim
* any blocks, but just determine whether it is possible to do so.
* In addition to checking the log chain, zil_claim_log_block()
* will invoke zio_claim() with a done func of spa_claim_notify(),
* which will update spa_max_claim_txg. See spa_load() for details.
*/
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
/*
* When an itx is "skipped", this function is used to properly mark the
* waiter as "done, and signal any thread(s) waiting on it. An itx can
* be skipped (and not committed to an lwb) for a variety of reasons,
* one of them being that the itx was committed via spa_sync(), prior to
* it being committed to an lwb; this can happen if a thread calling
* zil_commit() is racing with spa_sync().
*/
static void
zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
{
mutex_enter(&zcw->zcw_lock);
ASSERT3B(zcw->zcw_done, ==, B_FALSE);
zcw->zcw_done = B_TRUE;
cv_broadcast(&zcw->zcw_cv);
mutex_exit(&zcw->zcw_lock);
}
/*
* This function is used when the given waiter is to be linked into an
* lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb.
* At this point, the waiter will no longer be referenced by the itx,
* and instead, will be referenced by the lwb.
*/
static void
zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
{
/*
* The lwb_waiters field of the lwb is protected by the zilog's
* zl_lock, thus it must be held when calling this function.
*/
ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
mutex_enter(&zcw->zcw_lock);
ASSERT(!list_link_active(&zcw->zcw_node));
ASSERT3P(zcw->zcw_lwb, ==, NULL);
ASSERT3P(lwb, !=, NULL);
ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
lwb->lwb_state == LWB_STATE_ISSUED);
list_insert_tail(&lwb->lwb_waiters, zcw);
zcw->zcw_lwb = lwb;
mutex_exit(&zcw->zcw_lock);
}
/*
* This function is used when zio_alloc_zil() fails to allocate a ZIL
* block, and the given waiter must be linked to the "nolwb waiters"
* list inside of zil_process_commit_list().
*/
static void
zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
{
mutex_enter(&zcw->zcw_lock);
ASSERT(!list_link_active(&zcw->zcw_node));
ASSERT3P(zcw->zcw_lwb, ==, NULL);
list_insert_tail(nolwb, zcw);
mutex_exit(&zcw->zcw_lock);
}
void
zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
{
avl_tree_t *t = &lwb->lwb_vdev_tree;
avl_index_t where;
zil_vdev_node_t *zv, zvsearch;
int ndvas = BP_GET_NDVAS(bp);
int i;
if (zfs_nocacheflush)
return;
mutex_enter(&lwb->lwb_vdev_lock);
for (i = 0; i < ndvas; i++) {
zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
if (avl_find(t, &zvsearch, &where) == NULL) {
zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
zv->zv_vdev = zvsearch.zv_vdev;
avl_insert(t, zv, where);
}
}
mutex_exit(&lwb->lwb_vdev_lock);
}
void
zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
{
lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
}
/*
* This function is a called after all VDEVs associated with a given lwb
* write have completed their DKIOCFLUSHWRITECACHE command; or as soon
* as the lwb write completes, if "zfs_nocacheflush" is set.
*
* The intention is for this function to be called as soon as the
* contents of an lwb are considered "stable" on disk, and will survive
* any sudden loss of power. At this point, any threads waiting for the
* lwb to reach this state are signalled, and the "waiter" structures
* are marked "done".
*/
static void
zil_lwb_flush_vdevs_done(zio_t *zio)
{
lwb_t *lwb = zio->io_private;
zilog_t *zilog = lwb->lwb_zilog;
dmu_tx_t *tx = lwb->lwb_tx;
zil_commit_waiter_t *zcw;
spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
mutex_enter(&zilog->zl_lock);
/*
* Ensure the lwb buffer pointer is cleared before releasing the
* txg. If we have had an allocation failure and the txg is
* waiting to sync then we want zil_sync() to remove the lwb so
* that it's not picked up as the next new one in
* zil_process_commit_list(). zil_sync() will only remove the
* lwb if lwb_buf is null.
*/
lwb->lwb_buf = NULL;
lwb->lwb_tx = NULL;
ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
lwb->lwb_root_zio = NULL;
lwb->lwb_state = LWB_STATE_DONE;
if (zilog->zl_last_lwb_opened == lwb) {
/*
* Remember the highest committed log sequence number
* for ztest. We only update this value when all the log
* writes succeeded, because ztest wants to ASSERT that
* it got the whole log chain.
*/
zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
}
while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
mutex_enter(&zcw->zcw_lock);
ASSERT(list_link_active(&zcw->zcw_node));
list_remove(&lwb->lwb_waiters, zcw);
ASSERT3P(zcw->zcw_lwb, ==, lwb);
zcw->zcw_lwb = NULL;
zcw->zcw_zio_error = zio->io_error;
ASSERT3B(zcw->zcw_done, ==, B_FALSE);
zcw->zcw_done = B_TRUE;
cv_broadcast(&zcw->zcw_cv);
mutex_exit(&zcw->zcw_lock);
}
mutex_exit(&zilog->zl_lock);
/*
* Now that we've written this log block, we have a stable pointer
* to the next block in the chain, so it's OK to let the txg in
* which we allocated the next block sync.
*/
dmu_tx_commit(tx);
}
/*
* This is called when an lwb write completes. This means, this specific
* lwb was written to disk, and all dependent lwb have also been
* written to disk.
*
* At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to
* the VDEVs involved in writing out this specific lwb. The lwb will be
* "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the
* zio completion callback for the lwb's root zio.
*/
static void
zil_lwb_write_done(zio_t *zio)
{
lwb_t *lwb = zio->io_private;
spa_t *spa = zio->io_spa;
zilog_t *zilog = lwb->lwb_zilog;
avl_tree_t *t = &lwb->lwb_vdev_tree;
void *cookie = NULL;
zil_vdev_node_t *zv;
ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
ASSERT(!BP_IS_GANG(zio->io_bp));
ASSERT(!BP_IS_HOLE(zio->io_bp));
ASSERT(BP_GET_FILL(zio->io_bp) == 0);
abd_put(zio->io_abd);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
mutex_enter(&zilog->zl_lock);
lwb->lwb_write_zio = NULL;
mutex_exit(&zilog->zl_lock);
if (avl_numnodes(t) == 0)
return;
/*
* If there was an IO error, we're not going to call zio_flush()
* on these vdevs, so we simply empty the tree and free the
* nodes. We avoid calling zio_flush() since there isn't any
* good reason for doing so, after the lwb block failed to be
* written out.
*/
if (zio->io_error != 0) {
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
kmem_free(zv, sizeof (*zv));
return;
}
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
if (vd != NULL)
zio_flush(lwb->lwb_root_zio, vd);
kmem_free(zv, sizeof (*zv));
}
}
/*
* This function's purpose is to "open" an lwb such that it is ready to
* accept new itxs being committed to it. To do this, the lwb's zio
* structures are created, and linked to the lwb. This function is
* idempotent; if the passed in lwb has already been opened, this
* function is essentially a no-op.
*/
static void
zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
{
zbookmark_phys_t zb;
zio_priority_t prio;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL);
EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
if (lwb->lwb_root_zio == NULL) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
BP_GET_LSIZE(&lwb->lwb_blk));
if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
prio = ZIO_PRIORITY_SYNC_WRITE;
else
prio = ZIO_PRIORITY_ASYNC_WRITE;
lwb->lwb_root_zio = zio_root(zilog->zl_spa,
zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
ASSERT3P(lwb->lwb_root_zio, !=, NULL);
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
ASSERT3P(lwb->lwb_write_zio, !=, NULL);
lwb->lwb_state = LWB_STATE_OPENED;
mutex_enter(&zilog->zl_lock);
/*
* The zilog's "zl_last_lwb_opened" field is used to
* build the lwb/zio dependency chain, which is used to
* preserve the ordering of lwb completions that is
* required by the semantics of the ZIL. Each new lwb
* zio becomes a parent of the "previous" lwb zio, such
* that the new lwb's zio cannot complete until the
* "previous" lwb's zio completes.
*
* This is required by the semantics of zil_commit();
* the commit waiters attached to the lwbs will be woken
* in the lwb zio's completion callback, so this zio
* dependency graph ensures the waiters are woken in the
* correct order (the same order the lwbs were created).
*/
lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
if (last_lwb_opened != NULL &&
last_lwb_opened->lwb_state != LWB_STATE_DONE) {
ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
zio_add_child(lwb->lwb_root_zio,
last_lwb_opened->lwb_root_zio);
}
zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
}
ASSERT3P(lwb->lwb_root_zio, !=, NULL);
ASSERT3P(lwb->lwb_write_zio, !=, NULL);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
}
/*
* Define a limited set of intent log block sizes.
*
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
* allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
*/
uint64_t zil_block_buckets[] = {
4096, /* non TX_WRITE */
8192+4096, /* data base */
32*1024 + 4096, /* NFS writes */
UINT64_MAX
};
/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
static lwb_t *
zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
{
lwb_t *nlwb = NULL;
zil_chain_t *zilc;
spa_t *spa = zilog->zl_spa;
blkptr_t *bp;
dmu_tx_t *tx;
uint64_t txg;
uint64_t zil_blksz, wsz;
int i, error;
boolean_t slog;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb->lwb_root_zio, !=, NULL);
ASSERT3P(lwb->lwb_write_zio, !=, NULL);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
zilc = (zil_chain_t *)lwb->lwb_buf;
bp = &zilc->zc_next_blk;
} else {
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
bp = &zilc->zc_next_blk;
}
ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
/*
* Allocate the next block and save its address in this block
* before writing it in order to establish the log chain.
* Note that if the allocation of nlwb synced before we wrote
* the block that points at it (lwb), we'd leak it if we crashed.
* Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
* We dirty the dataset to ensure that zil_sync() will be called
* to clean up in the event of allocation failure or I/O failure.
*/
tx = dmu_tx_create(zilog->zl_os);
/*
* Since we are not going to create any new dirty data, and we
* can even help with clearing the existing dirty data, we
* should not be subject to the dirty data based delays. We
* use TXG_NOTHROTTLE to bypass the delay mechanism.
*/
VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
lwb->lwb_tx = tx;
/*
* Log blocks are pre-allocated. Here we select the size of the next
* block, based on size used in the last block.
* - first find the smallest bucket that will fit the block from a
* limited set of block sizes. This is because it's faster to write
* blocks allocated from the same metaslab as they are adjacent or
* close.
* - next find the maximum from the new suggested size and an array of
* previous sizes. This lessens a picket fence effect of wrongly
* guesssing the size if we have a stream of say 2k, 64k, 2k, 64k
* requests.
*
* Note we only write what is used, but we can't just allocate
* the maximum block size because we can exhaust the available
* pool log space.
*/
zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
continue;
zil_blksz = zil_block_buckets[i];
if (zil_blksz == UINT64_MAX)
zil_blksz = SPA_OLD_MAXBLOCKSIZE;
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
BP_ZERO(bp);
/* pass the old blkptr in order to spread log blocks across devs */
error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
/*
* Allocate a new log write block (lwb).
*/
nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
}
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
/* For Slim ZIL only write what is used. */
wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
ASSERT3U(wsz, <=, lwb->lwb_sz);
zio_shrink(lwb->lwb_write_zio, wsz);
} else {
wsz = lwb->lwb_sz;
}
zilc->zc_pad = 0;
zilc->zc_nused = lwb->lwb_nused;
zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
/*
* clear unused data for security
*/
bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
zil_lwb_add_block(lwb, &lwb->lwb_blk);
lwb->lwb_issued_timestamp = gethrtime();
lwb->lwb_state = LWB_STATE_ISSUED;
zio_nowait(lwb->lwb_root_zio);
zio_nowait(lwb->lwb_write_zio);
/*
* If there was an allocation failure then nlwb will be null which
* forces a txg_wait_synced().
*/
return (nlwb);
}
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
lr_t *lrcb, *lrc;
lr_write_t *lrwb, *lrw;
char *lr_buf;
uint64_t dlen, dnow, lwb_sp, reclen, txg;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL);
ASSERT3P(lwb->lwb_buf, !=, NULL);
zil_lwb_write_open(zilog, lwb);
lrc = &itx->itx_lr;
lrw = (lr_write_t *)lrc;
/*
* A commit itx doesn't represent any on-disk state; instead
* it's simply used as a place holder on the commit list, and
* provides a mechanism for attaching a "commit waiter" onto the
* correct lwb (such that the waiter can be signalled upon
* completion of that lwb). Thus, we don't process this itx's
* log record if it's a commit itx (these itx's don't have log
* records), and instead link the itx's waiter onto the lwb's
* list of waiters.
*
* For more details, see the comment above zil_commit().
*/
if (lrc->lrc_txtype == TX_COMMIT) {
mutex_enter(&zilog->zl_lock);
zil_commit_waiter_link_lwb(itx->itx_private, lwb);
itx->itx_private = NULL;
mutex_exit(&zilog->zl_lock);
return (lwb);
}
if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
} else {
dlen = 0;
}
reclen = lrc->lrc_reclen;
zilog->zl_cur_used += (reclen + dlen);
txg = lrc->lrc_txg;
ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
cont:
/*
* If this record won't fit in the current log block, start a new one.
* For WR_NEED_COPY optimize layout for minimal number of chunks.
*/
lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
lwb = zil_lwb_write_issue(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_open(zilog, lwb);
ASSERT(LWB_EMPTY(lwb));
lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
}
dnow = MIN(dlen, lwb_sp - reclen);
lr_buf = lwb->lwb_buf + lwb->lwb_nused;
bcopy(lrc, lr_buf, reclen);
lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
*/
if (lrc->lrc_txtype == TX_WRITE) {
if (txg > spa_freeze_txg(zilog->zl_spa))
txg_wait_synced(zilog->zl_dmu_pool, txg);
if (itx->itx_wr_state != WR_COPIED) {
char *dbuf;
int error;
if (itx->itx_wr_state == WR_NEED_COPY) {
dbuf = lr_buf + reclen;
lrcb->lrc_reclen += dnow;
if (lrwb->lr_length > dnow)
lrwb->lr_length = dnow;
lrw->lr_offset += dnow;
lrw->lr_length -= dnow;
} else {
ASSERT(itx->itx_wr_state == WR_INDIRECT);
dbuf = NULL;
}
/*
* We pass in the "lwb_write_zio" rather than
* "lwb_root_zio" so that the "lwb_write_zio"
* becomes the parent of any zio's created by
* the "zl_get_data" callback. The vdevs are
* flushed after the "lwb_write_zio" completes,
* so we want to make sure that completion
* callback waits for these additional zio's,
* such that the vdevs used by those zio's will
* be included in the lwb's vdev tree, and those
* vdevs will be properly flushed. If we passed
* in "lwb_root_zio" here, then these additional
* vdevs may not be flushed; e.g. if these zio's
* completed after "lwb_write_zio" completed.
*/
error = zilog->zl_get_data(itx->itx_private,
lrwb, dbuf, lwb, lwb->lwb_write_zio);
if (error == EIO) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
return (lwb);
}
if (error != 0) {
ASSERT(error == ENOENT || error == EEXIST ||
error == EALREADY);
return (lwb);
}
}
}
/*
* We're actually making an entry, so update lrc_seq to be the
* log record sequence number. Note that this is generally not
* equal to the itx sequence number because not all transactions
* are synchronous, and sometimes spa_sync() gets there first.
*/
lrcb->lrc_seq = ++zilog->zl_lr_seq;
lwb->lwb_nused += reclen + dnow;
zil_lwb_add_txg(lwb, txg);
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
dlen -= dnow;
if (dlen > 0) {
zilog->zl_cur_used += reclen;
goto cont;
}
return (lwb);
}
itx_t *
zil_itx_create(uint64_t txtype, size_t lrsize)
{
itx_t *itx;
lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
itx->itx_lr.lrc_txtype = txtype;
itx->itx_lr.lrc_reclen = lrsize;
itx->itx_lr.lrc_seq = 0; /* defensive */
itx->itx_sync = B_TRUE; /* default is synchronous */
return (itx);
}
void
zil_itx_destroy(itx_t *itx)
{
kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
}
/*
* Free up the sync and async itxs. The itxs_t has already been detached
* so no locks are needed.
*/
static void
zil_itxg_clean(itxs_t *itxs)
{
itx_t *itx;
list_t *list;
avl_tree_t *t;
void *cookie;
itx_async_node_t *ian;
list = &itxs->i_sync_list;
while ((itx = list_head(list)) != NULL) {
/*
* In the general case, commit itxs will not be found
* here, as they'll be committed to an lwb via
* zil_lwb_commit(), and free'd in that function. Having
* said that, it is still possible for commit itxs to be
* found here, due to the following race:
*
* - a thread calls zil_commit() which assigns the
* commit itx to a per-txg i_sync_list
* - zil_itxg_clean() is called (e.g. via spa_sync())
* while the waiter is still on the i_sync_list
*
* There's nothing to prevent syncing the txg while the
* waiter is on the i_sync_list. This normally doesn't
* happen because spa_sync() is slower than zil_commit(),
* but if zil_commit() calls txg_wait_synced() (e.g.
* because zil_create() or zil_commit_writer_stall() is
* called) we will hit this case.
*/
if (itx->itx_lr.lrc_txtype == TX_COMMIT)
zil_commit_waiter_skip(itx->itx_private);
list_remove(list, itx);
zil_itx_destroy(itx);
}
cookie = NULL;
t = &itxs->i_async_tree;
while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
list = &ian->ia_list;
while ((itx = list_head(list)) != NULL) {
list_remove(list, itx);
/* commit itxs should never be on the async lists. */
ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
zil_itx_destroy(itx);
}
list_destroy(list);
kmem_free(ian, sizeof (itx_async_node_t));
}
avl_destroy(t);
kmem_free(itxs, sizeof (itxs_t));
}
static int
zil_aitx_compare(const void *x1, const void *x2)
{
const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
if (o1 < o2)
return (-1);
if (o1 > o2)
return (1);
return (0);
}
/*
* Remove all async itx with the given oid.
*/
static void
zil_remove_async(zilog_t *zilog, uint64_t oid)
{
uint64_t otxg, txg;
itx_async_node_t *ian;
avl_tree_t *t;
avl_index_t where;
list_t clean_list;
itx_t *itx;
ASSERT(oid != 0);
list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
otxg = ZILTEST_TXG;
else
otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
mutex_enter(&itxg->itxg_lock);
if (itxg->itxg_txg != txg) {
mutex_exit(&itxg->itxg_lock);
continue;
}
/*
* Locate the object node and append its list.
*/
t = &itxg->itxg_itxs->i_async_tree;
ian = avl_find(t, &oid, &where);
if (ian != NULL)
list_move_tail(&clean_list, &ian->ia_list);
mutex_exit(&itxg->itxg_lock);
}
while ((itx = list_head(&clean_list)) != NULL) {
list_remove(&clean_list, itx);
/* commit itxs should never be on the async lists. */
ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
zil_itx_destroy(itx);
}
list_destroy(&clean_list);
}
void
zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
{
uint64_t txg;
itxg_t *itxg;
itxs_t *itxs, *clean = NULL;
/*
* Object ids can be re-instantiated in the next txg so
* remove any async transactions to avoid future leaks.
* This can happen if a fsync occurs on the re-instantiated
* object for a WR_INDIRECT or WR_NEED_COPY write, which gets
* the new file data and flushes a write record for the old object.
*/
if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
zil_remove_async(zilog, itx->itx_oid);
/*
* Ensure the data of a renamed file is committed before the rename.
*/
if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
zil_async_to_sync(zilog, itx->itx_oid);
if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
txg = ZILTEST_TXG;
else
txg = dmu_tx_get_txg(tx);
itxg = &zilog->zl_itxg[txg & TXG_MASK];
mutex_enter(&itxg->itxg_lock);
itxs = itxg->itxg_itxs;
if (itxg->itxg_txg != txg) {
if (itxs != NULL) {
/*
* The zil_clean callback hasn't got around to cleaning
* this itxg. Save the itxs for release below.
* This should be rare.
*/
zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
"txg %llu", itxg->itxg_txg);
clean = itxg->itxg_itxs;
}
itxg->itxg_txg = txg;
itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
list_create(&itxs->i_sync_list, sizeof (itx_t),
offsetof(itx_t, itx_node));
avl_create(&itxs->i_async_tree, zil_aitx_compare,
sizeof (itx_async_node_t),
offsetof(itx_async_node_t, ia_node));
}
if (itx->itx_sync) {
list_insert_tail(&itxs->i_sync_list, itx);
} else {
avl_tree_t *t = &itxs->i_async_tree;
uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
itx_async_node_t *ian;
avl_index_t where;
ian = avl_find(t, &foid, &where);
if (ian == NULL) {
ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
list_create(&ian->ia_list, sizeof (itx_t),
offsetof(itx_t, itx_node));
ian->ia_foid = foid;
avl_insert(t, ian, where);
}
list_insert_tail(&ian->ia_list, itx);
}
itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
/*
* We don't want to dirty the ZIL using ZILTEST_TXG, because
* zil_clean() will never be called using ZILTEST_TXG. Thus, we
* need to be careful to always dirty the ZIL using the "real"
* TXG (not itxg_txg) even when the SPA is frozen.
*/
zilog_dirty(zilog, dmu_tx_get_txg(tx));
mutex_exit(&itxg->itxg_lock);
/* Release the old itxs now we've dropped the lock */
if (clean != NULL)
zil_itxg_clean(clean);
}
/*
* If there are any in-memory intent log transactions which have now been
* synced then start up a taskq to free them. We should only do this after we
* have written out the uberblocks (i.e. txg has been comitted) so that
* don't inadvertently clean out in-memory log records that would be required
* by zil_commit().
*/
void
zil_clean(zilog_t *zilog, uint64_t synced_txg)
{
itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
itxs_t *clean_me;
ASSERT3U(synced_txg, <, ZILTEST_TXG);
mutex_enter(&itxg->itxg_lock);
if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
mutex_exit(&itxg->itxg_lock);
return;
}
ASSERT3U(itxg->itxg_txg, <=, synced_txg);
ASSERT3U(itxg->itxg_txg, !=, 0);
clean_me = itxg->itxg_itxs;
itxg->itxg_itxs = NULL;
itxg->itxg_txg = 0;
mutex_exit(&itxg->itxg_lock);
/*
* Preferably start a task queue to free up the old itxs but
* if taskq_dispatch can't allocate resources to do that then
* free it in-line. This should be rare. Note, using TQ_SLEEP
* created a bad performance problem.
*/
ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
(void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
zil_itxg_clean(clean_me);
}
/*
* This function will traverse the queue of itxs that need to be
* committed, and move them onto the ZIL's zl_itx_commit_list.
*/
static void
zil_get_commit_list(zilog_t *zilog)
{
uint64_t otxg, txg;
list_t *commit_list = &zilog->zl_itx_commit_list;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
otxg = ZILTEST_TXG;
else
otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
/*
* This is inherently racy, since there is nothing to prevent
* the last synced txg from changing. That's okay since we'll
* only commit things in the future.
*/
for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
mutex_enter(&itxg->itxg_lock);
if (itxg->itxg_txg != txg) {
mutex_exit(&itxg->itxg_lock);
continue;
}
/*
* If we're adding itx records to the zl_itx_commit_list,
* then the zil better be dirty in this "txg". We can assert
* that here since we're holding the itxg_lock which will
* prevent spa_sync from cleaning it. Once we add the itxs
* to the zl_itx_commit_list we must commit it to disk even
* if it's unnecessary (i.e. the txg was synced).
*/
ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
mutex_exit(&itxg->itxg_lock);
}
}
/*
* Move the async itxs for a specified object to commit into sync lists.
*/
void
zil_async_to_sync(zilog_t *zilog, uint64_t foid)
{
uint64_t otxg, txg;
itx_async_node_t *ian;
avl_tree_t *t;
avl_index_t where;
if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
otxg = ZILTEST_TXG;
else
otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
/*
* This is inherently racy, since there is nothing to prevent
* the last synced txg from changing.
*/
for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
mutex_enter(&itxg->itxg_lock);
if (itxg->itxg_txg != txg) {
mutex_exit(&itxg->itxg_lock);
continue;
}
/*
* If a foid is specified then find that node and append its
* list. Otherwise walk the tree appending all the lists
* to the sync list. We add to the end rather than the
* beginning to ensure the create has happened.
*/
t = &itxg->itxg_itxs->i_async_tree;
if (foid != 0) {
ian = avl_find(t, &foid, &where);
if (ian != NULL) {
list_move_tail(&itxg->itxg_itxs->i_sync_list,
&ian->ia_list);
}
} else {
void *cookie = NULL;
while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
list_move_tail(&itxg->itxg_itxs->i_sync_list,
&ian->ia_list);
list_destroy(&ian->ia_list);
kmem_free(ian, sizeof (itx_async_node_t));
}
}
mutex_exit(&itxg->itxg_lock);
}
}
/*
* This function will prune commit itxs that are at the head of the
* commit list (it won't prune past the first non-commit itx), and
* either: a) attach them to the last lwb that's still pending
* completion, or b) skip them altogether.
*
* This is used as a performance optimization to prevent commit itxs
* from generating new lwbs when it's unnecessary to do so.
*/
static void
zil_prune_commit_list(zilog_t *zilog)
{
itx_t *itx;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
while (itx = list_head(&zilog->zl_itx_commit_list)) {
lr_t *lrc = &itx->itx_lr;
if (lrc->lrc_txtype != TX_COMMIT)
break;
mutex_enter(&zilog->zl_lock);
lwb_t *last_lwb = zilog->zl_last_lwb_opened;
if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) {
/*
* All of the itxs this waiter was waiting on
* must have already completed (or there were
* never any itx's for it to wait on), so it's
* safe to skip this waiter and mark it done.
*/
zil_commit_waiter_skip(itx->itx_private);
} else {
zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
itx->itx_private = NULL;
}
mutex_exit(&zilog->zl_lock);
list_remove(&zilog->zl_itx_commit_list, itx);
zil_itx_destroy(itx);
}
IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
}
static void
zil_commit_writer_stall(zilog_t *zilog)
{
/*
* When zio_alloc_zil() fails to allocate the next lwb block on
* disk, we must call txg_wait_synced() to ensure all of the
* lwbs in the zilog's zl_lwb_list are synced and then freed (in
* zil_sync()), such that any subsequent ZIL writer (i.e. a call
* to zil_process_commit_list()) will have to call zil_create(),
* and start a new ZIL chain.
*
* Since zil_alloc_zil() failed, the lwb that was previously
* issued does not have a pointer to the "next" lwb on disk.
* Thus, if another ZIL writer thread was to allocate the "next"
* on-disk lwb, that block could be leaked in the event of a
* crash (because the previous lwb on-disk would not point to
* it).
*
* We must hold the zilog's zl_issuer_lock while we do this, to
* ensure no new threads enter zil_process_commit_list() until
* all lwb's in the zl_lwb_list have been synced and freed
* (which is achieved via the txg_wait_synced() call).
*/
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
txg_wait_synced(zilog->zl_dmu_pool, 0);
ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
}
/*
* This function will traverse the commit list, creating new lwbs as
* needed, and committing the itxs from the commit list to these newly
* created lwbs. Additionally, as a new lwb is created, the previous
* lwb will be issued to the zio layer to be written to disk.
*/
static void
zil_process_commit_list(zilog_t *zilog)
{
spa_t *spa = zilog->zl_spa;
list_t nolwb_waiters;
lwb_t *lwb;
itx_t *itx;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
/*
* Return if there's nothing to commit before we dirty the fs by
* calling zil_create().
*/
if (list_head(&zilog->zl_itx_commit_list) == NULL)
return;
list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
offsetof(zil_commit_waiter_t, zcw_node));
lwb = list_tail(&zilog->zl_lwb_list);
if (lwb == NULL) {
lwb = zil_create(zilog);
} else {
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
}
while (itx = list_head(&zilog->zl_itx_commit_list)) {
lr_t *lrc = &itx->itx_lr;
uint64_t txg = lrc->lrc_txg;
ASSERT3U(txg, !=, 0);
if (lrc->lrc_txtype == TX_COMMIT) {
DTRACE_PROBE2(zil__process__commit__itx,
zilog_t *, zilog, itx_t *, itx);
} else {
DTRACE_PROBE2(zil__process__normal__itx,
zilog_t *, zilog, itx_t *, itx);
}
boolean_t synced = txg <= spa_last_synced_txg(spa);
boolean_t frozen = txg > spa_freeze_txg(spa);
/*
* If the txg of this itx has already been synced out, then
* we don't need to commit this itx to an lwb. This is
* because the data of this itx will have already been
* written to the main pool. This is inherently racy, and
* it's still ok to commit an itx whose txg has already
* been synced; this will result in a write that's
* unnecessary, but will do no harm.
*
* With that said, we always want to commit TX_COMMIT itxs
* to an lwb, regardless of whether or not that itx's txg
* has been synced out. We do this to ensure any OPENED lwb
* will always have at least one zil_commit_waiter_t linked
* to the lwb.
*
* As a counter-example, if we skipped TX_COMMIT itx's
* whose txg had already been synced, the following
* situation could occur if we happened to be racing with
* spa_sync:
*
* 1. we commit a non-TX_COMMIT itx to an lwb, where the
* itx's txg is 10 and the last synced txg is 9.
* 2. spa_sync finishes syncing out txg 10.
* 3. we move to the next itx in the list, it's a TX_COMMIT
* whose txg is 10, so we skip it rather than committing
* it to the lwb used in (1).
*
* If the itx that is skipped in (3) is the last TX_COMMIT
* itx in the commit list, than it's possible for the lwb
* used in (1) to remain in the OPENED state indefinitely.
*
* To prevent the above scenario from occuring, ensuring
* that once an lwb is OPENED it will transition to ISSUED
* and eventually DONE, we always commit TX_COMMIT itx's to
* an lwb here, even if that itx's txg has already been
* synced.
*
* Finally, if the pool is frozen, we _always_ commit the
* itx. The point of freezing the pool is to prevent data
* from being written to the main pool via spa_sync, and
* instead rely solely on the ZIL to persistently store the
* data; i.e. when the pool is frozen, the last synced txg
* value can't be trusted.
*/
if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
if (lwb != NULL) {
lwb = zil_lwb_commit(zilog, itx, lwb);
} else if (lrc->lrc_txtype == TX_COMMIT) {
ASSERT3P(lwb, ==, NULL);
zil_commit_waiter_link_nolwb(
itx->itx_private, &nolwb_waiters);
}
}
list_remove(&zilog->zl_itx_commit_list, itx);
zil_itx_destroy(itx);
}
if (lwb == NULL) {
/*
* This indicates zio_alloc_zil() failed to allocate the
* "next" lwb on-disk. When this happens, we must stall
* the ZIL write pipeline; see the comment within
* zil_commit_writer_stall() for more details.
*/
zil_commit_writer_stall(zilog);
/*
* Additionally, we have to signal and mark the "nolwb"
* waiters as "done" here, since without an lwb, we
* can't do this via zil_lwb_flush_vdevs_done() like
* normal.
*/
zil_commit_waiter_t *zcw;
while (zcw = list_head(&nolwb_waiters)) {
zil_commit_waiter_skip(zcw);
list_remove(&nolwb_waiters, zcw);
}
} else {
ASSERT(list_is_empty(&nolwb_waiters));
ASSERT3P(lwb, !=, NULL);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
/*
* At this point, the ZIL block pointed at by the "lwb"
* variable is in one of the following states: "closed"
* or "open".
*
* If its "closed", then no itxs have been committed to
* it, so there's no point in issuing its zio (i.e.
* it's "empty").
*
* If its "open" state, then it contains one or more
* itxs that eventually need to be committed to stable
* storage. In this case we intentionally do not issue
* the lwb's zio to disk yet, and instead rely on one of
* the following two mechanisms for issuing the zio:
*
* 1. Ideally, there will be more ZIL activity occuring
* on the system, such that this function will be
* immediately called again (not necessarily by the same
* thread) and this lwb's zio will be issued via
* zil_lwb_commit(). This way, the lwb is guaranteed to
* be "full" when it is issued to disk, and we'll make
* use of the lwb's size the best we can.
*
* 2. If there isn't sufficient ZIL activity occuring on
* the system, such that this lwb's zio isn't issued via
* zil_lwb_commit(), zil_commit_waiter() will issue the
* lwb's zio. If this occurs, the lwb is not guaranteed
* to be "full" by the time its zio is issued, and means
* the size of the lwb was "too large" given the amount
* of ZIL activity occuring on the system at that time.
*
* We do this for a couple of reasons:
*
* 1. To try and reduce the number of IOPs needed to
* write the same number of itxs. If an lwb has space
* available in it's buffer for more itxs, and more itxs
* will be committed relatively soon (relative to the
* latency of performing a write), then it's beneficial
* to wait for these "next" itxs. This way, more itxs
* can be committed to stable storage with fewer writes.
*
* 2. To try and use the largest lwb block size that the
* incoming rate of itxs can support. Again, this is to
* try and pack as many itxs into as few lwbs as
* possible, without significantly impacting the latency
* of each individual itx.
*/
}
}
/*
* This function is responsible for ensuring the passed in commit waiter
* (and associated commit itx) is committed to an lwb. If the waiter is
* not already committed to an lwb, all itxs in the zilog's queue of
* itxs will be processed. The assumption is the passed in waiter's
* commit itx will found in the queue just like the other non-commit
* itxs, such that when the entire queue is processed, the waiter will
* have been commited to an lwb.
*
* The lwb associated with the passed in waiter is not guaranteed to
* have been issued by the time this function completes. If the lwb is
* not issued, we rely on future calls to zil_commit_writer() to issue
* the lwb, or the timeout mechanism found in zil_commit_waiter().
*/
static void
zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
ASSERT(!MUTEX_HELD(&zilog->zl_lock));
ASSERT(spa_writeable(zilog->zl_spa));
mutex_enter(&zilog->zl_issuer_lock);
if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
/*
* It's possible that, while we were waiting to acquire
* the "zl_issuer_lock", another thread committed this
* waiter to an lwb. If that occurs, we bail out early,
* without processing any of the zilog's queue of itxs.
*
* On certain workloads and system configurations, the
* "zl_issuer_lock" can become highly contended. In an
* attempt to reduce this contention, we immediately drop
* the lock if the waiter has already been processed.
*
* We've measured this optimization to reduce CPU spent
* contending on this lock by up to 5%, using a system
* with 32 CPUs, low latency storage (~50 usec writes),
* and 1024 threads performing sync writes.
*/
goto out;
}
zil_get_commit_list(zilog);
zil_prune_commit_list(zilog);
zil_process_commit_list(zilog);
out:
mutex_exit(&zilog->zl_issuer_lock);
}
static void
zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT(MUTEX_HELD(&zcw->zcw_lock));
ASSERT3B(zcw->zcw_done, ==, B_FALSE);
lwb_t *lwb = zcw->zcw_lwb;
ASSERT3P(lwb, !=, NULL);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
/*
* If the lwb has already been issued by another thread, we can
* immediately return since there's no work to be done (the
* point of this function is to issue the lwb). Additionally, we
* do this prior to acquiring the zl_issuer_lock, to avoid
* acquiring it when it's not necessary to do so.
*/
if (lwb->lwb_state == LWB_STATE_ISSUED ||
lwb->lwb_state == LWB_STATE_DONE)
return;
/*
* In order to call zil_lwb_write_issue() we must hold the
* zilog's "zl_issuer_lock". We can't simply acquire that lock,
* since we're already holding the commit waiter's "zcw_lock",
* and those two locks are aquired in the opposite order
* elsewhere.
*/
mutex_exit(&zcw->zcw_lock);
mutex_enter(&zilog->zl_issuer_lock);
mutex_enter(&zcw->zcw_lock);
/*
* Since we just dropped and re-acquired the commit waiter's
* lock, we have to re-check to see if the waiter was marked
* "done" during that process. If the waiter was marked "done",
* the "lwb" pointer is no longer valid (it can be free'd after
* the waiter is marked "done"), so without this check we could
* wind up with a use-after-free error below.
*/
if (zcw->zcw_done)
goto out;
ASSERT3P(lwb, ==, zcw->zcw_lwb);
/*
* We've already checked this above, but since we hadn't acquired
* the zilog's zl_issuer_lock, we have to perform this check a
* second time while holding the lock.
*
* We don't need to hold the zl_lock since the lwb cannot transition
* from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
* _can_ transition from ISSUED to DONE, but it's OK to race with
* that transition since we treat the lwb the same, whether it's in
* the ISSUED or DONE states.
*
* The important thing, is we treat the lwb differently depending on
* if it's ISSUED or OPENED, and block any other threads that might
* attempt to issue this lwb. For that reason we hold the
* zl_issuer_lock when checking the lwb_state; we must not call
* zil_lwb_write_issue() if the lwb had already been issued.
*
* See the comment above the lwb_state_t structure definition for
* more details on the lwb states, and locking requirements.
*/
if (lwb->lwb_state == LWB_STATE_ISSUED ||
lwb->lwb_state == LWB_STATE_DONE)
goto out;
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
/*
* As described in the comments above zil_commit_waiter() and
* zil_process_commit_list(), we need to issue this lwb's zio
* since we've reached the commit waiter's timeout and it still
* hasn't been issued.
*/
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
/*
* Since the lwb's zio hadn't been issued by the time this thread
* reached its timeout, we reset the zilog's "zl_cur_used" field
* to influence the zil block size selection algorithm.
*
* By having to issue the lwb's zio here, it means the size of the
* lwb was too large, given the incoming throughput of itxs. By
* setting "zl_cur_used" to zero, we communicate this fact to the
* block size selection algorithm, so it can take this informaiton
* into account, and potentially select a smaller size for the
* next lwb block that is allocated.
*/
zilog->zl_cur_used = 0;
if (nlwb == NULL) {
/*
* When zil_lwb_write_issue() returns NULL, this
* indicates zio_alloc_zil() failed to allocate the
* "next" lwb on-disk. When this occurs, the ZIL write
* pipeline must be stalled; see the comment within the
* zil_commit_writer_stall() function for more details.
*
* We must drop the commit waiter's lock prior to
* calling zil_commit_writer_stall() or else we can wind
* up with the following deadlock:
*
* - This thread is waiting for the txg to sync while
* holding the waiter's lock; txg_wait_synced() is
* used within txg_commit_writer_stall().
*
* - The txg can't sync because it is waiting for this
* lwb's zio callback to call dmu_tx_commit().
*
* - The lwb's zio callback can't call dmu_tx_commit()
* because it's blocked trying to acquire the waiter's
* lock, which occurs prior to calling dmu_tx_commit()
*/
mutex_exit(&zcw->zcw_lock);
zil_commit_writer_stall(zilog);
mutex_enter(&zcw->zcw_lock);
}
out:
mutex_exit(&zilog->zl_issuer_lock);
ASSERT(MUTEX_HELD(&zcw->zcw_lock));
}
/*
* This function is responsible for performing the following two tasks:
*
* 1. its primary responsibility is to block until the given "commit
* waiter" is considered "done".
*
* 2. its secondary responsibility is to issue the zio for the lwb that
* the given "commit waiter" is waiting on, if this function has
* waited "long enough" and the lwb is still in the "open" state.
*
* Given a sufficient amount of itxs being generated and written using
* the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
* function. If this does not occur, this secondary responsibility will
* ensure the lwb is issued even if there is not other synchronous
* activity on the system.
*
* For more details, see zil_process_commit_list(); more specifically,
* the comment at the bottom of that function.
*/
static void
zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
ASSERT(!MUTEX_HELD(&zilog->zl_lock));
ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT(spa_writeable(zilog->zl_spa));
mutex_enter(&zcw->zcw_lock);
/*
* The timeout is scaled based on the lwb latency to avoid
* significantly impacting the latency of each individual itx.
* For more details, see the comment at the bottom of the
* zil_process_commit_list() function.
*/
int pct = MAX(zfs_commit_timeout_pct, 1);
#if defined(illumos) || !defined(_KERNEL)
hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
hrtime_t wakeup = gethrtime() + sleep;
#else
sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100);
sbintime_t wakeup = getsbinuptime() + sleep;
#endif
boolean_t timedout = B_FALSE;
while (!zcw->zcw_done) {
ASSERT(MUTEX_HELD(&zcw->zcw_lock));
lwb_t *lwb = zcw->zcw_lwb;
/*
* Usually, the waiter will have a non-NULL lwb field here,
* but it's possible for it to be NULL as a result of
* zil_commit() racing with spa_sync().
*
* When zil_clean() is called, it's possible for the itxg
* list (which may be cleaned via a taskq) to contain
* commit itxs. When this occurs, the commit waiters linked
* off of these commit itxs will not be committed to an
* lwb. Additionally, these commit waiters will not be
* marked done until zil_commit_waiter_skip() is called via
* zil_itxg_clean().
*
* Thus, it's possible for this commit waiter (i.e. the
* "zcw" variable) to be found in this "in between" state;
* where it's "zcw_lwb" field is NULL, and it hasn't yet
* been skipped, so it's "zcw_done" field is still B_FALSE.
*/
IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
ASSERT3B(timedout, ==, B_FALSE);
/*
* If the lwb hasn't been issued yet, then we
* need to wait with a timeout, in case this
* function needs to issue the lwb after the
* timeout is reached; responsibility (2) from
* the comment above this function.
*/
#if defined(illumos) || !defined(_KERNEL)
clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv,
&zcw->zcw_lock, wakeup, USEC2NSEC(1),
CALLOUT_FLAG_ABSOLUTE);
if (timeleft >= 0 || zcw->zcw_done)
continue;
#else
int wait_err = cv_timedwait_sbt(&zcw->zcw_cv,
&zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE);
if (wait_err != EWOULDBLOCK || zcw->zcw_done)
continue;
#endif
timedout = B_TRUE;
zil_commit_waiter_timeout(zilog, zcw);
if (!zcw->zcw_done) {
/*
* If the commit waiter has already been
* marked "done", it's possible for the
* waiter's lwb structure to have already
* been freed. Thus, we can only reliably
* make these assertions if the waiter
* isn't done.
*/
ASSERT3P(lwb, ==, zcw->zcw_lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
}
} else {
/*
* If the lwb isn't open, then it must have already
* been issued. In that case, there's no need to
* use a timeout when waiting for the lwb to
* complete.
*
* Additionally, if the lwb is NULL, the waiter
* will soon be signalled and marked done via
* zil_clean() and zil_itxg_clean(), so no timeout
* is required.
*/
IMPLY(lwb != NULL,
lwb->lwb_state == LWB_STATE_ISSUED ||
lwb->lwb_state == LWB_STATE_DONE);
cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
}
}
mutex_exit(&zcw->zcw_lock);
}
static zil_commit_waiter_t *
zil_alloc_commit_waiter()
{
zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
list_link_init(&zcw->zcw_node);
zcw->zcw_lwb = NULL;
zcw->zcw_done = B_FALSE;
zcw->zcw_zio_error = 0;
return (zcw);
}
static void
zil_free_commit_waiter(zil_commit_waiter_t *zcw)
{
ASSERT(!list_link_active(&zcw->zcw_node));
ASSERT3P(zcw->zcw_lwb, ==, NULL);
ASSERT3B(zcw->zcw_done, ==, B_TRUE);
mutex_destroy(&zcw->zcw_lock);
cv_destroy(&zcw->zcw_cv);
kmem_cache_free(zil_zcw_cache, zcw);
}
/*
* This function is used to create a TX_COMMIT itx and assign it. This
* way, it will be linked into the ZIL's list of synchronous itxs, and
* then later committed to an lwb (or skipped) when
* zil_process_commit_list() is called.
*/
static void
zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
itx->itx_sync = B_TRUE;
itx->itx_private = zcw;
zil_itx_assign(zilog, itx, tx);
dmu_tx_commit(tx);
}
/*
* Commit ZFS Intent Log transactions (itxs) to stable storage.
*
* When writing ZIL transactions to the on-disk representation of the
* ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
* itxs can be committed to a single lwb. Once a lwb is written and
* committed to stable storage (i.e. the lwb is written, and vdevs have
* been flushed), each itx that was committed to that lwb is also
* considered to be committed to stable storage.
*
* When an itx is committed to an lwb, the log record (lr_t) contained
* by the itx is copied into the lwb's zio buffer, and once this buffer
* is written to disk, it becomes an on-disk ZIL block.
*
* As itxs are generated, they're inserted into the ZIL's queue of
* uncommitted itxs. The semantics of zil_commit() are such that it will
* block until all itxs that were in the queue when it was called, are
* committed to stable storage.
*
* If "foid" is zero, this means all "synchronous" and "asynchronous"
* itxs, for all objects in the dataset, will be committed to stable
* storage prior to zil_commit() returning. If "foid" is non-zero, all
* "synchronous" itxs for all objects, but only "asynchronous" itxs
* that correspond to the foid passed in, will be committed to stable
* storage prior to zil_commit() returning.
*
* Generally speaking, when zil_commit() is called, the consumer doesn't
* actually care about _all_ of the uncommitted itxs. Instead, they're
* simply trying to waiting for a specific itx to be committed to disk,
* but the interface(s) for interacting with the ZIL don't allow such
* fine-grained communication. A better interface would allow a consumer
* to create and assign an itx, and then pass a reference to this itx to
* zil_commit(); such that zil_commit() would return as soon as that
* specific itx was committed to disk (instead of waiting for _all_
* itxs to be committed).
*
* When a thread calls zil_commit() a special "commit itx" will be
* generated, along with a corresponding "waiter" for this commit itx.
* zil_commit() will wait on this waiter's CV, such that when the waiter
* is marked done, and signalled, zil_commit() will return.
*
* This commit itx is inserted into the queue of uncommitted itxs. This
* provides an easy mechanism for determining which itxs were in the
* queue prior to zil_commit() having been called, and which itxs were
* added after zil_commit() was called.
*
* The commit it is special; it doesn't have any on-disk representation.
* When a commit itx is "committed" to an lwb, the waiter associated
* with it is linked onto the lwb's list of waiters. Then, when that lwb
* completes, each waiter on the lwb's list is marked done and signalled
* -- allowing the thread waiting on the waiter to return from zil_commit().
*
* It's important to point out a few critical factors that allow us
* to make use of the commit itxs, commit waiters, per-lwb lists of
* commit waiters, and zio completion callbacks like we're doing:
*
* 1. The list of waiters for each lwb is traversed, and each commit
* waiter is marked "done" and signalled, in the zio completion
* callback of the lwb's zio[*].
*
* * Actually, the waiters are signalled in the zio completion
* callback of the root zio for the DKIOCFLUSHWRITECACHE commands
* that are sent to the vdevs upon completion of the lwb zio.
*
* 2. When the itxs are inserted into the ZIL's queue of uncommitted
* itxs, the order in which they are inserted is preserved[*]; as
* itxs are added to the queue, they are added to the tail of
* in-memory linked lists.
*
* When committing the itxs to lwbs (to be written to disk), they
* are committed in the same order in which the itxs were added to
* the uncommitted queue's linked list(s); i.e. the linked list of
* itxs to commit is traversed from head to tail, and each itx is
* committed to an lwb in that order.
*
* * To clarify:
*
* - the order of "sync" itxs is preserved w.r.t. other
* "sync" itxs, regardless of the corresponding objects.
* - the order of "async" itxs is preserved w.r.t. other
* "async" itxs corresponding to the same object.
* - the order of "async" itxs is *not* preserved w.r.t. other
* "async" itxs corresponding to different objects.
* - the order of "sync" itxs w.r.t. "async" itxs (or vice
* versa) is *not* preserved, even for itxs that correspond
* to the same object.
*
* For more details, see: zil_itx_assign(), zil_async_to_sync(),
* zil_get_commit_list(), and zil_process_commit_list().
*
* 3. The lwbs represent a linked list of blocks on disk. Thus, any
* lwb cannot be considered committed to stable storage, until its
* "previous" lwb is also committed to stable storage. This fact,
* coupled with the fact described above, means that itxs are
* committed in (roughly) the order in which they were generated.
* This is essential because itxs are dependent on prior itxs.
* Thus, we *must not* deem an itx as being committed to stable
* storage, until *all* prior itxs have also been committed to
* stable storage.
*
* To enforce this ordering of lwb zio's, while still leveraging as
* much of the underlying storage performance as possible, we rely
* on two fundamental concepts:
*
* 1. The creation and issuance of lwb zio's is protected by
* the zilog's "zl_issuer_lock", which ensures only a single
* thread is creating and/or issuing lwb's at a time
* 2. The "previous" lwb is a child of the "current" lwb
* (leveraging the zio parent-child depenency graph)
*
* By relying on this parent-child zio relationship, we can have
* many lwb zio's concurrently issued to the underlying storage,
* but the order in which they complete will be the same order in
* which they were created.
*/
void
zil_commit(zilog_t *zilog, uint64_t foid)
{
/*
* We should never attempt to call zil_commit on a snapshot for
* a couple of reasons:
*
* 1. A snapshot may never be modified, thus it cannot have any
* in-flight itxs that would have modified the dataset.
*
* 2. By design, when zil_commit() is called, a commit itx will
* be assigned to this zilog; as a result, the zilog will be
* dirtied. We must not dirty the zilog of a snapshot; there's
* checks in the code that enforce this invariant, and will
* cause a panic if it's not upheld.
*/
ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
if (zilog->zl_sync == ZFS_SYNC_DISABLED)
return;
if (!spa_writeable(zilog->zl_spa)) {
/*
* If the SPA is not writable, there should never be any
* pending itxs waiting to be committed to disk. If that
* weren't true, we'd skip writing those itxs out, and
* would break the sematics of zil_commit(); thus, we're
* verifying that truth before we return to the caller.
*/
ASSERT(list_is_empty(&zilog->zl_lwb_list));
ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
for (int i = 0; i < TXG_SIZE; i++)
ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
return;
}
/*
* If the ZIL is suspended, we don't want to dirty it by calling
* zil_commit_itx_assign() below, nor can we write out
* lwbs like would be done in zil_commit_write(). Thus, we
* simply rely on txg_wait_synced() to maintain the necessary
* semantics, and avoid calling those functions altogether.
*/
if (zilog->zl_suspend > 0) {
txg_wait_synced(zilog->zl_dmu_pool, 0);
return;
}
zil_commit_impl(zilog, foid);
}
void
zil_commit_impl(zilog_t *zilog, uint64_t foid)
{
/*
* Move the "async" itxs for the specified foid to the "sync"
* queues, such that they will be later committed (or skipped)
* to an lwb when zil_process_commit_list() is called.
*
* Since these "async" itxs must be committed prior to this
* call to zil_commit returning, we must perform this operation
* before we call zil_commit_itx_assign().
*/
zil_async_to_sync(zilog, foid);
/*
* We allocate a new "waiter" structure which will initially be
* linked to the commit itx using the itx's "itx_private" field.
* Since the commit itx doesn't represent any on-disk state,
* when it's committed to an lwb, rather than copying the its
* lr_t into the lwb's buffer, the commit itx's "waiter" will be
* added to the lwb's list of waiters. Then, when the lwb is
* committed to stable storage, each waiter in the lwb's list of
* waiters will be marked "done", and signalled.
*
* We must create the waiter and assign the commit itx prior to
* calling zil_commit_writer(), or else our specific commit itx
* is not guaranteed to be committed to an lwb prior to calling
* zil_commit_waiter().
*/
zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
zil_commit_itx_assign(zilog, zcw);
zil_commit_writer(zilog, zcw);
zil_commit_waiter(zilog, zcw);
if (zcw->zcw_zio_error != 0) {
/*
* If there was an error writing out the ZIL blocks that
* this thread is waiting on, then we fallback to
* relying on spa_sync() to write out the data this
* thread is waiting on. Obviously this has performance
* implications, but the expectation is for this to be
* an exceptional case, and shouldn't occur often.
*/
DTRACE_PROBE2(zil__commit__io__error,
zilog_t *, zilog, zil_commit_waiter_t *, zcw);
txg_wait_synced(zilog->zl_dmu_pool, 0);
}
zil_free_commit_waiter(zcw);
}
/*
* Called in syncing context to free committed log blocks and update log header.
*/
void
zil_sync(zilog_t *zilog, dmu_tx_t *tx)
{
zil_header_t *zh = zil_header_in_syncing_context(zilog);
uint64_t txg = dmu_tx_get_txg(tx);
spa_t *spa = zilog->zl_spa;
uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
lwb_t *lwb;
/*
* We don't zero out zl_destroy_txg, so make sure we don't try
* to destroy it twice.
*/
if (spa_sync_pass(spa) != 1)
return;
mutex_enter(&zilog->zl_lock);
ASSERT(zilog->zl_stop_sync == 0);
if (*replayed_seq != 0) {
ASSERT(zh->zh_replay_seq < *replayed_seq);
zh->zh_replay_seq = *replayed_seq;
*replayed_seq = 0;
}
if (zilog->zl_destroy_txg == txg) {
blkptr_t blk = zh->zh_log;
ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
bzero(zh, sizeof (zil_header_t));
bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
if (zilog->zl_keep_first) {
/*
* If this block was part of log chain that couldn't
* be claimed because a device was missing during
* zil_claim(), but that device later returns,
* then this block could erroneously appear valid.
* To guard against this, assign a new GUID to the new
* log chain so it doesn't matter what blk points to.
*/
zil_init_log_chain(zilog, &blk);
zh->zh_log = blk;
}
}
while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
zh->zh_log = lwb->lwb_blk;
if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
break;
list_remove(&zilog->zl_lwb_list, lwb);
zio_free(spa, txg, &lwb->lwb_blk);
zil_free_lwb(zilog, lwb);
/*
* If we don't have anything left in the lwb list then
* we've had an allocation failure and we need to zero
* out the zil_header blkptr so that we don't end
* up freeing the same block twice.
*/
if (list_head(&zilog->zl_lwb_list) == NULL)
BP_ZERO(&zh->zh_log);
}
mutex_exit(&zilog->zl_lock);
}
/* ARGSUSED */
static int
zil_lwb_cons(void *vbuf, void *unused, int kmflag)
{
lwb_t *lwb = vbuf;
list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
offsetof(zil_commit_waiter_t, zcw_node));
avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
/* ARGSUSED */
static void
zil_lwb_dest(void *vbuf, void *unused)
{
lwb_t *lwb = vbuf;
mutex_destroy(&lwb->lwb_vdev_lock);
avl_destroy(&lwb->lwb_vdev_tree);
list_destroy(&lwb->lwb_waiters);
}
void
zil_init(void)
{
zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
zil_fini(void)
{
kmem_cache_destroy(zil_zcw_cache);
kmem_cache_destroy(zil_lwb_cache);
}
void
zil_set_sync(zilog_t *zilog, uint64_t sync)
{
zilog->zl_sync = sync;
}
void
zil_set_logbias(zilog_t *zilog, uint64_t logbias)
{
zilog->zl_logbias = logbias;
}
zilog_t *
zil_alloc(objset_t *os, zil_header_t *zh_phys)
{
zilog_t *zilog;
zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
zilog->zl_header = zh_phys;
zilog->zl_os = os;
zilog->zl_spa = dmu_objset_spa(os);
zilog->zl_dmu_pool = dmu_objset_pool(os);
zilog->zl_destroy_txg = TXG_INITIAL - 1;
zilog->zl_logbias = dmu_objset_logbias(os);
zilog->zl_sync = dmu_objset_syncprop(os);
zilog->zl_dirty_max_txg = 0;
zilog->zl_last_lwb_opened = NULL;
zilog->zl_last_lwb_latency = 0;
mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
for (int i = 0; i < TXG_SIZE; i++) {
mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
MUTEX_DEFAULT, NULL);
}
list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
offsetof(lwb_t, lwb_node));
list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
offsetof(itx_t, itx_node));
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
return (zilog);
}
void
zil_free(zilog_t *zilog)
{
zilog->zl_stop_sync = 1;
ASSERT0(zilog->zl_suspend);
ASSERT0(zilog->zl_suspending);
ASSERT(list_is_empty(&zilog->zl_lwb_list));
list_destroy(&zilog->zl_lwb_list);
ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
list_destroy(&zilog->zl_itx_commit_list);
for (int i = 0; i < TXG_SIZE; i++) {
/*
* It's possible for an itx to be generated that doesn't dirty
* a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
* callback to remove the entry. We remove those here.
*
* Also free up the ziltest itxs.
*/
if (zilog->zl_itxg[i].itxg_itxs)
zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
}
mutex_destroy(&zilog->zl_issuer_lock);
mutex_destroy(&zilog->zl_lock);
cv_destroy(&zilog->zl_cv_suspend);
kmem_free(zilog, sizeof (zilog_t));
}
/*
* Open an intent log.
*/
zilog_t *
zil_open(objset_t *os, zil_get_data_t *get_data)
{
zilog_t *zilog = dmu_objset_zil(os);
ASSERT3P(zilog->zl_get_data, ==, NULL);
ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
ASSERT(list_is_empty(&zilog->zl_lwb_list));
zilog->zl_get_data = get_data;
return (zilog);
}
/*
* Close an intent log.
*/
void
zil_close(zilog_t *zilog)
{
lwb_t *lwb;
uint64_t txg;
if (!dmu_objset_is_snapshot(zilog->zl_os)) {
zil_commit(zilog, 0);
} else {
ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
ASSERT0(zilog->zl_dirty_max_txg);
ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
}
mutex_enter(&zilog->zl_lock);
lwb = list_tail(&zilog->zl_lwb_list);
if (lwb == NULL)
txg = zilog->zl_dirty_max_txg;
else
txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
mutex_exit(&zilog->zl_lock);
/*
* We need to use txg_wait_synced() to wait long enough for the
* ZIL to be clean, and to wait for all pending lwbs to be
* written out.
*/
if (txg != 0)
txg_wait_synced(zilog->zl_dmu_pool, txg);
if (zilog_is_dirty(zilog))
zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
VERIFY(!zilog_is_dirty(zilog));
zilog->zl_get_data = NULL;
/*
* We should have only one lwb left on the list; remove it now.
*/
mutex_enter(&zilog->zl_lock);
lwb = list_head(&zilog->zl_lwb_list);
if (lwb != NULL) {
ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
list_remove(&zilog->zl_lwb_list, lwb);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
zil_free_lwb(zilog, lwb);
}
mutex_exit(&zilog->zl_lock);
}
static char *suspend_tag = "zil suspending";
/*
* Suspend an intent log. While in suspended mode, we still honor
* synchronous semantics, but we rely on txg_wait_synced() to do it.
* On old version pools, we suspend the log briefly when taking a
* snapshot so that it will have an empty intent log.
*
* Long holds are not really intended to be used the way we do here --
* held for such a short time. A concurrent caller of dsl_dataset_long_held()
* could fail. Therefore we take pains to only put a long hold if it is
* actually necessary. Fortunately, it will only be necessary if the
* objset is currently mounted (or the ZVOL equivalent). In that case it
* will already have a long hold, so we are not really making things any worse.
*
* Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
* zvol_state_t), and use their mechanism to prevent their hold from being
* dropped (e.g. VFS_HOLD()). However, that would be even more pain for
* very little gain.
*
* if cookiep == NULL, this does both the suspend & resume.
* Otherwise, it returns with the dataset "long held", and the cookie
* should be passed into zil_resume().
*/
int
zil_suspend(const char *osname, void **cookiep)
{
objset_t *os;
zilog_t *zilog;
const zil_header_t *zh;
int error;
error = dmu_objset_hold(osname, suspend_tag, &os);
if (error != 0)
return (error);
zilog = dmu_objset_zil(os);
mutex_enter(&zilog->zl_lock);
zh = zilog->zl_header;
if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
mutex_exit(&zilog->zl_lock);
dmu_objset_rele(os, suspend_tag);
return (SET_ERROR(EBUSY));
}
/*
* Don't put a long hold in the cases where we can avoid it. This
* is when there is no cookie so we are doing a suspend & resume
* (i.e. called from zil_vdev_offline()), and there's nothing to do
* for the suspend because it's already suspended, or there's no ZIL.
*/
if (cookiep == NULL && !zilog->zl_suspending &&
(zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
mutex_exit(&zilog->zl_lock);
dmu_objset_rele(os, suspend_tag);
return (0);
}
dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
zilog->zl_suspend++;
if (zilog->zl_suspend > 1) {
/*
* Someone else is already suspending it.
* Just wait for them to finish.
*/
while (zilog->zl_suspending)
cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
mutex_exit(&zilog->zl_lock);
if (cookiep == NULL)
zil_resume(os);
else
*cookiep = os;
return (0);
}
/*
* If there is no pointer to an on-disk block, this ZIL must not
* be active (e.g. filesystem not mounted), so there's nothing
* to clean up.
*/
if (BP_IS_HOLE(&zh->zh_log)) {
ASSERT(cookiep != NULL); /* fast path already handled */
*cookiep = os;
mutex_exit(&zilog->zl_lock);
return (0);
}
zilog->zl_suspending = B_TRUE;
mutex_exit(&zilog->zl_lock);
/*
* We need to use zil_commit_impl to ensure we wait for all
* LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed
* to disk before proceeding. If we used zil_commit instead, it
* would just call txg_wait_synced(), because zl_suspend is set.
* txg_wait_synced() doesn't wait for these lwb's to be
* LWB_STATE_DONE before returning.
*/
zil_commit_impl(zilog, 0);
/*
* Now that we've ensured all lwb's are LWB_STATE_DONE, we use
* txg_wait_synced() to ensure the data from the zilog has
* migrated to the main pool before calling zil_destroy().
*/
txg_wait_synced(zilog->zl_dmu_pool, 0);
zil_destroy(zilog, B_FALSE);
mutex_enter(&zilog->zl_lock);
zilog->zl_suspending = B_FALSE;
cv_broadcast(&zilog->zl_cv_suspend);
mutex_exit(&zilog->zl_lock);
if (cookiep == NULL)
zil_resume(os);
else
*cookiep = os;
return (0);
}
void
zil_resume(void *cookie)
{
objset_t *os = cookie;
zilog_t *zilog = dmu_objset_zil(os);
mutex_enter(&zilog->zl_lock);
ASSERT(zilog->zl_suspend != 0);
zilog->zl_suspend--;
mutex_exit(&zilog->zl_lock);
dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
}
typedef struct zil_replay_arg {
zil_replay_func_t **zr_replay;
void *zr_arg;
boolean_t zr_byteswap;
char *zr_lr;
} zil_replay_arg_t;
static int
zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
{
char name[ZFS_MAX_DATASET_NAME_LEN];
zilog->zl_replaying_seq--; /* didn't actually replay this one */
dmu_objset_name(zilog->zl_os, name);
cmn_err(CE_WARN, "ZFS replay transaction error %d, "
"dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
(u_longlong_t)lr->lrc_seq,
(u_longlong_t)(lr->lrc_txtype & ~TX_CI),
(lr->lrc_txtype & TX_CI) ? "CI" : "");
return (error);
}
static int
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
zil_replay_arg_t *zr = zra;
const zil_header_t *zh = zilog->zl_header;
uint64_t reclen = lr->lrc_reclen;
uint64_t txtype = lr->lrc_txtype;
int error = 0;
zilog->zl_replaying_seq = lr->lrc_seq;
if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
return (0);
if (lr->lrc_txg < claim_txg) /* already committed */
return (0);
/* Strip case-insensitive bit, still present in log record */
txtype &= ~TX_CI;
if (txtype == 0 || txtype >= TX_MAX_TYPE)
return (zil_replay_error(zilog, lr, EINVAL));
/*
* If this record type can be logged out of order, the object
* (lr_foid) may no longer exist. That's legitimate, not an error.
*/
if (TX_OOO(txtype)) {
error = dmu_object_info(zilog->zl_os,
((lr_ooo_t *)lr)->lr_foid, NULL);
if (error == ENOENT || error == EEXIST)
return (0);
}
/*
* Make a copy of the data so we can revise and extend it.
*/
bcopy(lr, zr->zr_lr, reclen);
/*
* If this is a TX_WRITE with a blkptr, suck in the data.
*/
if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
error = zil_read_log_data(zilog, (lr_write_t *)lr,
zr->zr_lr + reclen);
if (error != 0)
return (zil_replay_error(zilog, lr, error));
}
/*
* The log block containing this lr may have been byteswapped
* so that we can easily examine common fields like lrc_txtype.
* However, the log is a mix of different record types, and only the
* replay vectors know how to byteswap their records. Therefore, if
* the lr was byteswapped, undo it before invoking the replay vector.
*/
if (zr->zr_byteswap)
byteswap_uint64_array(zr->zr_lr, reclen);
/*
* We must now do two things atomically: replay this log record,
* and update the log header sequence number to reflect the fact that
* we did so. At the end of each replay function the sequence number
* is updated if we are in replay mode.
*/
error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
if (error != 0) {
/*
* The DMU's dnode layer doesn't see removes until the txg
* commits, so a subsequent claim can spuriously fail with
* EEXIST. So if we receive any error we try syncing out
* any removes then retry the transaction. Note that we
* specify B_FALSE for byteswap now, so we don't do it twice.
*/
txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
if (error != 0)
return (zil_replay_error(zilog, lr, error));
}
return (0);
}
/* ARGSUSED */
static int
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
zilog->zl_replay_blks++;
return (0);
}
/*
* If this dataset has a non-empty intent log, replay it and destroy it.
*/
void
zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
{
zilog_t *zilog = dmu_objset_zil(os);
const zil_header_t *zh = zilog->zl_header;
zil_replay_arg_t zr;
if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
zil_destroy(zilog, B_TRUE);
return;
}
zr.zr_replay = replay_func;
zr.zr_arg = arg;
zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
/*
* Wait for in-progress removes to sync before starting replay.
*/
txg_wait_synced(zilog->zl_dmu_pool, 0);
zilog->zl_replay = B_TRUE;
zilog->zl_replay_time = ddi_get_lbolt();
ASSERT(zilog->zl_replay_blks == 0);
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
zh->zh_claim_txg);
kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
zil_destroy(zilog, B_FALSE);
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
zilog->zl_replay = B_FALSE;
}
boolean_t
zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
{
if (zilog->zl_sync == ZFS_SYNC_DISABLED)
return (B_TRUE);
if (zilog->zl_replay) {
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
zilog->zl_replaying_seq;
return (B_TRUE);
}
return (B_FALSE);
}
/* ARGSUSED */
int
-zil_vdev_offline(const char *osname, void *arg)
+zil_reset(const char *osname, void *arg)
{
int error;
error = zil_suspend(osname, NULL);
if (error != 0)
return (SET_ERROR(EEXIST));
return (0);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 332525)
@@ -1,4151 +1,4183 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
"Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
"Exclude metadata buffers from dumps as well");
zio_trim_stats_t zio_trim_stats = {
{ "bytes", KSTAT_DATA_UINT64,
"Number of bytes successfully TRIMmed" },
{ "success", KSTAT_DATA_UINT64,
"Number of successful TRIM requests" },
{ "unsupported", KSTAT_DATA_UINT64,
"Number of TRIM requests that failed because TRIM is not supported" },
{ "failed", KSTAT_DATA_UINT64,
"Number of TRIM requests that failed for reasons other than not supported" },
};
static kstat_t *zio_trim_ksp;
/*
* ==========================================================================
* I/O type descriptions
* ==========================================================================
*/
const char *zio_type_name[ZIO_TYPES] = {
"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
"zio_ioctl"
};
boolean_t zio_dva_throttle_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN,
&zio_dva_throttle_enabled, 0, "");
/*
* ==========================================================================
* I/O kmem caches
* ==========================================================================
*/
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
#define BP_SPANB(indblkshift, level) \
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
#define COMPARE_META_LEVEL 0x80000000ul
/*
* The following actions directly effect the spa's sync-to-convergence logic.
* The values below define the sync pass when we start performing the action.
* Care should be taken when changing these values as they directly impact
* spa_sync() performance. Tuning these values may introduce subtle performance
* pathologies and should only be done in the context of performance analysis.
* These tunables will eventually be removed and replaced with #defines once
* enough analysis has been done to determine optimal values.
*
* The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
* regular blocks are not deferred.
*/
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
&zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
&zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
&zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
/*
* An allocating zio is one that either currently has the DVA allocate
* stage set or will have it later in its lifetime.
*/
#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
#ifdef illumos
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
#endif
static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
void
zio_init(void)
{
size_t c;
zio_cache = kmem_cache_create("zio_cache",
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
zio_link_cache = kmem_cache_create("zio_link_cache",
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
if (!zio_use_uma)
goto out;
/*
* For small buffers, we want a cache for each multiple of
* SPA_MINBLOCKSIZE. For larger buffers, we want a cache
* for each quarter-power of 2.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
size_t p2 = size;
size_t align = 0;
int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;
while (!ISP2(p2))
p2 &= p2 - 1;
#ifdef illumos
#ifndef _KERNEL
/*
* If we are using watchpoints, put each buffer on its own page,
* to eliminate the performance overhead of trapping to the
* kernel when modifying a non-watched buffer that shares the
* page with a watched buffer.
*/
if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
continue;
#endif
#endif /* illumos */
if (size <= 4 * SPA_MINBLOCKSIZE) {
align = SPA_MINBLOCKSIZE;
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
align = MIN(p2 >> 2, PAGESIZE);
}
if (align != 0) {
char name[36];
(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size,
align, NULL, NULL, NULL, NULL, NULL, cflags);
/*
* Since zio_data bufs do not appear in crash dumps, we
* pass KMC_NOTOUCH so that no allocator metadata is
* stored with the buffers.
*/
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size,
align, NULL, NULL, NULL, NULL, NULL,
cflags | KMC_NOTOUCH | KMC_NODEBUG);
}
}
while (--c != 0) {
ASSERT(zio_buf_cache[c] != NULL);
if (zio_buf_cache[c - 1] == NULL)
zio_buf_cache[c - 1] = zio_buf_cache[c];
ASSERT(zio_data_buf_cache[c] != NULL);
if (zio_data_buf_cache[c - 1] == NULL)
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
}
out:
zio_inject_init();
zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
KSTAT_TYPE_NAMED,
sizeof(zio_trim_stats) / sizeof(kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (zio_trim_ksp != NULL) {
zio_trim_ksp->ks_data = &zio_trim_stats;
kstat_install(zio_trim_ksp);
}
}
void
zio_fini(void)
{
size_t c;
kmem_cache_t *last_cache = NULL;
kmem_cache_t *last_data_cache = NULL;
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
if (zio_buf_cache[c] != last_cache) {
last_cache = zio_buf_cache[c];
kmem_cache_destroy(zio_buf_cache[c]);
}
zio_buf_cache[c] = NULL;
if (zio_data_buf_cache[c] != last_data_cache) {
last_data_cache = zio_data_buf_cache[c];
kmem_cache_destroy(zio_data_buf_cache[c]);
}
zio_data_buf_cache[c] = NULL;
}
kmem_cache_destroy(zio_link_cache);
kmem_cache_destroy(zio_cache);
zio_inject_fini();
if (zio_trim_ksp != NULL) {
kstat_delete(zio_trim_ksp);
zio_trim_ksp = NULL;
}
}
/*
* ==========================================================================
* Allocate and free I/O buffers
* ==========================================================================
*/
/*
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
* useful to inspect ZFS metadata, but if possible, we should avoid keeping
* excess / transient data in-core during a crashdump.
*/
void *
zio_buf_alloc(size_t size)
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
if (zio_use_uma)
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
else
return (kmem_alloc(size, KM_SLEEP|flags));
}
/*
* Use zio_data_buf_alloc to allocate data. The data will not appear in a
* crashdump if the kernel panics. This exists so that we will limit the amount
* of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
* of kernel heap dumped to disk when the kernel panics)
*/
void *
zio_data_buf_alloc(size_t size)
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
if (zio_use_uma)
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
else
return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}
void
zio_buf_free(void *buf, size_t size)
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
if (zio_use_uma)
kmem_cache_free(zio_buf_cache[c], buf);
else
kmem_free(buf, size);
}
void
zio_data_buf_free(void *buf, size_t size)
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
if (zio_use_uma)
kmem_cache_free(zio_data_buf_cache[c], buf);
else
kmem_free(buf, size);
}
/*
* ==========================================================================
* Push and pop I/O transform buffers
* ==========================================================================
*/
void
zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform)
{
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
/*
* Ensure that anyone expecting this zio to contain a linear ABD isn't
* going to get a nasty surprise when they try to access the data.
*/
#ifdef illumos
IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
#else
IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd),
abd_is_linear(data));
#endif
zt->zt_orig_abd = zio->io_abd;
zt->zt_orig_size = zio->io_size;
zt->zt_bufsize = bufsize;
zt->zt_transform = transform;
zt->zt_next = zio->io_transform_stack;
zio->io_transform_stack = zt;
zio->io_abd = data;
zio->io_size = size;
}
void
zio_pop_transforms(zio_t *zio)
{
zio_transform_t *zt;
while ((zt = zio->io_transform_stack) != NULL) {
if (zt->zt_transform != NULL)
zt->zt_transform(zio,
zt->zt_orig_abd, zt->zt_orig_size);
if (zt->zt_bufsize != 0)
abd_free(zio->io_abd);
zio->io_abd = zt->zt_orig_abd;
zio->io_size = zt->zt_orig_size;
zio->io_transform_stack = zt->zt_next;
kmem_free(zt, sizeof (zio_transform_t));
}
}
/*
* ==========================================================================
* I/O transform callbacks for subblocks and decompression
* ==========================================================================
*/
static void
zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
{
ASSERT(zio->io_size > size);
if (zio->io_type == ZIO_TYPE_READ)
abd_copy(data, zio->io_abd, size);
}
static void
zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
{
if (zio->io_error == 0) {
void *tmp = abd_borrow_buf(data, size);
int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
zio->io_abd, tmp, zio->io_size, size);
abd_return_buf_copy(data, tmp, size);
if (ret != 0)
zio->io_error = SET_ERROR(EIO);
}
}
/*
* ==========================================================================
* I/O parent/child relationships and pipeline interlocks
* ==========================================================================
*/
zio_t *
zio_walk_parents(zio_t *cio, zio_link_t **zl)
{
list_t *pl = &cio->io_parent_list;
*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
if (*zl == NULL)
return (NULL);
ASSERT((*zl)->zl_child == cio);
return ((*zl)->zl_parent);
}
zio_t *
zio_walk_children(zio_t *pio, zio_link_t **zl)
{
list_t *cl = &pio->io_child_list;
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
if (*zl == NULL)
return (NULL);
ASSERT((*zl)->zl_parent == pio);
return ((*zl)->zl_child);
}
zio_t *
zio_unique_parent(zio_t *cio)
{
zio_link_t *zl = NULL;
zio_t *pio = zio_walk_parents(cio, &zl);
VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
return (pio);
}
void
zio_add_child(zio_t *pio, zio_t *cio)
{
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
/*
* Logical I/Os can have logical, gang, or vdev children.
* Gang I/Os can have gang or vdev children.
* Vdev I/Os can only have vdev children.
* The following ASSERT captures all of these constraints.
*/
ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
zl->zl_parent = pio;
zl->zl_child = cio;
mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
list_insert_head(&pio->io_child_list, zl);
list_insert_head(&cio->io_parent_list, zl);
pio->io_child_count++;
cio->io_parent_count++;
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
}
static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
ASSERT(zl->zl_parent == pio);
ASSERT(zl->zl_child == cio);
mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
pio->io_child_count--;
cio->io_parent_count--;
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
kmem_cache_free(zio_link_cache, zl);
}
static boolean_t
zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
{
boolean_t waiting = B_FALSE;
mutex_enter(&zio->io_lock);
ASSERT(zio->io_stall == NULL);
for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
continue;
uint64_t *countp = &zio->io_children[c][wait];
if (*countp != 0) {
zio->io_stage >>= 1;
ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
zio->io_stall = countp;
waiting = B_TRUE;
break;
}
}
mutex_exit(&zio->io_lock);
return (waiting);
}
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
int *errorp = &pio->io_child_error[zio->io_child_type];
mutex_enter(&pio->io_lock);
if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
*errorp = zio_worst_error(*errorp, zio->io_error);
pio->io_reexecute |= zio->io_reexecute;
ASSERT3U(*countp, >, 0);
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
zio_taskq_type_t type =
pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
ZIO_TASKQ_INTERRUPT;
pio->io_stall = NULL;
mutex_exit(&pio->io_lock);
/*
* Dispatch the parent zio in its own taskq so that
* the child can continue to make progress. This also
* prevents overflowing the stack when we have deeply nested
* parent-child relationships.
*/
zio_taskq_dispatch(pio, type, B_FALSE);
} else {
mutex_exit(&pio->io_lock);
}
}
static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
if (zio->io_child_error[c] != 0 && zio->io_error == 0)
zio->io_error = zio->io_child_error[c];
}
int
zio_bookmark_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;
if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
return (-1);
if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
return (1);
if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
return (-1);
if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
return (1);
if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
return (-1);
if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
return (1);
if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
return (-1);
if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
return (1);
if (z1 < z2)
return (-1);
if (z1 > z2)
return (1);
return (0);
}
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
* ==========================================================================
*/
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
void *private, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, vdev_t *vd, uint64_t offset,
const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
{
zio_t *zio;
ASSERT3U(type == ZIO_TYPE_FREE || psize, <=, SPA_MAXBLOCKSIZE);
ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
ASSERT(vd || stage == ZIO_STAGE_OPEN);
IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0);
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
bzero(zio, sizeof (zio_t));
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
list_create(&zio->io_parent_list, sizeof (zio_link_t),
offsetof(zio_link_t, zl_parent_node));
list_create(&zio->io_child_list, sizeof (zio_link_t),
offsetof(zio_link_t, zl_child_node));
metaslab_trace_init(&zio->io_alloc_list);
if (vd != NULL)
zio->io_child_type = ZIO_CHILD_VDEV;
else if (flags & ZIO_FLAG_GANG_CHILD)
zio->io_child_type = ZIO_CHILD_GANG;
else if (flags & ZIO_FLAG_DDT_CHILD)
zio->io_child_type = ZIO_CHILD_DDT;
else
zio->io_child_type = ZIO_CHILD_LOGICAL;
if (bp != NULL) {
zio->io_bp = (blkptr_t *)bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
if (type != ZIO_TYPE_WRITE ||
zio->io_child_type == ZIO_CHILD_DDT)
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_logical = zio;
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
pipeline |= ZIO_GANG_STAGES;
}
zio->io_spa = spa;
zio->io_txg = txg;
zio->io_done = done;
zio->io_private = private;
zio->io_type = type;
zio->io_priority = priority;
zio->io_vd = vd;
zio->io_offset = offset;
zio->io_orig_abd = zio->io_abd = data;
zio->io_orig_size = zio->io_size = psize;
zio->io_lsize = lsize;
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
zio->io_pipeline_trace = ZIO_STAGE_OPEN;
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
if (zb != NULL)
zio->io_bookmark = *zb;
if (pio != NULL) {
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
zio->io_gang_leader = pio->io_gang_leader;
zio_add_child(pio, zio);
}
return (zio);
}
static void
zio_destroy(zio_t *zio)
{
metaslab_trace_fini(&zio->io_alloc_list);
list_destroy(&zio->io_parent_list);
list_destroy(&zio->io_child_list);
mutex_destroy(&zio->io_lock);
cv_destroy(&zio->io_cv);
kmem_cache_free(zio_cache, zio);
}
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
void *private, enum zio_flag flags)
{
zio_t *zio;
zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
return (zio);
}
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
return (zio_null(NULL, spa, NULL, done, private, flags));
}
void
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
{
if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
bp, (longlong_t)BP_GET_TYPE(bp));
}
if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
bp, (longlong_t)BP_GET_CHECKSUM(bp));
}
if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
bp, (longlong_t)BP_GET_COMPRESS(bp));
}
if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
bp, (longlong_t)BP_GET_LSIZE(bp));
}
if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
bp, (longlong_t)BP_GET_PSIZE(bp));
}
if (BP_IS_EMBEDDED(bp)) {
if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
bp, (longlong_t)BPE_GET_ETYPE(bp));
}
}
/*
* Pool-specific checks.
*
* Note: it would be nice to verify that the blk_birth and
* BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
* allows the birth time of log blocks (and dmu_sync()-ed blocks
* that are in the log) to be arbitrarily large.
*/
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
if (vdevid >= spa->spa_root_vdev->vdev_children) {
zfs_panic_recover("blkptr at %p DVA %u has invalid "
"VDEV %llu",
bp, i, (longlong_t)vdevid);
continue;
}
vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
if (vd == NULL) {
zfs_panic_recover("blkptr at %p DVA %u has invalid "
"VDEV %llu",
bp, i, (longlong_t)vdevid);
continue;
}
if (vd->vdev_ops == &vdev_hole_ops) {
zfs_panic_recover("blkptr at %p DVA %u has hole "
"VDEV %llu",
bp, i, (longlong_t)vdevid);
continue;
}
if (vd->vdev_ops == &vdev_missing_ops) {
/*
* "missing" vdevs are valid during import, but we
* don't have their detailed info (e.g. asize), so
* we can't perform any more checks on them.
*/
continue;
}
uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
if (BP_IS_GANG(bp))
asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
if (offset + asize > vd->vdev_asize) {
zfs_panic_recover("blkptr at %p DVA %u has invalid "
"OFFSET %llu",
bp, i, (longlong_t)offset);
}
}
}
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
zfs_blkptr_verify(spa, bp);
zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
data, size, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
return (zio);
}
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
const zbookmark_phys_t *zb)
{
zio_t *zio;
ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
zp->zp_compress >= ZIO_COMPRESS_OFF &&
zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
DMU_OT_IS_VALID(zp->zp_type) &&
zp->zp_level < 32 &&
zp->zp_copies > 0 &&
zp->zp_copies <= spa_max_replication(spa));
zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
zio->io_ready = ready;
zio->io_children_ready = children_ready;
zio->io_physdone = physdone;
zio->io_prop = *zp;
/*
* Data can be NULL if we are going to call zio_write_override() to
* provide the already-allocated BP. But we may need the data to
* verify a dedup hit (if requested). In this case, don't try to
* dedup (just take the already-allocated BP verbatim).
*/
if (data == NULL && zio->io_prop.zp_dedup_verify) {
zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
}
return (zio);
}
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
zio_t *zio;
zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
return (zio);
}
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
/*
* We must reset the io_prop to match the values that existed
* when the bp was first written by dmu_sync() keeping in mind
* that nopwrite and dedup are mutually exclusive.
*/
zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
zio->io_prop.zp_nopwrite = nopwrite;
zio->io_prop.zp_copies = copies;
zio->io_bp_override = bp;
}
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
+ zfs_blkptr_verify(spa, bp);
+
/*
* The check for EMBEDDED is a performance optimization. We
* process the free here (by ignoring it) rather than
* putting it on the list and then processing it in zio_free_sync().
*/
if (BP_IS_EMBEDDED(bp))
return;
metaslab_check_free(spa, bp);
/*
* Frees that are for the currently-syncing txg, are not going to be
* deferred, and which will not need to do a read (i.e. not GANG or
* DEDUP), can be processed immediately. Otherwise, put them on the
* in-memory list for later processing.
*/
if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
txg != spa->spa_syncing_txg ||
spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
} else {
VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
BP_GET_PSIZE(bp), 0)));
}
}
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
uint64_t size, enum zio_flag flags)
{
zio_t *zio;
enum zio_stage stage = ZIO_FREE_PIPELINE;
ASSERT(!BP_IS_HOLE(bp));
ASSERT(spa_syncing_txg(spa) == txg);
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
if (BP_IS_EMBEDDED(bp))
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
metaslab_check_free(spa, bp);
arc_freed(spa, bp);
if (zfs_trim_enabled)
stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
ZIO_STAGE_VDEV_IO_ASSESS;
/*
* GANG and DEDUP blocks can induce a read (for the gang block header,
* or the DDT), so issue them asynchronously so that this thread is
* not tied up.
*/
else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
stage |= ZIO_STAGE_ISSUE_ASYNC;
flags |= ZIO_FLAG_DONT_QUEUE;
zio = zio_create(pio, spa, txg, bp, NULL, size,
size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
return (zio);
}
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio_done_func_t *done, void *private, enum zio_flag flags)
{
zio_t *zio;
- dprintf_bp(bp, "claiming in txg %llu", txg);
+ zfs_blkptr_verify(spa, bp);
if (BP_IS_EMBEDDED(bp))
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
/*
* A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that
* immediate writes contain committed data, but in a txg that was
* *not* committed. Upon opening the pool after an unclean shutdown,
* the intent log claims all blocks that contain immediate write data
* so that the SPA knows they're in use.
*
* All claims *must* be resolved in the first txg -- before the SPA
* starts allocating blocks -- so that nothing is allocated twice.
* If txg == 0 we just verify that the block is claimable.
*/
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
ASSERT(txg == spa_first_txg(spa) || txg == 0);
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
ASSERT0(zio->io_queued_timestamp);
return (zio);
}
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags)
{
zio_t *zio;
int c;
if (vd->vdev_children == 0) {
zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
zio->io_cmd = cmd;
} else {
zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
for (c = 0; c < vd->vdev_children; c++)
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
offset, size, done, private, priority, flags));
}
return (zio);
}
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
ASSERT(vd->vdev_children == 0);
ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
ASSERT3U(offset + size, <=, vd->vdev_psize);
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
return (zio);
}
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
ASSERT(vd->vdev_children == 0);
ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
ASSERT3U(offset + size, <=, vd->vdev_psize);
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
/*
* zec checksums are necessarily destructive -- they modify
* the end of the write buffer to hold the verifier/checksum.
* Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel.
*/
abd_t *wbuf = abd_alloc_sametype(data, size);
abd_copy(wbuf, data, size);
zio_push_transform(zio, wbuf, size, size, NULL);
}
return (zio);
}
/*
* Create a child I/O to do some work for us.
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
abd_t *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
- ASSERT(vd->vdev_parent ==
- (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
+ /*
+ * vdev child I/Os do not propagate their error to the parent.
+ * Therefore, for correct operation the caller *must* check for
+ * and handle the error in the child i/o's done callback.
+ * The only exceptions are i/os that we don't care about
+ * (OPTIONAL or REPAIR).
+ */
+ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
+ done != NULL);
+ /*
+ * In the common case, where the parent zio was to a normal vdev,
+ * the child zio must be to a child vdev of that vdev. Otherwise,
+ * the child zio must be to a top-level vdev.
+ */
+ if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) {
+ ASSERT3P(vd->vdev_parent, ==, pio->io_vd);
+ } else {
+ ASSERT3P(vd, ==, vd->vdev_top);
+ }
+
if (type == ZIO_TYPE_READ && bp != NULL) {
/*
* If we have the bp, then the child should perform the
* checksum and the parent need not. This pushes error
* detection as close to the leaves as possible and
* eliminates redundant checksums in the interior nodes.
*/
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
/* Not all IO types require vdev io done stage e.g. free */
if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
- if (vd->vdev_children == 0)
+ if (vd->vdev_ops->vdev_op_leaf) {
+ ASSERT0(vd->vdev_children);
offset += VDEV_LABEL_START_SIZE;
+ }
- flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
+ flags |= ZIO_VDEV_CHILD_FLAGS(pio);
/*
* If we've decided to do a repair, the write is not speculative --
* even if the original read was.
*/
if (flags & ZIO_FLAG_IO_REPAIR)
flags &= ~ZIO_FLAG_SPECULATIVE;
/*
* If we're creating a child I/O that is not associated with a
* top-level vdev, then the child zio is not an allocating I/O.
* If this is a retried I/O then we ignore it since we will
* have already processed the original allocating I/O.
*/
if (flags & ZIO_FLAG_IO_ALLOCATING &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
metaslab_class_t *mc = spa_normal_class(pio->io_spa);
ASSERT(mc->mc_alloc_throttle_enabled);
ASSERT(type == ZIO_TYPE_WRITE);
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
pio->io_child_type == ZIO_CHILD_GANG);
flags &= ~ZIO_FLAG_IO_ALLOCATING;
}
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
zio->io_physdone = pio->io_physdone;
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
zio->io_logical->io_phys_children++;
return (zio);
}
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
int type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
zio_t *zio;
ASSERT(vd->vdev_ops->vdev_op_leaf);
zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
data, size, size, done, private, type, priority,
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
vd, offset, NULL,
ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
return (zio);
}
void
zio_flush(zio_t *zio, vdev_t *vd)
{
zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
NULL, NULL, ZIO_PRIORITY_NOW,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{
ASSERT(vd->vdev_ops->vdev_op_leaf);
return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL,
ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
}
void
zio_shrink(zio_t *zio, uint64_t size)
{
ASSERT3P(zio->io_executor, ==, NULL);
ASSERT3P(zio->io_orig_size, ==, zio->io_size);
ASSERT3U(size, <=, zio->io_size);
/*
* We don't shrink for raidz because of problems with the
* reconstruction when reading back less than the block size.
* Note, BP_IS_RAIDZ() assumes no compression.
*/
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
if (!BP_IS_RAIDZ(zio->io_bp)) {
/* we are not doing a raw write */
ASSERT3U(zio->io_size, ==, zio->io_lsize);
zio->io_orig_size = zio->io_size = zio->io_lsize = size;
}
}
/*
* ==========================================================================
* Prepare to read and write logical blocks
* ==========================================================================
*/
static int
zio_read_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t psize =
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
psize, psize, zio_decompress);
}
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
int psize = BPE_GET_PSIZE(bp);
void *data = abd_borrow_buf(zio->io_abd, psize);
decode_embedded_bp_compressed(bp, data);
abd_return_buf_copy(zio->io_abd, data, psize);
} else {
ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
}
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_bp_init(zio_t *zio)
{
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
if (zio->io_bp_override) {
blkptr_t *bp = zio->io_bp;
zio_prop_t *zp = &zio->io_prop;
ASSERT(bp->blk_birth != zio->io_txg);
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (BP_IS_EMBEDDED(bp))
return (ZIO_PIPELINE_CONTINUE);
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
* has already occurred.
*/
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
ASSERT(!zp->zp_dedup);
ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(!zp->zp_nopwrite);
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
BP_SET_DEDUP(bp, 1);
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
}
/*
* We were unable to handle this as an override bp, treat
* it as a regular write I/O.
*/
zio->io_bp_override = NULL;
*bp = zio->io_bp_orig;
zio->io_pipeline = zio->io_orig_pipeline;
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_compress(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_prop_t *zp = &zio->io_prop;
enum zio_compress compress = zp->zp_compress;
blkptr_t *bp = zio->io_bp;
uint64_t lsize = zio->io_lsize;
uint64_t psize = zio->io_size;
int pass = 1;
EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
/*
* If our children haven't all reached the ready stage,
* wait for them and then repeat this pipeline stage.
*/
if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
return (ZIO_PIPELINE_STOP);
}
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
if (zio->io_children_ready != NULL) {
/*
* Now that all our children are ready, run the callback
* associated with this zio in case it wants to modify the
* data to be written.
*/
ASSERT3U(zp->zp_level, >, 0);
zio->io_children_ready(zio);
}
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
* converge, it must eventually be the case that we don't
* have to allocate new blocks. But compression changes
* the blocksize, which forces a reallocate, and makes
* convergence take longer. Therefore, after the first
* few passes, stop compressing to ensure convergence.
*/
pass = spa_sync_pass(spa);
ASSERT(zio->io_txg == spa_syncing_txg(spa));
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(!BP_GET_DEDUP(bp));
if (pass >= zfs_sync_pass_dont_compress)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
/* If it's a compressed write that is not raw, compress the buffer. */
if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
void *cbuf = zio_buf_alloc(lsize);
psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
encode_embedded_bp_compressed(bp,
cbuf, compress, lsize, psize);
BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
BP_SET_TYPE(bp, zio->io_prop.zp_type);
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
zio_buf_free(cbuf, lsize);
bp->blk_birth = zio->io_txg;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
ASSERT(spa_feature_is_active(spa,
SPA_FEATURE_EMBEDDED_DATA));
return (ZIO_PIPELINE_CONTINUE);
} else {
/*
* Round up compressed size up to the ashift
* of the smallest-ashift device, and zero the tail.
* This ensures that the compressed size of the BP
* (and thus compressratio property) are correct,
* in that we charge for the padding used to fill out
* the last sector.
*/
ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
size_t rounded = (size_t)P2ROUNDUP(psize,
1ULL << spa->spa_min_ashift);
if (rounded >= lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
psize = lsize;
} else {
abd_t *cdata = abd_get_from_buf(cbuf, lsize);
abd_take_ownership_of_buf(cdata, B_TRUE);
abd_zero_off(cdata, psize, rounded - psize);
psize = rounded;
zio_push_transform(zio, cdata,
psize, lsize, NULL);
}
}
/*
* We were unable to handle this as an override bp, treat
* it as a regular write I/O.
*/
zio->io_bp_override = NULL;
*bp = zio->io_bp_orig;
zio->io_pipeline = zio->io_orig_pipeline;
} else {
ASSERT3U(psize, !=, 0);
}
/*
* The final pass of spa_sync() must be all rewrites, but the first
* few passes offer a trade-off: allocating blocks defers convergence,
* but newly allocated blocks are sequential, so they can be written
* to disk faster. Therefore, we allow the first few passes of
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
ASSERT(psize != 0);
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
BP_ZERO(bp);
zio->io_pipeline = ZIO_WRITE_PIPELINE;
}
if (psize == 0) {
if (zio->io_bp_orig.blk_birth != 0 &&
spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
BP_SET_LSIZE(bp, lsize);
BP_SET_TYPE(bp, zp->zp_type);
BP_SET_LEVEL(bp, zp->zp_level);
BP_SET_BIRTH(bp, zio->io_txg, 0);
}
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
} else {
ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
BP_SET_LSIZE(bp, lsize);
BP_SET_TYPE(bp, zp->zp_type);
BP_SET_LEVEL(bp, zp->zp_level);
BP_SET_PSIZE(bp, psize);
BP_SET_COMPRESS(bp, compress);
BP_SET_CHECKSUM(bp, zp->zp_checksum);
BP_SET_DEDUP(bp, zp->zp_dedup);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
if (zp->zp_dedup) {
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
}
if (zp->zp_nopwrite) {
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
}
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_free_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
if (BP_GET_DEDUP(bp))
zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
}
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
return (ZIO_PIPELINE_CONTINUE);
}
/*
* ==========================================================================
* Execute the I/O pipeline
* ==========================================================================
*/
static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
int flags = (cutinline ? TQ_FRONT : 0);
ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
/*
* If we're a config writer or a probe, the normal issue and
* interrupt threads may all be blocked waiting for the config lock.
* In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
*/
if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
t = ZIO_TYPE_NULL;
/*
* A similar issue exists for the L2ARC write thread until L2ARC 2.0.
*/
if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
t = ZIO_TYPE_NULL;
/*
* If this is a high priority I/O, then use the high priority taskq if
* available.
*/
if (zio->io_priority == ZIO_PRIORITY_NOW &&
spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
q++;
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
/*
* NB: We are assuming that the zio can only be dispatched
* to a single taskq at a time. It would be a grievous error
* to dispatch the zio to another taskq at the same time.
*/
#if defined(illumos) || !defined(_KERNEL)
ASSERT(zio->io_tqent.tqent_next == NULL);
#else
ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
flags, &zio->io_tqent);
}
static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
kthread_t *executor = zio->io_executor;
spa_t *spa = zio->io_spa;
for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
uint_t i;
for (i = 0; i < tqs->stqs_count; i++) {
if (taskq_member(tqs->stqs_taskq[i], executor))
return (B_TRUE);
}
}
return (B_FALSE);
}
static int
zio_issue_async(zio_t *zio)
{
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (ZIO_PIPELINE_STOP);
}
void
zio_interrupt(zio_t *zio)
{
zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
void
zio_delay_interrupt(zio_t *zio)
{
/*
* The timeout_generic() function isn't defined in userspace, so
* rather than trying to implement the function, the zio delay
* functionality has been disabled for userspace builds.
*/
#ifdef _KERNEL
/*
* If io_target_timestamp is zero, then no delay has been registered
* for this IO, thus jump to the end of this function and "skip" the
* delay; issuing it directly to the zio layer.
*/
if (zio->io_target_timestamp != 0) {
hrtime_t now = gethrtime();
if (now >= zio->io_target_timestamp) {
/*
* This IO has already taken longer than the target
* delay to complete, so we don't want to delay it
* any longer; we "miss" the delay and issue it
* directly to the zio layer. This is likely due to
* the target latency being set to a value less than
* the underlying hardware can satisfy (e.g. delay
* set to 1ms, but the disks take 10ms to complete an
* IO request).
*/
DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
hrtime_t, now);
zio_interrupt(zio);
} else {
hrtime_t diff = zio->io_target_timestamp - now;
DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
hrtime_t, now, hrtime_t, diff);
(void) timeout_generic(CALLOUT_NORMAL,
(void (*)(void *))zio_interrupt, zio, diff, 1, 0);
}
return;
}
#endif
DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
zio_interrupt(zio);
}
/*
* Execute the I/O pipeline until one of the following occurs:
*
* (1) the I/O completes
* (2) the pipeline stalls waiting for dependent child I/Os
* (3) the I/O issues, so we're waiting for an I/O completion interrupt
* (4) the I/O is delegated by vdev-level caching or aggregation
* (5) the I/O is deferred due to vdev-level queueing
* (6) the I/O is handed off to another thread.
*
* In all cases, the pipeline stops whenever there's no CPU work; it never
* burns a thread in cv_wait().
*
* There's no locking on io_stage because there's no legitimate way
* for multiple threads to be attempting to process the same I/O.
*/
static zio_pipe_stage_t *zio_pipeline[];
void
zio_execute(zio_t *zio)
{
zio->io_executor = curthread;
ASSERT3U(zio->io_queued_timestamp, >, 0);
while (zio->io_stage < ZIO_STAGE_DONE) {
enum zio_stage pipeline = zio->io_pipeline;
enum zio_stage stage = zio->io_stage;
int rv;
ASSERT(!MUTEX_HELD(&zio->io_lock));
ASSERT(ISP2(stage));
ASSERT(zio->io_stall == NULL);
do {
stage <<= 1;
} while ((stage & pipeline) == 0);
ASSERT(stage <= ZIO_STAGE_DONE);
/*
* If we are in interrupt context and this pipeline stage
* will grab a config lock that is held across I/O,
* or may wait for an I/O that needs an interrupt thread
* to complete, issue async to avoid deadlock.
*
* For VDEV_IO_START, we cut in line so that the io will
* be sent to disk promptly.
*/
if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
zio_requeue_io_start_cut_in_line : B_FALSE;
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
return;
}
zio->io_stage = stage;
zio->io_pipeline_trace |= zio->io_stage;
rv = zio_pipeline[highbit64(stage) - 1](zio);
if (rv == ZIO_PIPELINE_STOP)
return;
ASSERT(rv == ZIO_PIPELINE_CONTINUE);
}
}
/*
* ==========================================================================
* Initiate I/O, either sync or async
* ==========================================================================
*/
int
zio_wait(zio_t *zio)
{
int error;
ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN);
ASSERT3P(zio->io_executor, ==, NULL);
zio->io_waiter = curthread;
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
zio_execute(zio);
mutex_enter(&zio->io_lock);
while (zio->io_executor != NULL)
cv_wait(&zio->io_cv, &zio->io_lock);
mutex_exit(&zio->io_lock);
error = zio->io_error;
zio_destroy(zio);
return (error);
}
void
zio_nowait(zio_t *zio)
{
ASSERT3P(zio->io_executor, ==, NULL);
if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
zio_unique_parent(zio) == NULL) {
/*
* This is a logical async I/O with no parent to wait for it.
* We add it to the spa_async_root_zio "Godfather" I/O which
* will ensure they complete prior to unloading the pool.
*/
spa_t *spa = zio->io_spa;
zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
}
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
zio_execute(zio);
}
/*
* ==========================================================================
* Reexecute, cancel, or suspend/resume failed I/O
* ==========================================================================
*/
static void
zio_reexecute(zio_t *pio)
{
zio_t *cio, *cio_next;
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
ASSERT(pio->io_gang_leader == NULL);
ASSERT(pio->io_gang_tree == NULL);
pio->io_flags = pio->io_orig_flags;
pio->io_stage = pio->io_orig_stage;
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_pipeline_trace = 0;
pio->io_error = 0;
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
if (IO_IS_ALLOCATING(pio))
BP_ZERO(pio->io_bp);
/*
* As we reexecute pio's children, new children could be created.
* New children go to the head of pio's io_child_list, however,
* so we will (correctly) not reexecute them. The key is that
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
zio_link_t *zl = NULL;
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
mutex_enter(&pio->io_lock);
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
mutex_exit(&pio->io_lock);
zio_reexecute(cio);
}
/*
* Now that all children have been reexecuted, execute the parent.
* We don't reexecute "The Godfather" I/O here as it's the
* responsibility of the caller to wait on it.
*/
if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
pio->io_queued_timestamp = gethrtime();
zio_execute(pio);
}
}
void
zio_suspend(spa_t *spa, zio_t *zio)
{
if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
fm_panic("Pool '%s' has encountered an uncorrectable I/O "
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));
zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
mutex_enter(&spa->spa_suspend_lock);
if (spa->spa_suspend_zio_root == NULL)
spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
ZIO_FLAG_GODFATHER);
spa->spa_suspended = B_TRUE;
if (zio != NULL) {
ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
ASSERT(zio != spa->spa_suspend_zio_root);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio_unique_parent(zio) == NULL);
ASSERT(zio->io_stage == ZIO_STAGE_DONE);
zio_add_child(spa->spa_suspend_zio_root, zio);
}
mutex_exit(&spa->spa_suspend_lock);
}
int
zio_resume(spa_t *spa)
{
zio_t *pio;
/*
* Reexecute all previously suspended i/o.
*/
mutex_enter(&spa->spa_suspend_lock);
spa->spa_suspended = B_FALSE;
cv_broadcast(&spa->spa_suspend_cv);
pio = spa->spa_suspend_zio_root;
spa->spa_suspend_zio_root = NULL;
mutex_exit(&spa->spa_suspend_lock);
if (pio == NULL)
return (0);
zio_reexecute(pio);
return (zio_wait(pio));
}
void
zio_resume_wait(spa_t *spa)
{
mutex_enter(&spa->spa_suspend_lock);
while (spa_suspended(spa))
cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
mutex_exit(&spa->spa_suspend_lock);
}
/*
* ==========================================================================
* Gang blocks.
*
* A gang block is a collection of small blocks that looks to the DMU
* like one large block. When zio_dva_allocate() cannot find a block
* of the requested size, due to either severe fragmentation or the pool
* being nearly full, it calls zio_write_gang_block() to construct the
* block from smaller fragments.
*
* A gang block consists of a gang header (zio_gbh_phys_t) and up to
* three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
* an indirect block: it's an array of block pointers. It consumes
* only one sector and hence is allocatable regardless of fragmentation.
* The gang header's bps point to its gang members, which hold the data.
*
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
* as the verifier to ensure uniqueness of the SHA256 checksum.
* Critically, the gang block bp's blk_cksum is the checksum of the data,
* not the gang header. This ensures that data block signatures (needed for
* deduplication) are independent of how the block is physically stored.
*
* Gang blocks can be nested: a gang member may itself be a gang block.
* Thus every gang block is a tree in which root and all interior nodes are
* gang headers, and the leaves are normal blocks that contain user data.
* The root of the gang tree is called the gang leader.
*
* To perform any operation (read, rewrite, free, claim) on a gang block,
* zio_gang_assemble() first assembles the gang tree (minus data leaves)
* in the io_gang_tree field of the original logical i/o by recursively
* reading the gang leader and all gang headers below it. This yields
* an in-core tree containing the contents of every gang header and the
* bps for every constituent of the gang block.
*
* With the gang tree now assembled, zio_gang_issue() just walks the gang tree
* and invokes a callback on each bp. To free a gang block, zio_gang_issue()
* calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
* zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
* zio_read_gang() is a wrapper around zio_read() that omits reading gang
* headers, since we already have those in io_gang_tree. zio_rewrite_gang()
* performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
* of the gang header plus zio_checksum_compute() of the data to update the
* gang header's blk_cksum as described above.
*
* The two-phase assemble/issue model solves the problem of partial failure --
* what if you'd freed part of a gang block but then couldn't read the
* gang header for another part? Assembling the entire gang tree first
* ensures that all the necessary gang header I/O has succeeded before
* starting the actual work of free, claim, or write. Once the gang tree
* is assembled, free and claim are in-memory operations that cannot fail.
*
* In the event that a gang write fails, zio_dva_unallocate() walks the
* gang tree to immediately free (i.e. insert back into the space map)
* everything we've allocated. This ensures that we don't get ENOSPC
* errors during repeated suspend/resume cycles due to a flaky device.
*
* Gang rewrites only happen during sync-to-convergence. If we can't assemble
* the gang tree, we won't modify the block, so we can safely defer the free
* (knowing that the block is still intact). If we *can* assemble the gang
* tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
* each constituent bp and we can allocate a new block on the next sync pass.
*
* In all cases, the gang tree allows complete recovery from partial failure.
* ==========================================================================
*/
static void
zio_gang_issue_func_done(zio_t *zio)
{
abd_put(zio->io_abd);
}
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
if (gn != NULL)
return (pio);
return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
BP_GET_PSIZE(bp), zio_gang_issue_func_done,
NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark));
}
static zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
zio_t *zio;
if (gn != NULL) {
abd_t *gbh_abd =
abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark);
/*
* As we rewrite each gang header, the pipeline will compute
* a new gang block header checksum for it; but no one will
* compute a new data checksum, so we do that here. The one
* exception is the gang leader: the pipeline already computed
* its data checksum because that stage precedes gang assembly.
* (Presently, nothing actually uses interior data checksums;
* this is just good hygiene.)
*/
if (gn != pio->io_gang_leader->io_gang_tree) {
abd_t *buf = abd_get_offset(data, offset);
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
buf, BP_GET_PSIZE(bp));
abd_put(buf);
}
/*
* If we are here to damage data for testing purposes,
* leave the GBH alone so that we can detect the damage.
*/
if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
} else {
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
abd_get_offset(data, offset), BP_GET_PSIZE(bp),
zio_gang_issue_func_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
}
return (zio);
}
/* ARGSUSED */
static zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
ZIO_GANG_CHILD_FLAGS(pio)));
}
/* ARGSUSED */
static zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
NULL,
zio_read_gang,
zio_rewrite_gang,
zio_free_gang,
zio_claim_gang,
NULL
};
static void zio_gang_tree_assemble_done(zio_t *zio);
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn;
ASSERT(*gnpp == NULL);
gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
*gnpp = gn;
return (gn);
}
static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = *gnpp;
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
ASSERT(gn->gn_child[g] == NULL);
zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
kmem_free(gn, sizeof (*gn));
*gnpp = NULL;
}
static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = *gnpp;
if (gn == NULL)
return;
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
zio_gang_tree_free(&gn->gn_child[g]);
zio_gang_node_free(gnpp);
}
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
zio_gang_tree_assemble_done, gn, gio->io_priority,
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
zio_t *gio = zio->io_gang_leader;
zio_gang_node_t *gn = zio->io_private;
blkptr_t *bp = zio->io_bp;
ASSERT(gio == zio_unique_parent(zio));
ASSERT(zio->io_child_count == 0);
if (zio->io_error)
return;
/* this ABD was created from a linear buf in zio_gang_tree_assemble */
if (BP_SHOULD_BYTESWAP(bp))
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
abd_put(zio->io_abd);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (!BP_IS_GANG(gbp))
continue;
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
}
}
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
uint64_t offset)
{
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
ASSERT(BP_IS_GANG(bp) == !!gn);
ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
/*
* If you're a gang header, your data is in gn->gn_gbh.
* If you're a gang member, your data is in 'data' and gn == NULL.
*/
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) {
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (BP_IS_HOLE(gbp))
continue;
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
offset);
offset += BP_GET_PSIZE(gbp);
}
}
if (gn == gio->io_gang_tree && gio->io_abd != NULL)
ASSERT3U(gio->io_size, ==, offset);
if (zio != pio)
zio_nowait(zio);
}
static int
zio_gang_assemble(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
zio->io_gang_leader = zio;
zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_gang_issue(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
}
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
0);
else
zio_gang_tree_free(&zio->io_gang_tree);
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
return (ZIO_PIPELINE_CONTINUE);
}
static void
zio_write_gang_member_ready(zio_t *zio)
{
zio_t *pio = zio_unique_parent(zio);
zio_t *gio = zio->io_gang_leader;
dva_t *cdva = zio->io_bp->blk_dva;
dva_t *pdva = pio->io_bp->blk_dva;
uint64_t asize;
if (BP_IS_HOLE(zio->io_bp))
return;
ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
mutex_enter(&pio->io_lock);
for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
ASSERT(DVA_GET_GANG(&pdva[d]));
asize = DVA_GET_ASIZE(&pdva[d]);
asize += DVA_GET_ASIZE(&cdva[d]);
DVA_SET_ASIZE(&pdva[d], asize);
}
mutex_exit(&pio->io_lock);
}
static void
zio_write_gang_done(zio_t *zio)
{
abd_put(zio->io_abd);
}
static int
zio_write_gang_block(zio_t *pio)
{
spa_t *spa = pio->io_spa;
metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = pio->io_bp;
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
zio_gang_node_t *gn, **gnpp;
zio_gbh_phys_t *gbh;
abd_t *gbh_abd;
uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size;
uint64_t lsize;
int copies = gio->io_prop.zp_copies;
int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
zio_prop_t zp;
int error;
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
flags |= METASLAB_ASYNC_ALLOC;
VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
/*
* The logical zio has already placed a reservation for
* 'copies' allocation slots but gang blocks may require
* additional copies. These additional copies
* (i.e. gbh_copies - copies) are guaranteed to succeed
* since metaslab_class_throttle_reserve() always allows
* additional reservations for gang blocks.
*/
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
pio, flags));
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
&pio->io_alloc_list, pio);
if (error) {
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
/*
* If we failed to allocate the gang block header then
* we remove any additional allocation reservations that
* we placed here. The original reservation will
* be removed when the logical I/O goes to the ready
* stage.
*/
metaslab_class_throttle_unreserve(mc,
gbh_copies - copies, pio);
}
pio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
}
if (pio == gio) {
gnpp = &gio->io_gang_tree;
} else {
gnpp = pio->io_private;
ASSERT(pio->io_ready == zio_write_gang_member_ready);
}
gn = zio_gang_node_alloc(gnpp);
gbh = gn->gn_gbh;
bzero(gbh, SPA_GANGBLOCKSIZE);
gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
/*
* Create the gang header.
*/
zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
/*
* Create and nowait the gang children.
*/
for (int g = 0; resid != 0; resid -= lsize, g++) {
lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
SPA_MINBLOCKSIZE);
ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
zp.zp_checksum = gio->io_prop.zp_checksum;
zp.zp_compress = ZIO_COMPRESS_OFF;
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
zp.zp_copies = gio->io_prop.zp_copies;
zp.zp_dedup = B_FALSE;
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
/*
* Gang children won't throttle but we should
* account for their work, so reserve an allocation
* slot for them here.
*/
VERIFY(metaslab_class_throttle_reserve(mc,
zp.zp_copies, cio, flags));
}
zio_nowait(cio);
}
/*
* Set pio's pipeline to just wait for zio to finish.
*/
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
zio_nowait(zio);
return (ZIO_PIPELINE_CONTINUE);
}
/*
* The zio_nop_write stage in the pipeline determines if allocating a
* new bp is necessary. The nopwrite feature can handle writes in
* either syncing or open context (i.e. zil writes) and as a result is
* mutually exclusive with dedup.
*
* By leveraging a cryptographically secure checksum, such as SHA256, we
* can compare the checksums of the new data and the old to determine if
* allocating a new block is required. Note that our requirements for
* cryptographic strength are fairly weak: there can't be any accidental
* hash collisions, but we don't need to be secure against intentional
* (malicious) collisions. To trigger a nopwrite, you have to be able
* to write the file to begin with, and triggering an incorrect (hash
* collision) nopwrite is no worse than simply writing to the file.
* That said, there are no known attacks against the checksum algorithms
* used for nopwrite, assuming that the salt and the checksums
* themselves remain secret.
*/
static int
zio_nop_write(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
zio_prop_t *zp = &zio->io_prop;
ASSERT(BP_GET_LEVEL(bp) == 0);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
ASSERT(zp->zp_nopwrite);
ASSERT(!zp->zp_dedup);
ASSERT(zio->io_bp_override == NULL);
ASSERT(IO_IS_ALLOCATING(zio));
/*
* Check to see if the original bp and the new bp have matching
* characteristics (i.e. same checksum, compression algorithms, etc).
* If they don't then just continue with the pipeline which will
* allocate a new bp.
*/
if (BP_IS_HOLE(bp_orig) ||
!(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE) ||
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
zp->zp_copies != BP_GET_NDVAS(bp_orig))
return (ZIO_PIPELINE_CONTINUE);
/*
* If the checksums match then reset the pipeline so that we
* avoid allocating a new bp and issuing any I/O.
*/
if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE);
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
sizeof (uint64_t)) == 0);
*bp = *bp_orig;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
zio->io_flags |= ZIO_FLAG_NOPWRITE;
}
return (ZIO_PIPELINE_CONTINUE);
}
/*
* ==========================================================================
* Dedup
* ==========================================================================
*/
static void
zio_ddt_child_read_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp;
zio_t *pio = zio_unique_parent(zio);
mutex_enter(&pio->io_lock);
ddp = ddt_phys_select(dde, bp);
if (zio->io_error == 0)
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
dde->dde_repair_abd = zio->io_abd;
else
abd_free(zio->io_abd);
mutex_exit(&pio->io_lock);
}
static int
zio_ddt_read_start(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
ASSERT(BP_GET_DEDUP(bp));
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
if (zio->io_child_error[ZIO_CHILD_DDT]) {
ddt_t *ddt = ddt_select(zio->io_spa, bp);
ddt_entry_t *dde = ddt_repair_start(ddt, bp);
ddt_phys_t *ddp = dde->dde_phys;
ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
blkptr_t blk;
ASSERT(zio->io_vsd == NULL);
zio->io_vsd = dde;
if (ddp_self == NULL)
return (ZIO_PIPELINE_CONTINUE);
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
continue;
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
&blk);
zio_nowait(zio_read(zio, zio->io_spa, &blk,
abd_alloc_for_io(zio->io_size, B_TRUE),
zio->io_size, zio_ddt_child_read_done, dde,
zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
}
return (ZIO_PIPELINE_CONTINUE);
}
zio_nowait(zio_read(zio, zio->io_spa, bp,
zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_ddt_read_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
}
ASSERT(BP_GET_DEDUP(bp));
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
if (zio->io_child_error[ZIO_CHILD_DDT]) {
ddt_t *ddt = ddt_select(zio->io_spa, bp);
ddt_entry_t *dde = zio->io_vsd;
if (ddt == NULL) {
ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
return (ZIO_PIPELINE_CONTINUE);
}
if (dde == NULL) {
zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (ZIO_PIPELINE_STOP);
}
if (dde->dde_repair_abd != NULL) {
abd_copy(zio->io_abd, dde->dde_repair_abd,
zio->io_size);
zio->io_child_error[ZIO_CHILD_DDT] = 0;
}
ddt_repair_done(ddt, dde);
zio->io_vsd = NULL;
}
ASSERT(zio->io_vsd == NULL);
return (ZIO_PIPELINE_CONTINUE);
}
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
spa_t *spa = zio->io_spa;
boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW);
/* We should never get a raw, override zio */
ASSERT(!(zio->io_bp_override && do_raw));
/*
* Note: we compare the original data, not the transformed data,
* because when zio->io_bp is an override bp, we will not have
* pushed the I/O transforms. That's an important optimization
* because otherwise we'd compress/encrypt all dmu_sync() data twice.
*/
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
zio_t *lio = dde->dde_lead_zio[p];
if (lio != NULL) {
return (lio->io_orig_size != zio->io_orig_size ||
abd_cmp(zio->io_orig_abd, lio->io_orig_abd,
zio->io_orig_size) != 0);
}
}
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
ddt_phys_t *ddp = &dde->dde_phys[p];
if (ddp->ddp_phys_birth != 0) {
arc_buf_t *abuf = NULL;
arc_flags_t aflags = ARC_FLAG_WAIT;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
blkptr_t blk = *zio->io_bp;
int error;
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
ddt_exit(ddt);
/*
* Intuitively, it would make more sense to compare
* io_abd than io_orig_abd in the raw case since you
* don't want to look at any transformations that have
* happened to the data. However, for raw I/Os the
* data will actually be the same in io_abd and
* io_orig_abd, so all we have to do is issue this as
* a raw ARC read.
*/
if (do_raw) {
zio_flags |= ZIO_FLAG_RAW;
ASSERT3U(zio->io_size, ==, zio->io_orig_size);
ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd,
zio->io_size));
ASSERT3P(zio->io_transform_stack, ==, NULL);
}
error = arc_read(NULL, spa, &blk,
arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
zio_flags, &aflags, &zio->io_bookmark);
if (error == 0) {
if (arc_buf_size(abuf) != zio->io_orig_size ||
abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
zio->io_orig_size) != 0)
error = SET_ERROR(EEXIST);
arc_buf_destroy(abuf, &abuf);
}
ddt_enter(ddt);
return (error != 0);
}
}
return (B_FALSE);
}
static void
zio_ddt_child_write_ready(zio_t *zio)
{
int p = zio->io_prop.zp_copies;
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp = &dde->dde_phys[p];
zio_t *pio;
if (zio->io_error)
return;
ddt_enter(ddt);
ASSERT(dde->dde_lead_zio[p] == zio);
ddt_phys_fill(ddp, zio->io_bp);
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
ddt_exit(ddt);
}
static void
zio_ddt_child_write_done(zio_t *zio)
{
int p = zio->io_prop.zp_copies;
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp = &dde->dde_phys[p];
ddt_enter(ddt);
ASSERT(ddp->ddp_refcnt == 0);
ASSERT(dde->dde_lead_zio[p] == zio);
dde->dde_lead_zio[p] = NULL;
if (zio->io_error == 0) {
zio_link_t *zl = NULL;
while (zio_walk_parents(zio, &zl) != NULL)
ddt_phys_addref(ddp);
} else {
ddt_phys_clear(ddp);
}
ddt_exit(ddt);
}
static void
zio_ddt_ditto_write_done(zio_t *zio)
{
int p = DDT_PHYS_DITTO;
zio_prop_t *zp = &zio->io_prop;
blkptr_t *bp = zio->io_bp;
ddt_t *ddt = ddt_select(zio->io_spa, bp);
ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp = &dde->dde_phys[p];
ddt_key_t *ddk = &dde->dde_key;
ddt_enter(ddt);
ASSERT(ddp->ddp_refcnt == 0);
ASSERT(dde->dde_lead_zio[p] == zio);
dde->dde_lead_zio[p] = NULL;
if (zio->io_error == 0) {
ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
if (ddp->ddp_phys_birth != 0)
ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
ddt_phys_fill(ddp, bp);
}
ddt_exit(ddt);
}
static int
zio_ddt_write(zio_t *zio)
{
spa_t *spa = zio->io_spa;
blkptr_t *bp = zio->io_bp;
uint64_t txg = zio->io_txg;
zio_prop_t *zp = &zio->io_prop;
int p = zp->zp_copies;
int ditto_copies;
zio_t *cio = NULL;
zio_t *dio = NULL;
ddt_t *ddt = ddt_select(spa, bp);
ddt_entry_t *dde;
ddt_phys_t *ddp;
ASSERT(BP_GET_DEDUP(bp));
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE);
ddp = &dde->dde_phys[p];
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
/*
* If we're using a weak checksum, upgrade to a strong checksum
* and try again. If we're already using a strong checksum,
* we can't resolve it, so just convert to an ordinary write.
* (And automatically e-mail a paper to Nature?)
*/
if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP)) {
zp->zp_checksum = spa_dedup_checksum(spa);
zio_pop_transforms(zio);
zio->io_stage = ZIO_STAGE_OPEN;
BP_ZERO(bp);
} else {
zp->zp_dedup = B_FALSE;
BP_SET_DEDUP(bp, B_FALSE);
}
ASSERT(!BP_GET_DEDUP(bp));
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
return (ZIO_PIPELINE_CONTINUE);
}
ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
ASSERT(ditto_copies < SPA_DVAS_PER_BP);
if (ditto_copies > ddt_ditto_copies_present(dde) &&
dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
zio_prop_t czp = *zp;
czp.zp_copies = ditto_copies;
/*
* If we arrived here with an override bp, we won't have run
* the transform stack, so we won't have the data we need to
* generate a child i/o. So, toss the override bp and restart.
* This is safe, because using the override bp is just an
* optimization; and it's rare, so the cost doesn't matter.
*/
if (zio->io_bp_override) {
zio_pop_transforms(zio);
zio->io_stage = ZIO_STAGE_OPEN;
zio->io_pipeline = ZIO_WRITE_PIPELINE;
zio->io_bp_override = NULL;
BP_ZERO(bp);
ddt_exit(ddt);
return (ZIO_PIPELINE_CONTINUE);
}
dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
}
if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
if (ddp->ddp_phys_birth != 0)
ddt_bp_fill(ddp, bp, txg);
if (dde->dde_lead_zio[p] != NULL)
zio_add_child(zio, dde->dde_lead_zio[p]);
else
ddt_phys_addref(ddp);
} else if (zio->io_bp_override) {
ASSERT(bp->blk_birth == txg);
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
ddt_phys_fill(ddp, bp);
ddt_phys_addref(ddp);
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[p] = cio;
}
ddt_exit(ddt);
if (cio)
zio_nowait(cio);
if (dio)
zio_nowait(dio);
return (ZIO_PIPELINE_CONTINUE);
}
ddt_entry_t *freedde; /* for debugging */
static int
zio_ddt_free(zio_t *zio)
{
spa_t *spa = zio->io_spa;
blkptr_t *bp = zio->io_bp;
ddt_t *ddt = ddt_select(spa, bp);
ddt_entry_t *dde;
ddt_phys_t *ddp;
ASSERT(BP_GET_DEDUP(bp));
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ddt_enter(ddt);
freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
ddp = ddt_phys_select(dde, bp);
ddt_phys_decref(ddp);
ddt_exit(ddt);
return (ZIO_PIPELINE_CONTINUE);
}
/*
* ==========================================================================
* Allocate and free blocks
* ==========================================================================
*/
static zio_t *
zio_io_to_allocate(spa_t *spa)
{
zio_t *zio;
ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
zio = avl_first(&spa->spa_alloc_tree);
if (zio == NULL)
return (NULL);
ASSERT(IO_IS_ALLOCATING(zio));
/*
* Try to place a reservation for this zio. If we're unable to
* reserve then we throttle.
*/
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
zio->io_prop.zp_copies, zio, 0)) {
return (NULL);
}
avl_remove(&spa->spa_alloc_tree, zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio);
}
static int
zio_dva_throttle(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_t *nio;
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
!spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG ||
zio->io_flags & ZIO_FLAG_NODATA) {
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
mutex_enter(&spa->spa_alloc_lock);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
avl_add(&spa->spa_alloc_tree, zio);
nio = zio_io_to_allocate(zio->io_spa);
mutex_exit(&spa->spa_alloc_lock);
if (nio == zio)
return (ZIO_PIPELINE_CONTINUE);
if (nio != NULL) {
ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
/*
* We are passing control to a new zio so make sure that
* it is processed by a different thread. We do this to
* avoid stack overflows that can occur when parents are
* throttled and children are making progress. We allow
* it to go to the head of the taskq since it's already
* been waiting.
*/
zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
}
return (ZIO_PIPELINE_STOP);
}
void
zio_allocate_dispatch(spa_t *spa)
{
zio_t *zio;
mutex_enter(&spa->spa_alloc_lock);
zio = zio_io_to_allocate(spa);
mutex_exit(&spa->spa_alloc_lock);
if (zio == NULL)
return;
ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
ASSERT0(zio->io_error);
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
}
static int
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = zio->io_bp;
int error;
int flags = 0;
if (zio->io_gang_leader == NULL) {
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
zio->io_gang_leader = zio;
}
ASSERT(BP_IS_HOLE(bp));
ASSERT0(BP_GET_NDVAS(bp));
ASSERT3U(zio->io_prop.zp_copies, >, 0);
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
if (zio->io_flags & ZIO_FLAG_NODATA) {
flags |= METASLAB_DONT_THROTTLE;
}
if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
flags |= METASLAB_GANG_CHILD;
}
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
flags |= METASLAB_ASYNC_ALLOC;
}
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio);
if (error != 0) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
error);
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
return (zio_write_gang_block(zio));
zio->io_error = error;
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_dva_free(zio_t *zio)
{
metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_dva_claim(zio_t *zio)
{
int error;
error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
if (error)
zio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Undo an allocation. This is used by zio_done() when an I/O fails
* and we want to give back the block we just allocated.
* This handles both normal blocks and gang blocks.
*/
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp))
metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
if (gn != NULL) {
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
zio_dva_unallocate(zio, gn->gn_child[g],
&gn->gn_gbh->zg_blkptr[g]);
}
}
}
/*
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
uint64_t size, boolean_t *slog)
{
int error = 1;
zio_alloc_list_t io_alloc_list;
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
if (error == 0) {
*slog = TRUE;
} else {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
&io_alloc_list, NULL);
if (error == 0)
*slog = FALSE;
}
metaslab_trace_fini(&io_alloc_list);
if (error == 0) {
BP_SET_LSIZE(new_bp, size);
BP_SET_PSIZE(new_bp, size);
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
BP_SET_CHECKSUM(new_bp,
spa_version(spa) >= SPA_VERSION_SLIM_ZIL
? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
BP_SET_LEVEL(new_bp, 0);
BP_SET_DEDUP(new_bp, 0);
BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
} else {
zfs_dbgmsg("%s: zil block allocation failure: "
"size %llu, error %d", spa_name(spa), size, error);
}
return (error);
}
/*
* Free an intent log block.
*/
void
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
ASSERT(!BP_IS_GANG(bp));
zio_free(spa, txg, bp);
}
/*
* ==========================================================================
* Read, write and delete to physical devices
* ==========================================================================
*/
/*
* Issue an I/O to the underlying vdev. Typically the issue pipeline
* stops after this stage and will resume upon I/O completion.
* However, there are instances where the vdev layer may need to
* continue the pipeline when an I/O was not issued. Since the I/O
* that was sent to the vdev layer might be different than the one
* currently active in the pipeline (see vdev_queue_io()), we explicitly
* force the underlying vdev layers to call either zio_execute() or
* zio_interrupt() to ensure that the pipeline continues with the correct I/O.
*/
static int
zio_vdev_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
uint64_t align;
spa_t *spa = zio->io_spa;
int ret;
ASSERT(zio->io_error == 0);
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
if (vd == NULL) {
if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
/*
* The mirror_ops handle multiple DVAs in a single BP.
*/
vdev_mirror_ops.vdev_op_io_start(zio);
return (ZIO_PIPELINE_STOP);
}
if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
zio->io_priority == ZIO_PRIORITY_NOW) {
trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT3P(zio->io_logical, !=, zio);
+ if (zio->io_type == ZIO_TYPE_WRITE && zio->io_vd->vdev_removing) {
+ ASSERT(zio->io_flags &
+ (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_INDUCE_DAMAGE));
+ }
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
* about non-scrubbing, top-level reads and writes with the following
* characteristics:
* - synchronous writes of user data to non-slog devices
* - any reads of user data
* When these conditions are met, adjust the timestamp of spa_last_io
* which allows the scan thread to adjust its workload accordingly.
*/
if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
vd == vd->vdev_top && !vd->vdev_islog &&
zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
zio->io_txg != spa_syncing_txg(spa)) {
uint64_t old = spa->spa_last_io;
uint64_t new = ddi_get_lbolt64();
if (old != new)
(void) atomic_cas_64(&spa->spa_last_io, old, new);
}
align = 1ULL << vd->vdev_top->vdev_ashift;
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
P2PHASE(zio->io_size, align) != 0) {
/* Transform logical writes to be a full physical block size. */
uint64_t asize = P2ROUNDUP(zio->io_size, align);
abd_t *abuf = NULL;
if (zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE)
abuf = abd_alloc_sametype(zio->io_abd, asize);
ASSERT(vd == vd->vdev_top);
if (zio->io_type == ZIO_TYPE_WRITE) {
abd_copy(abuf, zio->io_abd, zio->io_size);
abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
}
zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
zio_subblock);
}
/*
* If this is not a physical io, make sure that it is properly aligned
* before proceeding.
*/
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
ASSERT0(P2PHASE(zio->io_offset, align));
ASSERT0(P2PHASE(zio->io_size, align));
} else {
/*
* For the physical io we allow alignment
* to a logical block size.
*/
uint64_t log_align =
1ULL << vd->vdev_top->vdev_logical_ashift;
ASSERT0(P2PHASE(zio->io_offset, log_align));
ASSERT0(P2PHASE(zio->io_size, log_align));
}
VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
/*
* If this is a repair I/O, and there's no self-healing involved --
* that is, we're just resilvering what we expect to resilver --
* then don't do the I/O unless zio's txg is actually in vd's DTL.
* This prevents spurious resilvering with nested replication.
* For example, given a mirror of mirrors, (A+B)+(C+D), if only
* A is out of date, we'll read from C+D, then use the data to
* resilver A+B -- but we don't actually want to resilver B, just A.
* The top-level mirror has no way to know this, so instead we just
* discard unnecessary repairs as we work our way down the vdev tree.
* The same logic applies to any form of nested replication:
* ditto + mirror, RAID-Z + replacing, etc. This covers them all.
*/
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
zio->io_txg != 0 && /* not a delegated i/o */
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio_vdev_io_bypass(zio);
return (ZIO_PIPELINE_CONTINUE);
}
if (vd->vdev_ops->vdev_op_leaf) {
switch (zio->io_type) {
case ZIO_TYPE_READ:
if (vdev_cache_read(zio))
return (ZIO_PIPELINE_CONTINUE);
/* FALLTHROUGH */
case ZIO_TYPE_WRITE:
case ZIO_TYPE_FREE:
if ((zio = vdev_queue_io(zio)) == NULL)
return (ZIO_PIPELINE_STOP);
if (!vdev_accessible(vd, zio)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return (ZIO_PIPELINE_STOP);
}
break;
}
/*
* Note that we ignore repair writes for TRIM because they can
* conflict with normal writes. This isn't an issue because, by
* definition, we only repair blocks that aren't freed.
*/
if (zio->io_type == ZIO_TYPE_WRITE &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!trim_map_write_start(zio))
return (ZIO_PIPELINE_STOP);
}
vd->vdev_ops->vdev_op_io_start(zio);
return (ZIO_PIPELINE_STOP);
}
static int
zio_vdev_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
boolean_t unexpected_error = B_FALSE;
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
}
ASSERT(zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_FREE)) {
if (zio->io_type == ZIO_TYPE_WRITE &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR))
trim_map_write_done(zio);
vdev_queue_io_done(zio);
if (zio->io_type == ZIO_TYPE_WRITE)
vdev_cache_write(zio);
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_device_injection(vd,
zio, EIO);
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_label_injection(zio, EIO);
if (zio->io_error) {
if (zio->io_error == ENOTSUP &&
zio->io_type == ZIO_TYPE_FREE) {
/* Not all devices support TRIM. */
} else if (!vdev_accessible(vd, zio)) {
zio->io_error = SET_ERROR(ENXIO);
} else {
unexpected_error = B_TRUE;
}
}
}
ops->vdev_op_io_done(zio);
if (unexpected_error)
VERIFY(vdev_probe(vd, zio) == NULL);
return (ZIO_PIPELINE_CONTINUE);
}
/*
* For non-raidz ZIOs, we can just copy aside the bad data read from the
* disk, and use that to finish the checksum ereport later.
*/
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
const void *good_buf)
{
/* no processing needed */
zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}
/*ARGSUSED*/
void
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
{
void *buf = zio_buf_alloc(zio->io_size);
abd_copy_to_buf(buf, zio->io_abd, zio->io_size);
zcr->zcr_cbinfo = zio->io_size;
zcr->zcr_cbdata = buf;
zcr->zcr_finish = zio_vsd_default_cksum_finish;
zcr->zcr_free = zio_buf_free;
}
static int
zio_vdev_io_assess(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
}
if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
spa_config_exit(zio->io_spa, SCL_ZIO, zio);
if (zio->io_vsd != NULL) {
zio->io_vsd_ops->vsd_free(zio);
zio->io_vsd = NULL;
}
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_fault_injection(zio, EIO);
if (zio->io_type == ZIO_TYPE_FREE &&
zio->io_priority != ZIO_PRIORITY_NOW) {
switch (zio->io_error) {
case 0:
ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
ZIO_TRIM_STAT_BUMP(success);
break;
case EOPNOTSUPP:
ZIO_TRIM_STAT_BUMP(unsupported);
break;
default:
ZIO_TRIM_STAT_BUMP(failed);
break;
}
}
/*
* If the I/O failed, determine whether we should attempt to retry it.
*
* On retry, we cut in line in the issue queue, since we don't want
* compression/checksumming/etc. work to prevent our (cheap) IO reissue.
*/
if (zio->io_error && vd == NULL &&
!(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
zio->io_error = 0;
zio->io_flags |= ZIO_FLAG_IO_RETRY |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
zio_requeue_io_start_cut_in_line);
return (ZIO_PIPELINE_STOP);
}
/*
* If we got an error on a leaf device, convert it to ENXIO
* if the device is not accessible at all.
*/
if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
!vdev_accessible(vd, zio))
zio->io_error = SET_ERROR(ENXIO);
/*
* If we can't write to an interior vdev (mirror or RAID-Z),
* set vdev_cant_write so that we stop trying to allocate from it.
*/
if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
vd->vdev_cant_write = B_TRUE;
}
/*
* If a cache flush returns ENOTSUP or ENOTTY, we know that no future
* attempts will ever succeed. In this case we set a persistent bit so
* that we don't bother with it in the future.
*/
if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
zio->io_type == ZIO_TYPE_IOCTL &&
zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
vd->vdev_nowritecache = B_TRUE;
if (zio->io_error)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
zio->io_physdone != NULL) {
ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
zio->io_physdone(zio->io_logical);
}
return (ZIO_PIPELINE_CONTINUE);
}
void
zio_vdev_io_reissue(zio_t *zio)
{
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
ASSERT(zio->io_error == 0);
zio->io_stage >>= 1;
}
void
zio_vdev_io_redone(zio_t *zio)
{
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
zio->io_stage >>= 1;
}
void
zio_vdev_io_bypass(zio_t *zio)
{
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
ASSERT(zio->io_error == 0);
zio->io_flags |= ZIO_FLAG_IO_BYPASS;
zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
/*
* ==========================================================================
* Generate and verify checksums
* ==========================================================================
*/
static int
zio_checksum_generate(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
enum zio_checksum checksum;
if (bp == NULL) {
/*
* This is zio_write_phys().
* We're either generating a label checksum, or none at all.
*/
checksum = zio->io_prop.zp_checksum;
if (checksum == ZIO_CHECKSUM_OFF)
return (ZIO_PIPELINE_CONTINUE);
ASSERT(checksum == ZIO_CHECKSUM_LABEL);
} else {
if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
ASSERT(!IO_IS_ALLOCATING(zio));
checksum = ZIO_CHECKSUM_GANG_HEADER;
} else {
checksum = BP_GET_CHECKSUM(bp);
}
}
zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_checksum_verify(zio_t *zio)
{
zio_bad_cksum_t info;
blkptr_t *bp = zio->io_bp;
int error;
ASSERT(zio->io_vd != NULL);
if (bp == NULL) {
/*
* This is zio_read_phys().
* We're either verifying a label checksum, or nothing at all.
*/
if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
return (ZIO_PIPELINE_CONTINUE);
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
}
if ((error = zio_checksum_error(zio, &info)) != 0) {
zio->io_error = error;
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, zio, zio->io_offset,
zio->io_size, NULL, &info);
}
}
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
void
zio_checksum_verified(zio_t *zio)
{
zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
/*
* ==========================================================================
* Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
* An error of 0 indicates success. ENXIO indicates whole-device failure,
* which may be transient (e.g. unplugged) or permament. ECKSUM and EIO
* indicate errors that are specific to one I/O, and most likely permanent.
* Any other error is presumed to be worse because we weren't expecting it.
* ==========================================================================
*/
int
zio_worst_error(int e1, int e2)
{
static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
int r1, r2;
for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
if (e1 == zio_error_rank[r1])
break;
for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
if (e2 == zio_error_rank[r2])
break;
return (r1 > r2 ? e1 : e2);
}
/*
* ==========================================================================
* I/O completion
* ==========================================================================
*/
static int
zio_ready(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
zio_link_t *zl = NULL;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
ZIO_WAIT_READY)) {
return (ZIO_PIPELINE_STOP);
}
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
zio->io_ready(zio);
}
if (bp != NULL && bp != &zio->io_bp_copy)
zio->io_bp_copy = *bp;
if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
/*
* We were unable to allocate anything, unreserve and
* issue the next I/O to allocate.
*/
metaslab_class_throttle_unreserve(
spa_normal_class(zio->io_spa),
zio->io_prop.zp_copies, zio);
zio_allocate_dispatch(zio->io_spa);
}
}
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_READY] = 1;
pio = zio_walk_parents(zio, &zl);
mutex_exit(&zio->io_lock);
/*
* As we notify zio's parents, new parents could be added.
* New parents go to the head of zio's io_parent_list, however,
* so we will (correctly) not notify them. The remainder of zio's
* io_parent_list, from 'pio_next' onward, cannot change because
* all parents must wait for us to be done before they can be done.
*/
for (; pio != NULL; pio = pio_next) {
pio_next = zio_walk_parents(zio, &zl);
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
if (zio->io_flags & ZIO_FLAG_NODATA) {
if (BP_IS_GANG(bp)) {
zio->io_flags &= ~ZIO_FLAG_NODATA;
} else {
ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
}
}
if (zio_injection_enabled &&
zio->io_spa->spa_syncing_txg == zio->io_txg)
zio_handle_ignored_writes(zio);
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Update the allocation throttle accounting.
*/
static void
zio_dva_throttle_done(zio_t *zio)
{
zio_t *lio = zio->io_logical;
zio_t *pio = zio_unique_parent(zio);
vdev_t *vd = zio->io_vd;
int flags = METASLAB_ASYNC_ALLOC;
ASSERT3P(zio->io_bp, !=, NULL);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
ASSERT(vd != NULL);
ASSERT3P(vd, ==, vd->vdev_top);
ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
/*
* Parents of gang children can have two flavors -- ones that
* allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
* and ones that allocated the constituent blocks. The allocation
* throttle needs to know the allocating parent zio so we must find
* it here.
*/
if (pio->io_child_type == ZIO_CHILD_GANG) {
/*
* If our parent is a rewrite gang child then our grandparent
* would have been the one that performed the allocation.
*/
if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
pio = zio_unique_parent(pio);
flags |= METASLAB_GANG_CHILD;
}
ASSERT(IO_IS_ALLOCATING(pio));
ASSERT3P(zio, !=, zio->io_logical);
ASSERT(zio->io_logical != NULL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
mutex_enter(&pio->io_lock);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
mutex_exit(&pio->io_lock);
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
1, pio);
/*
* Call into the pipeline to see if there is more work that
* needs to be done. If there is work to be done it will be
* dispatched to another taskq thread.
*/
zio_allocate_dispatch(zio->io_spa);
}
static int
zio_done(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_t *lio = zio->io_logical;
blkptr_t *bp = zio->io_bp;
vdev_t *vd = zio->io_vd;
uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
metaslab_class_t *mc = spa_normal_class(spa);
zio_link_t *zl = NULL;
/*
* If our children haven't all completed,
* wait for them and then repeat this pipeline stage.
*/
if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
}
/*
* If the allocation throttle is enabled, then update the accounting.
* We only track child I/Os that are part of an allocating async
* write. We must do this since the allocation is performed
* by the logical I/O but the actual write is done by child I/Os.
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
zio->io_child_type == ZIO_CHILD_VDEV) {
ASSERT(mc->mc_alloc_throttle_enabled);
zio_dva_throttle_done(zio);
}
/*
* If the allocation throttle is enabled, verify that
* we have decremented the refcounts for every I/O that was throttled.
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(bp != NULL);
metaslab_group_alloc_verify(spa, zio->io_bp, zio);
VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
}
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
ASSERT(bp->blk_pad[0] == 0);
ASSERT(bp->blk_pad[1] == 0);
ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
(bp == zio_unique_parent(zio)->io_bp));
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
zio->io_bp_override == NULL &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
ASSERT(!BP_SHOULD_BYTESWAP(bp));
ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
ASSERT(BP_COUNT_GANG(bp) == 0 ||
(BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
}
if (zio->io_flags & ZIO_FLAG_NOPWRITE)
VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
}
/*
* If there were child vdev/gang/ddt errors, they apply to us now.
*/
zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
/*
* If the I/O on the transformed data was successful, generate any
* checksum reports now while we still have the transformed data.
*/
if (zio->io_error == 0) {
while (zio->io_cksum_report != NULL) {
zio_cksum_report_t *zcr = zio->io_cksum_report;
uint64_t align = zcr->zcr_align;
uint64_t asize = P2ROUNDUP(psize, align);
char *abuf = NULL;
abd_t *adata = zio->io_abd;
if (asize != psize) {
adata = abd_alloc_linear(asize, B_TRUE);
abd_copy(adata, zio->io_abd, psize);
abd_zero_off(adata, psize, asize - psize);
}
if (adata != NULL)
abuf = abd_borrow_buf_copy(adata, asize);
zio->io_cksum_report = zcr->zcr_next;
zcr->zcr_next = NULL;
zcr->zcr_finish(zcr, abuf);
zfs_ereport_free_checksum(zcr);
if (adata != NULL)
abd_return_buf(adata, abuf, asize);
if (asize != psize)
abd_free(adata);
}
}
zio_pop_transforms(zio); /* note: may set zio->io_error */
vdev_stat_update(zio, psize);
if (zio->io_error) {
/*
* If this I/O is attached to a particular vdev,
* generate an error message describing the I/O failure
* at the block level. We ignore these errors if the
* device is currently unavailable.
*/
if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
zio == lio) {
/*
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
*/
spa_log_error(spa, zio);
zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
0, 0);
}
}
if (zio->io_error && zio == lio) {
/*
* Determine whether zio should be reexecuted. This will
* propagate all the way to the root via zio_notify_parent().
*/
ASSERT(vd == NULL && bp != NULL);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
if (IO_IS_ALLOCATING(zio) &&
!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
if (zio->io_error != ENOSPC)
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
else
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
}
if ((zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_FREE) &&
!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
zio->io_error == ENXIO &&
spa_load_state(spa) == SPA_LOAD_NONE &&
spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
/*
* Here is a possibly good place to attempt to do
* either combinatorial reconstruction or error correction
* based on checksums. It also might be a good place
* to send out preliminary ereports before we suspend
* processing.
*/
}
/*
* If there were logical child errors, they apply to us now.
* We defer this until now to avoid conflating logical child
* errors with errors that happened to the zio itself when
* updating vdev stats and reporting FMA events above.
*/
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
if ((zio->io_error || zio->io_reexecute) &&
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
!(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
zio_dva_unallocate(zio, zio->io_gang_tree, bp);
zio_gang_tree_free(&zio->io_gang_tree);
/*
* Godfather I/Os should never suspend.
*/
if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
zio->io_reexecute = 0;
if (zio->io_reexecute) {
/*
* This is a logical I/O that wants to reexecute.
*
* Reexecute is top-down. When an i/o fails, if it's not
* the root, it simply notifies its parent and sticks around.
* The parent, seeing that it still has children in zio_done(),
* does the same. This percolates all the way up to the root.
* The root i/o will reexecute or suspend the entire tree.
*
* This approach ensures that zio_reexecute() honors
* all the original i/o dependency relationships, e.g.
* parents not executing until children are ready.
*/
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
zio->io_gang_leader = NULL;
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
/*
* "The Godfather" I/O monitors its children but is
* not a true parent to them. It will track them through
* the pipeline but severs its ties whenever they get into
* trouble (e.g. suspended). This allows "The Godfather"
* I/O to return status without blocking.
*/
zl = NULL;
for (pio = zio_walk_parents(zio, &zl); pio != NULL;
pio = pio_next) {
zio_link_t *remove_zl = zl;
pio_next = zio_walk_parents(zio, &zl);
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
}
if ((pio = zio_unique_parent(zio)) != NULL) {
/*
* We're not a root i/o, so there's nothing to do
* but notify our parent. Don't propagate errors
* upward since we haven't permanently failed yet.
*/
ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
/*
* We'd fail again if we reexecuted now, so suspend
* until conditions improve (e.g. device comes online).
*/
zio_suspend(spa, zio);
} else {
/*
* Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq.
*/
#if defined(illumos) || !defined(_KERNEL)
ASSERT(zio->io_tqent.tqent_next == NULL);
#else
ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
0, &zio->io_tqent);
}
return (ZIO_PIPELINE_STOP);
}
ASSERT(zio->io_child_count == 0);
ASSERT(zio->io_reexecute == 0);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
/*
* Report any checksum errors, since the I/O is complete.
*/
while (zio->io_cksum_report != NULL) {
zio_cksum_report_t *zcr = zio->io_cksum_report;
zio->io_cksum_report = zcr->zcr_next;
zcr->zcr_next = NULL;
zcr->zcr_finish(zcr, NULL);
zfs_ereport_free_checksum(zcr);
}
/*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as
* such, cannot acquire any new parents.
*/
if (zio->io_done)
zio->io_done(zio);
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
zl = NULL;
for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
zio_link_t *remove_zl = zl;
pio_next = zio_walk_parents(zio, &zl);
zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
if (zio->io_waiter != NULL) {
mutex_enter(&zio->io_lock);
zio->io_executor = NULL;
cv_broadcast(&zio->io_cv);
mutex_exit(&zio->io_lock);
} else {
zio_destroy(zio);
}
return (ZIO_PIPELINE_STOP);
}
/*
* ==========================================================================
* I/O pipeline definition
* ==========================================================================
*/
static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
zio_read_bp_init,
zio_write_bp_init,
zio_free_bp_init,
zio_issue_async,
zio_write_compress,
zio_checksum_generate,
zio_nop_write,
zio_ddt_read_start,
zio_ddt_read_done,
zio_ddt_write,
zio_ddt_free,
zio_gang_assemble,
zio_gang_issue,
zio_dva_throttle,
zio_dva_allocate,
zio_dva_free,
zio_dva_claim,
zio_ready,
zio_vdev_io_start,
zio_vdev_io_done,
zio_vdev_io_assess,
zio_checksum_verify,
zio_done
};
/*
* Compare two zbookmark_phys_t's to see which we would reach first in a
* pre-order traversal of the object tree.
*
* This is simple in every case aside from the meta-dnode object. For all other
* objects, we traverse them in order (object 1 before object 2, and so on).
* However, all of these objects are traversed while traversing object 0, since
* the data it points to is the list of objects. Thus, we need to convert to a
* canonical representation so we can compare meta-dnode bookmarks to
* non-meta-dnode bookmarks.
*
* We do this by calculating "equivalents" for each field of the zbookmark.
* zbookmarks outside of the meta-dnode use their own object and level, and
* calculate the level 0 equivalent (the first L0 blkid that is contained in the
* blocks this bookmark refers to) by multiplying their blkid by their span
* (the number of L0 blocks contained within one block at their level).
* zbookmarks inside the meta-dnode calculate their object equivalent
* (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
* level + 1<<31 (any value larger than a level could ever be) for their level.
* This causes them to always compare before a bookmark in their object
* equivalent, compare appropriately to bookmarks in other objects, and to
* compare appropriately to other bookmarks in the meta-dnode.
*/
int
zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
{
/*
* These variables represent the "equivalent" values for the zbookmark,
* after converting zbookmarks inside the meta dnode to their
* normal-object equivalents.
*/
uint64_t zb1obj, zb2obj;
uint64_t zb1L0, zb2L0;
uint64_t zb1level, zb2level;
if (zb1->zb_object == zb2->zb_object &&
zb1->zb_level == zb2->zb_level &&
zb1->zb_blkid == zb2->zb_blkid)
return (0);
/*
* BP_SPANB calculates the span in blocks.
*/
zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
zb1L0 = 0;
zb1level = zb1->zb_level + COMPARE_META_LEVEL;
} else {
zb1obj = zb1->zb_object;
zb1level = zb1->zb_level;
}
if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
zb2L0 = 0;
zb2level = zb2->zb_level + COMPARE_META_LEVEL;
} else {
zb2obj = zb2->zb_object;
zb2level = zb2->zb_level;
}
/* Now that we have a canonical representation, do the comparison. */
if (zb1obj != zb2obj)
return (zb1obj < zb2obj ? -1 : 1);
else if (zb1L0 != zb2L0)
return (zb1L0 < zb2L0 ? -1 : 1);
else if (zb1level != zb2level)
return (zb1level > zb2level ? -1 : 1);
/*
* This can (theoretically) happen if the bookmarks have the same object
* and level, but different blkids, if the block sizes are not the same.
* There is presently no way to change the indirect block sizes
*/
return (0);
}
/*
* This function checks the following: given that last_block is the place that
* our traversal stopped last time, does that guarantee that we've visited
* every node under subtree_root? Therefore, we can't just use the raw output
* of zbookmark_compare. We have to pass in a modified version of
* subtree_root; by incrementing the block id, and then checking whether
* last_block is before or equal to that, we can tell whether or not having
* visited last_block implies that all of subtree_root's children have been
* visited.
*/
boolean_t
zbookmark_subtree_completed(const dnode_phys_t *dnp,
const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
{
zbookmark_phys_t mod_zb = *subtree_root;
mod_zb.zb_blkid++;
ASSERT(last_block->zb_level == 0);
/* The objset_phys_t isn't before anything. */
if (dnp == NULL)
return (B_FALSE);
/*
* We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
* data block size in sectors, because that variable is only used if
* the bookmark refers to a block in the meta-dnode. Since we don't
* know without examining it what object it refers to, and there's no
* harm in passing in this value in other cases, we always pass it in.
*
* We pass in 0 for the indirect block size shift because zb2 must be
* level 0. The indirect block size is only used to calculate the span
* of the bookmark, but since the bookmark must be level 0, the span is
* always 1, so the math works out.
*
* If you make changes to how the zbookmark_compare code works, be sure
* to make sure that this code still works afterwards.
*/
return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
last_block) <= 0);
}
Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h (revision 332524)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h (revision 332525)
@@ -1,1066 +1,1096 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017 Datto Inc.
*/
/* Portions Copyright 2010 Robert Milkowski */
#ifndef _SYS_FS_ZFS_H
#define _SYS_FS_ZFS_H
#include <sys/types.h>
#include <sys/ioccom.h>
#include <sys/time.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Types and constants shared between userland and the kernel.
*/
/*
* Each dataset can be one of the following types. These constants can be
* combined into masks that can be passed to various functions.
*/
typedef enum {
ZFS_TYPE_FILESYSTEM = (1 << 0),
ZFS_TYPE_SNAPSHOT = (1 << 1),
ZFS_TYPE_VOLUME = (1 << 2),
ZFS_TYPE_POOL = (1 << 3),
ZFS_TYPE_BOOKMARK = (1 << 4)
} zfs_type_t;
/*
* NB: lzc_dataset_type should be updated whenever a new objset type is added,
* if it represents a real type of a dataset that can be created from userland.
*/
typedef enum dmu_objset_type {
DMU_OST_NONE,
DMU_OST_META,
DMU_OST_ZFS,
DMU_OST_ZVOL,
DMU_OST_OTHER, /* For testing only! */
DMU_OST_ANY, /* Be careful! */
DMU_OST_NUMTYPES
} dmu_objset_type_t;
#define ZFS_TYPE_DATASET \
(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
/*
* All of these include the terminating NUL byte.
*/
#define ZAP_MAXNAMELEN 256
#define ZAP_MAXVALUELEN (1024 * 8)
#define ZAP_OLDMAXVALUELEN 1024
#define ZFS_MAX_DATASET_NAME_LEN 256
/*
* Dataset properties are identified by these constants and must be added to
* the end of this list to ensure that external consumers are not affected
* by the change. If you make any changes to this list, be sure to update
* the property table in usr/src/common/zfs/zfs_prop.c.
*/
typedef enum {
ZPROP_CONT = -2,
ZPROP_INVAL = -1,
ZFS_PROP_TYPE = 0,
ZFS_PROP_CREATION,
ZFS_PROP_USED,
ZFS_PROP_AVAILABLE,
ZFS_PROP_REFERENCED,
ZFS_PROP_COMPRESSRATIO,
ZFS_PROP_MOUNTED,
ZFS_PROP_ORIGIN,
ZFS_PROP_QUOTA,
ZFS_PROP_RESERVATION,
ZFS_PROP_VOLSIZE,
ZFS_PROP_VOLBLOCKSIZE,
ZFS_PROP_RECORDSIZE,
ZFS_PROP_MOUNTPOINT,
ZFS_PROP_SHARENFS,
ZFS_PROP_CHECKSUM,
ZFS_PROP_COMPRESSION,
ZFS_PROP_ATIME,
ZFS_PROP_DEVICES,
ZFS_PROP_EXEC,
ZFS_PROP_SETUID,
ZFS_PROP_READONLY,
ZFS_PROP_ZONED,
ZFS_PROP_SNAPDIR,
ZFS_PROP_ACLMODE,
ZFS_PROP_ACLINHERIT,
ZFS_PROP_CREATETXG, /* not exposed to the user */
ZFS_PROP_NAME, /* not exposed to the user */
ZFS_PROP_CANMOUNT,
ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
ZFS_PROP_XATTR,
ZFS_PROP_NUMCLONES, /* not exposed to the user */
ZFS_PROP_COPIES,
ZFS_PROP_VERSION,
ZFS_PROP_UTF8ONLY,
ZFS_PROP_NORMALIZE,
ZFS_PROP_CASE,
ZFS_PROP_VSCAN,
ZFS_PROP_NBMAND,
ZFS_PROP_SHARESMB,
ZFS_PROP_REFQUOTA,
ZFS_PROP_REFRESERVATION,
ZFS_PROP_GUID,
ZFS_PROP_PRIMARYCACHE,
ZFS_PROP_SECONDARYCACHE,
ZFS_PROP_USEDSNAP,
ZFS_PROP_USEDDS,
ZFS_PROP_USEDCHILD,
ZFS_PROP_USEDREFRESERV,
ZFS_PROP_USERACCOUNTING, /* not exposed to the user */
ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */
ZFS_PROP_DEFER_DESTROY,
ZFS_PROP_USERREFS,
ZFS_PROP_LOGBIAS,
ZFS_PROP_UNIQUE, /* not exposed to the user */
ZFS_PROP_OBJSETID, /* not exposed to the user */
ZFS_PROP_DEDUP,
ZFS_PROP_MLSLABEL,
ZFS_PROP_SYNC,
ZFS_PROP_REFRATIO,
ZFS_PROP_WRITTEN,
ZFS_PROP_CLONES,
ZFS_PROP_LOGICALUSED,
ZFS_PROP_LOGICALREFERENCED,
ZFS_PROP_INCONSISTENT, /* not exposed to the user */
ZFS_PROP_VOLMODE,
ZFS_PROP_FILESYSTEM_LIMIT,
ZFS_PROP_SNAPSHOT_LIMIT,
ZFS_PROP_FILESYSTEM_COUNT,
ZFS_PROP_SNAPSHOT_COUNT,
ZFS_PROP_REDUNDANT_METADATA,
ZFS_PROP_PREV_SNAP,
ZFS_PROP_RECEIVE_RESUME_TOKEN,
+ ZFS_PROP_REMAPTXG, /* not exposed to the user */
ZFS_NUM_PROPS
} zfs_prop_t;
typedef enum {
ZFS_PROP_USERUSED,
ZFS_PROP_USERQUOTA,
ZFS_PROP_GROUPUSED,
ZFS_PROP_GROUPQUOTA,
ZFS_NUM_USERQUOTA_PROPS
} zfs_userquota_prop_t;
extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
/*
* Pool properties are identified by these constants and must be added to the
* end of this list to ensure that external consumers are not affected
* by the change. If you make any changes to this list, be sure to update
* the property table in usr/src/common/zfs/zpool_prop.c.
*/
typedef enum {
ZPOOL_PROP_INVAL = -1,
ZPOOL_PROP_NAME,
ZPOOL_PROP_SIZE,
ZPOOL_PROP_CAPACITY,
ZPOOL_PROP_ALTROOT,
ZPOOL_PROP_HEALTH,
ZPOOL_PROP_GUID,
ZPOOL_PROP_VERSION,
ZPOOL_PROP_BOOTFS,
ZPOOL_PROP_DELEGATION,
ZPOOL_PROP_AUTOREPLACE,
ZPOOL_PROP_CACHEFILE,
ZPOOL_PROP_FAILUREMODE,
ZPOOL_PROP_LISTSNAPS,
ZPOOL_PROP_AUTOEXPAND,
ZPOOL_PROP_DEDUPDITTO,
ZPOOL_PROP_DEDUPRATIO,
ZPOOL_PROP_FREE,
ZPOOL_PROP_ALLOCATED,
ZPOOL_PROP_READONLY,
ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING,
ZPOOL_PROP_FRAGMENTATION,
ZPOOL_PROP_LEAKED,
ZPOOL_PROP_MAXBLOCKSIZE,
ZPOOL_PROP_BOOTSIZE,
ZPOOL_NUM_PROPS
} zpool_prop_t;
/* Small enough to not hog a whole line of printout in zpool(1M). */
#define ZPROP_MAX_COMMENT 32
#define ZPROP_VALUE "value"
#define ZPROP_SOURCE "source"
typedef enum {
ZPROP_SRC_NONE = 0x1,
ZPROP_SRC_DEFAULT = 0x2,
ZPROP_SRC_TEMPORARY = 0x4,
ZPROP_SRC_LOCAL = 0x8,
ZPROP_SRC_INHERITED = 0x10,
ZPROP_SRC_RECEIVED = 0x20
} zprop_source_t;
#define ZPROP_SRC_ALL 0x3f
#define ZPROP_SOURCE_VAL_RECVD "$recvd"
#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS"
/*
* Dataset flag implemented as a special entry in the props zap object
* indicating that the dataset has received properties on or after
* SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
* just as it did in earlier versions, and thereafter, local properties are
* preserved.
*/
#define ZPROP_HAS_RECVD "$hasrecvd"
typedef enum {
ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
} zprop_errflags_t;
typedef int (*zprop_func)(int, void *);
/*
* Properties to be set on the root file system of a new pool
* are stuffed into their own nvlist, which is then included in
* the properties nvlist with the pool properties.
*/
#define ZPOOL_ROOTFS_PROPS "root-props-nvl"
/*
* Length of 'written@' and 'written#'
*/
#define ZFS_WRITTEN_PROP_PREFIX_LEN 8
/*
* Dataset property functions shared between libzfs and kernel.
*/
const char *zfs_prop_default_string(zfs_prop_t);
uint64_t zfs_prop_default_numeric(zfs_prop_t);
boolean_t zfs_prop_readonly(zfs_prop_t);
boolean_t zfs_prop_visible(zfs_prop_t prop);
boolean_t zfs_prop_inheritable(zfs_prop_t);
boolean_t zfs_prop_setonce(zfs_prop_t);
const char *zfs_prop_to_name(zfs_prop_t);
zfs_prop_t zfs_name_to_prop(const char *);
boolean_t zfs_prop_user(const char *);
boolean_t zfs_prop_userquota(const char *);
int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
/*
* Pool property functions shared between libzfs and kernel.
*/
zpool_prop_t zpool_name_to_prop(const char *);
const char *zpool_prop_to_name(zpool_prop_t);
const char *zpool_prop_default_string(zpool_prop_t);
uint64_t zpool_prop_default_numeric(zpool_prop_t);
boolean_t zpool_prop_readonly(zpool_prop_t);
boolean_t zpool_prop_feature(const char *);
boolean_t zpool_prop_unsupported(const char *name);
int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
/*
* Definitions for the Delegation.
*/
typedef enum {
ZFS_DELEG_WHO_UNKNOWN = 0,
ZFS_DELEG_USER = 'u',
ZFS_DELEG_USER_SETS = 'U',
ZFS_DELEG_GROUP = 'g',
ZFS_DELEG_GROUP_SETS = 'G',
ZFS_DELEG_EVERYONE = 'e',
ZFS_DELEG_EVERYONE_SETS = 'E',
ZFS_DELEG_CREATE = 'c',
ZFS_DELEG_CREATE_SETS = 'C',
ZFS_DELEG_NAMED_SET = 's',
ZFS_DELEG_NAMED_SET_SETS = 'S'
} zfs_deleg_who_type_t;
typedef enum {
ZFS_DELEG_NONE = 0,
ZFS_DELEG_PERM_LOCAL = 1,
ZFS_DELEG_PERM_DESCENDENT = 2,
ZFS_DELEG_PERM_LOCALDESCENDENT = 3,
ZFS_DELEG_PERM_CREATE = 4
} zfs_deleg_inherit_t;
#define ZFS_DELEG_PERM_UID "uid"
#define ZFS_DELEG_PERM_GID "gid"
#define ZFS_DELEG_PERM_GROUPS "groups"
#define ZFS_MLSLABEL_DEFAULT "none"
#define ZFS_SMB_ACL_SRC "src"
#define ZFS_SMB_ACL_TARGET "target"
typedef enum {
ZFS_CANMOUNT_OFF = 0,
ZFS_CANMOUNT_ON = 1,
ZFS_CANMOUNT_NOAUTO = 2
} zfs_canmount_type_t;
typedef enum {
ZFS_LOGBIAS_LATENCY = 0,
ZFS_LOGBIAS_THROUGHPUT = 1
} zfs_logbias_op_t;
typedef enum zfs_share_op {
ZFS_SHARE_NFS = 0,
ZFS_UNSHARE_NFS = 1,
ZFS_SHARE_SMB = 2,
ZFS_UNSHARE_SMB = 3
} zfs_share_op_t;
typedef enum zfs_smb_acl_op {
ZFS_SMB_ACL_ADD,
ZFS_SMB_ACL_REMOVE,
ZFS_SMB_ACL_RENAME,
ZFS_SMB_ACL_PURGE
} zfs_smb_acl_op_t;
typedef enum zfs_cache_type {
ZFS_CACHE_NONE = 0,
ZFS_CACHE_METADATA = 1,
ZFS_CACHE_ALL = 2
} zfs_cache_type_t;
typedef enum {
ZFS_SYNC_STANDARD = 0,
ZFS_SYNC_ALWAYS = 1,
ZFS_SYNC_DISABLED = 2
} zfs_sync_type_t;
typedef enum {
ZFS_VOLMODE_DEFAULT = 0,
ZFS_VOLMODE_GEOM = 1,
ZFS_VOLMODE_DEV = 2,
ZFS_VOLMODE_NONE = 3
} zfs_volmode_t;
typedef enum {
ZFS_REDUNDANT_METADATA_ALL,
ZFS_REDUNDANT_METADATA_MOST
} zfs_redundant_metadata_type_t;
/*
* On-disk version number.
*/
#define SPA_VERSION_1 1ULL
#define SPA_VERSION_2 2ULL
#define SPA_VERSION_3 3ULL
#define SPA_VERSION_4 4ULL
#define SPA_VERSION_5 5ULL
#define SPA_VERSION_6 6ULL
#define SPA_VERSION_7 7ULL
#define SPA_VERSION_8 8ULL
#define SPA_VERSION_9 9ULL
#define SPA_VERSION_10 10ULL
#define SPA_VERSION_11 11ULL
#define SPA_VERSION_12 12ULL
#define SPA_VERSION_13 13ULL
#define SPA_VERSION_14 14ULL
#define SPA_VERSION_15 15ULL
#define SPA_VERSION_16 16ULL
#define SPA_VERSION_17 17ULL
#define SPA_VERSION_18 18ULL
#define SPA_VERSION_19 19ULL
#define SPA_VERSION_20 20ULL
#define SPA_VERSION_21 21ULL
#define SPA_VERSION_22 22ULL
#define SPA_VERSION_23 23ULL
#define SPA_VERSION_24 24ULL
#define SPA_VERSION_25 25ULL
#define SPA_VERSION_26 26ULL
#define SPA_VERSION_27 27ULL
#define SPA_VERSION_28 28ULL
#define SPA_VERSION_5000 5000ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
#define SPA_VERSION SPA_VERSION_5000
#define SPA_VERSION_STRING "5000"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
* Used in the code when checking for presence or absence of a feature.
* Feel free to define multiple symbolic names for each version if there
* were multiple changes to on-disk structures during that version.
*
* NOTE: When checking the current SPA_VERSION in your code, be sure
* to use spa_version() since it reports the version of the
* last synced uberblock. Checking the in-flight version can
* be dangerous in some cases.
*/
#define SPA_VERSION_INITIAL SPA_VERSION_1
#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
#define SPA_VERSION_SPARES SPA_VERSION_3
#define SPA_VERSION_RAIDZ2 SPA_VERSION_3
#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3
#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5
#define SPA_VERSION_BOOTFS SPA_VERSION_6
#define SPA_VERSION_SLOGS SPA_VERSION_7
#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8
#define SPA_VERSION_FUID SPA_VERSION_9
#define SPA_VERSION_REFRESERVATION SPA_VERSION_9
#define SPA_VERSION_REFQUOTA SPA_VERSION_9
#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9
#define SPA_VERSION_L2CACHE SPA_VERSION_10
#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11
#define SPA_VERSION_ORIGIN SPA_VERSION_11
#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11
#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12
#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
#define SPA_VERSION_USERSPACE SPA_VERSION_15
#define SPA_VERSION_STMF_PROP SPA_VERSION_16
#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
#define SPA_VERSION_USERREFS SPA_VERSION_18
#define SPA_VERSION_HOLES SPA_VERSION_19
#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
#define SPA_VERSION_DEDUP SPA_VERSION_21
#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
#define SPA_VERSION_SA SPA_VERSION_24
#define SPA_VERSION_SCAN SPA_VERSION_25
#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
#define SPA_VERSION_DEADLISTS SPA_VERSION_26
#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28
#define SPA_VERSION_FEATURES SPA_VERSION_5000
#define SPA_VERSION_IS_SUPPORTED(v) \
(((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
* occurs. This is independent of SPA/DMU/ZAP versioning. You must
* also update the version_table[] and help message in zfs_prop.c.
*
* When changing, be sure to teach GRUB how to read the new format!
* See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*}
*/
#define ZPL_VERSION_1 1ULL
#define ZPL_VERSION_2 2ULL
#define ZPL_VERSION_3 3ULL
#define ZPL_VERSION_4 4ULL
#define ZPL_VERSION_5 5ULL
#define ZPL_VERSION ZPL_VERSION_5
#define ZPL_VERSION_STRING "5"
#define ZPL_VERSION_INITIAL ZPL_VERSION_1
#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
#define ZPL_VERSION_FUID ZPL_VERSION_3
#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
#define ZPL_VERSION_SYSATTR ZPL_VERSION_3
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
#define ZPL_VERSION_SA ZPL_VERSION_5
/* Rewind request information */
#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */
#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */
#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */
#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */
#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */
typedef struct zpool_rewind_policy {
uint32_t zrp_request; /* rewind behavior requested */
uint64_t zrp_maxmeta; /* max acceptable meta-data errors */
uint64_t zrp_maxdata; /* max acceptable data errors */
uint64_t zrp_txg; /* specific txg to load */
} zpool_rewind_policy_t;
/*
* The following are configuration names used in the nvlist describing a pool's
- * configuration.
+ * configuration. New on-disk names should be prefixed with "<reverse-DNS>:"
+ * (e.g. "org.open-zfs:") to avoid conflicting names being developed
+ * independently.
*/
#define ZPOOL_CONFIG_VERSION "version"
#define ZPOOL_CONFIG_POOL_NAME "name"
#define ZPOOL_CONFIG_POOL_STATE "state"
#define ZPOOL_CONFIG_POOL_TXG "txg"
#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
#define ZPOOL_CONFIG_TOP_GUID "top_guid"
#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
#define ZPOOL_CONFIG_TYPE "type"
#define ZPOOL_CONFIG_CHILDREN "children"
#define ZPOOL_CONFIG_ID "id"
#define ZPOOL_CONFIG_GUID "guid"
+#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object"
+#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births"
+#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev"
#define ZPOOL_CONFIG_PATH "path"
#define ZPOOL_CONFIG_DEVID "devid"
#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
#define ZPOOL_CONFIG_ASHIFT "ashift"
#define ZPOOL_CONFIG_ASIZE "asize"
#define ZPOOL_CONFIG_DTL "DTL"
#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
#define ZPOOL_CONFIG_SPARES "spares"
#define ZPOOL_CONFIG_IS_SPARE "is_spare"
#define ZPOOL_CONFIG_NPARITY "nparity"
#define ZPOOL_CONFIG_HOSTID "hostid"
#define ZPOOL_CONFIG_HOSTNAME "hostname"
#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
#define ZPOOL_CONFIG_UNSPARE "unspare"
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
#define ZPOOL_CONFIG_IS_LOG "is_log"
#define ZPOOL_CONFIG_L2CACHE "l2cache"
#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
#define ZPOOL_CONFIG_IS_HOLE "is_hole"
#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram"
#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats"
#define ZPOOL_CONFIG_DDT_STATS "ddt_stats"
#define ZPOOL_CONFIG_SPLIT "splitcfg"
#define ZPOOL_CONFIG_ORIG_GUID "orig_guid"
#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
#define ZPOOL_CONFIG_REMOVING "removing"
#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
#define ZPOOL_CONFIG_COMMENT "comment"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */
#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */
#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */
#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */
#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */
#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */
#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */
#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
/*
* The persistent vdev state is stored as separate values rather than a single
* 'vdev_state' entry. This is because a device can be in multiple states, such
* as offline and degraded.
*/
#define ZPOOL_CONFIG_OFFLINE "offline"
#define ZPOOL_CONFIG_FAULTED "faulted"
#define ZPOOL_CONFIG_DEGRADED "degraded"
#define ZPOOL_CONFIG_REMOVED "removed"
#define ZPOOL_CONFIG_FRU "fru"
#define ZPOOL_CONFIG_AUX_STATE "aux_state"
/* Rewind policy parameters */
#define ZPOOL_REWIND_POLICY "rewind-policy"
#define ZPOOL_REWIND_REQUEST "rewind-request"
#define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg"
#define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh"
#define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh"
/* Rewind data discovered */
#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts"
#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors"
#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind"
#define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror"
#define VDEV_TYPE_REPLACING "replacing"
#define VDEV_TYPE_RAIDZ "raidz"
#define VDEV_TYPE_DISK "disk"
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
#define VDEV_TYPE_HOLE "hole"
#define VDEV_TYPE_SPARE "spare"
#define VDEV_TYPE_LOG "log"
#define VDEV_TYPE_L2CACHE "l2cache"
+#define VDEV_TYPE_INDIRECT "indirect"
+/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */
+#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \
+ "com.delphix:indirect_obsolete_sm"
+#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
+ "com.delphix:obsolete_counts_are_precise"
+
/*
* This is needed in userland to report the minimum necessary device size.
*
* Note that the zfs test suite uses 64MB vdevs.
*/
#define SPA_MINDEVSIZE (64ULL << 20)
/*
* Set if the fragmentation has not yet been calculated. This can happen
* because the space maps have not been upgraded or the histogram feature
* is not enabled.
*/
#define ZFS_FRAG_INVALID UINT64_MAX
/*
* The location of the pool configuration repository, shared between kernel and
* userland.
*/
#define ZPOOL_CACHE "/boot/zfs/zpool.cache"
/*
* vdev states are ordered from least to most healthy.
* A vdev that's CANT_OPEN or below is considered unusable.
*/
typedef enum vdev_state {
VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
VDEV_STATE_CLOSED, /* Not currently open */
VDEV_STATE_OFFLINE, /* Not allowed to open */
VDEV_STATE_REMOVED, /* Explicitly removed from system */
VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
VDEV_STATE_FAULTED, /* External request to fault device */
VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
VDEV_STATE_HEALTHY /* Presumed good */
} vdev_state_t;
#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY
/*
* vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
* of the vdev stats structure uses these constants to distinguish why.
*/
typedef enum vdev_aux {
VDEV_AUX_NONE, /* no error */
VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
VDEV_AUX_TOO_SMALL, /* vdev size is too small */
VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
VDEV_AUX_UNSUP_FEAT, /* unsupported features */
VDEV_AUX_SPARED, /* hot spare used in another pool */
VDEV_AUX_ERR_EXCEEDED, /* too many errors */
VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
VDEV_AUX_EXTERNAL, /* external diagnosis */
VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */
VDEV_AUX_ASHIFT_TOO_BIG /* vdev's min block size is too large */
} vdev_aux_t;
/*
* pool state. The following states are written to disk as part of the normal
* SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining
* states are software abstractions used at various levels to communicate
* pool state.
*/
typedef enum pool_state {
POOL_STATE_ACTIVE = 0, /* In active use */
POOL_STATE_EXPORTED, /* Explicitly exported */
POOL_STATE_DESTROYED, /* Explicitly destroyed */
POOL_STATE_SPARE, /* Reserved for hot spare use */
POOL_STATE_L2CACHE, /* Level 2 ARC device */
POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
POOL_STATE_UNAVAIL, /* Internal libzfs state */
POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
} pool_state_t;
/*
* Scan Functions.
*/
typedef enum pool_scan_func {
POOL_SCAN_NONE,
POOL_SCAN_SCRUB,
POOL_SCAN_RESILVER,
POOL_SCAN_FUNCS
} pool_scan_func_t;
/*
* Used to control scrub pause and resume.
*/
typedef enum pool_scrub_cmd {
POOL_SCRUB_NORMAL = 0,
POOL_SCRUB_PAUSE,
POOL_SCRUB_FLAGS_END
} pool_scrub_cmd_t;
/*
* ZIO types. Needed to interpret vdev statistics below.
*/
typedef enum zio_type {
ZIO_TYPE_NULL = 0,
ZIO_TYPE_READ,
ZIO_TYPE_WRITE,
ZIO_TYPE_FREE,
ZIO_TYPE_CLAIM,
ZIO_TYPE_IOCTL,
ZIO_TYPES
} zio_type_t;
/*
* Pool statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
typedef struct pool_scan_stat {
/* values stored on disk */
uint64_t pss_func; /* pool_scan_func_t */
uint64_t pss_state; /* dsl_scan_state_t */
uint64_t pss_start_time; /* scan start time */
uint64_t pss_end_time; /* scan end time */
uint64_t pss_to_examine; /* total bytes to scan */
uint64_t pss_examined; /* total examined bytes */
uint64_t pss_to_process; /* total bytes to process */
uint64_t pss_processed; /* total processed bytes */
uint64_t pss_errors; /* scan errors */
/* values not stored on disk */
uint64_t pss_pass_exam; /* examined bytes per scan pass */
uint64_t pss_pass_start; /* start time of a scan pass */
uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */
/* cumulative time scrub spent paused, needed for rate calculation */
uint64_t pss_pass_scrub_spent_paused;
} pool_scan_stat_t;
+typedef struct pool_removal_stat {
+ uint64_t prs_state; /* dsl_scan_state_t */
+ uint64_t prs_removing_vdev;
+ uint64_t prs_start_time;
+ uint64_t prs_end_time;
+ uint64_t prs_to_copy; /* bytes that need to be copied */
+ uint64_t prs_copied; /* bytes copied so far */
+ /*
+ * bytes of memory used for indirect mappings.
+ * This includes all removed vdevs.
+ */
+ uint64_t prs_mapping_memory;
+} pool_removal_stat_t;
+
typedef enum dsl_scan_state {
DSS_NONE,
DSS_SCANNING,
DSS_FINISHED,
DSS_CANCELED,
DSS_NUM_STATES
} dsl_scan_state_t;
/*
* Vdev statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
typedef struct vdev_stat {
hrtime_t vs_timestamp; /* time since vdev load */
uint64_t vs_state; /* vdev state */
uint64_t vs_aux; /* see vdev_aux_t */
uint64_t vs_alloc; /* space allocated */
uint64_t vs_space; /* total capacity */
uint64_t vs_dspace; /* deflated capacity */
uint64_t vs_rsize; /* replaceable dev size */
uint64_t vs_esize; /* expandable dev size */
uint64_t vs_ops[ZIO_TYPES]; /* operation count */
uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
uint64_t vs_read_errors; /* read errors */
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_configured_ashift; /* TLV vdev_ashift */
uint64_t vs_logical_ashift; /* vdev_logical_ashift */
uint64_t vs_physical_ashift; /* vdev_physical_ashift */
uint64_t vs_fragmentation; /* device fragmentation */
} vdev_stat_t;
#define VDEV_STAT_VALID(field, uint64_t_field_count) \
((uint64_t_field_count * sizeof(uint64_t)) >= \
(offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))
/*
* DDT statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
typedef struct ddt_object {
uint64_t ddo_count; /* number of elments in ddt */
uint64_t ddo_dspace; /* size of ddt on disk */
uint64_t ddo_mspace; /* size of ddt in-core */
} ddt_object_t;
typedef struct ddt_stat {
uint64_t dds_blocks; /* blocks */
uint64_t dds_lsize; /* logical size */
uint64_t dds_psize; /* physical size */
uint64_t dds_dsize; /* deflated allocated size */
uint64_t dds_ref_blocks; /* referenced blocks */
uint64_t dds_ref_lsize; /* referenced lsize * refcnt */
uint64_t dds_ref_psize; /* referenced psize * refcnt */
uint64_t dds_ref_dsize; /* referenced dsize * refcnt */
} ddt_stat_t;
typedef struct ddt_histogram {
ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */
} ddt_histogram_t;
#define ZVOL_DRIVER "zvol"
#define ZFS_DRIVER "zfs"
#define ZFS_DEV_NAME "zfs"
#define ZFS_DEV "/dev/" ZFS_DEV_NAME
#define ZFS_DISK_ROOT "/dev/dsk"
#define ZFS_DISK_ROOTD ZFS_DISK_ROOT "/"
#define ZFS_RDISK_ROOT "/dev/rdsk"
#define ZFS_RDISK_ROOTD ZFS_RDISK_ROOT "/"
/* general zvol path */
#define ZVOL_DIR "/dev/zvol"
/* expansion */
#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:"
/* for dump and swap */
#define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/"
#define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/"
#define ZVOL_PROP_NAME "name"
#define ZVOL_DEFAULT_BLOCKSIZE 8192
/*
* /dev/zfs ioctl numbers.
*/
typedef enum zfs_ioc {
ZFS_IOC_FIRST = 0,
ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST,
ZFS_IOC_POOL_DESTROY,
ZFS_IOC_POOL_IMPORT,
ZFS_IOC_POOL_EXPORT,
ZFS_IOC_POOL_CONFIGS,
ZFS_IOC_POOL_STATS,
ZFS_IOC_POOL_TRYIMPORT,
ZFS_IOC_POOL_SCAN,
ZFS_IOC_POOL_FREEZE,
ZFS_IOC_POOL_UPGRADE,
ZFS_IOC_POOL_GET_HISTORY,
ZFS_IOC_VDEV_ADD,
ZFS_IOC_VDEV_REMOVE,
ZFS_IOC_VDEV_SET_STATE,
ZFS_IOC_VDEV_ATTACH,
ZFS_IOC_VDEV_DETACH,
ZFS_IOC_VDEV_SETPATH,
ZFS_IOC_VDEV_SETFRU,
ZFS_IOC_OBJSET_STATS,
ZFS_IOC_OBJSET_ZPLPROPS,
ZFS_IOC_DATASET_LIST_NEXT,
ZFS_IOC_SNAPSHOT_LIST_NEXT,
ZFS_IOC_SET_PROP,
ZFS_IOC_CREATE,
ZFS_IOC_DESTROY,
ZFS_IOC_ROLLBACK,
ZFS_IOC_RENAME,
ZFS_IOC_RECV,
ZFS_IOC_SEND,
ZFS_IOC_INJECT_FAULT,
ZFS_IOC_CLEAR_FAULT,
ZFS_IOC_INJECT_LIST_NEXT,
ZFS_IOC_ERROR_LOG,
ZFS_IOC_CLEAR,
ZFS_IOC_PROMOTE,
ZFS_IOC_DESTROY_SNAPS,
ZFS_IOC_SNAPSHOT,
ZFS_IOC_DSOBJ_TO_DSNAME,
ZFS_IOC_OBJ_TO_PATH,
ZFS_IOC_POOL_SET_PROPS,
ZFS_IOC_POOL_GET_PROPS,
ZFS_IOC_SET_FSACL,
ZFS_IOC_GET_FSACL,
ZFS_IOC_SHARE,
ZFS_IOC_INHERIT_PROP,
ZFS_IOC_SMB_ACL,
ZFS_IOC_USERSPACE_ONE,
ZFS_IOC_USERSPACE_MANY,
ZFS_IOC_USERSPACE_UPGRADE,
ZFS_IOC_HOLD,
ZFS_IOC_RELEASE,
ZFS_IOC_GET_HOLDS,
ZFS_IOC_OBJSET_RECVD_PROPS,
ZFS_IOC_VDEV_SPLIT,
ZFS_IOC_NEXT_OBJ,
ZFS_IOC_DIFF,
ZFS_IOC_TMP_SNAPSHOT,
ZFS_IOC_OBJ_TO_STATS,
ZFS_IOC_JAIL,
ZFS_IOC_UNJAIL,
ZFS_IOC_POOL_REGUID,
ZFS_IOC_SPACE_WRITTEN,
ZFS_IOC_SPACE_SNAPS,
ZFS_IOC_SEND_PROGRESS,
ZFS_IOC_POOL_REOPEN,
ZFS_IOC_LOG_HISTORY,
ZFS_IOC_SEND_NEW,
ZFS_IOC_SEND_SPACE,
ZFS_IOC_CLONE,
ZFS_IOC_BOOKMARK,
ZFS_IOC_GET_BOOKMARKS,
ZFS_IOC_DESTROY_BOOKMARKS,
#ifdef __FreeBSD__
ZFS_IOC_NEXTBOOT,
#endif
ZFS_IOC_CHANNEL_PROGRAM,
+ ZFS_IOC_REMAP,
ZFS_IOC_LAST
} zfs_ioc_t;
/*
* Internal SPA load state. Used by FMA diagnosis engine.
*/
typedef enum {
SPA_LOAD_NONE, /* no load in progress */
SPA_LOAD_OPEN, /* normal open */
SPA_LOAD_IMPORT, /* import in progress */
SPA_LOAD_TRYIMPORT, /* tryimport in progress */
SPA_LOAD_RECOVER, /* recovery requested */
SPA_LOAD_ERROR, /* load failed */
SPA_LOAD_CREATE /* creation in progress */
} spa_load_state_t;
/*
* Bookmark name values.
*/
#define ZPOOL_ERR_LIST "error list"
#define ZPOOL_ERR_DATASET "dataset"
#define ZPOOL_ERR_OBJECT "object"
#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1)
/*
* The following are names used in the nvlist describing
* the pool's history log.
*/
#define ZPOOL_HIST_RECORD "history record"
#define ZPOOL_HIST_TIME "history time"
#define ZPOOL_HIST_CMD "history command"
#define ZPOOL_HIST_WHO "history who"
#define ZPOOL_HIST_ZONE "history zone"
#define ZPOOL_HIST_HOST "history hostname"
#define ZPOOL_HIST_TXG "history txg"
#define ZPOOL_HIST_INT_EVENT "history internal event"
#define ZPOOL_HIST_INT_STR "history internal str"
#define ZPOOL_HIST_INT_NAME "internal_name"
#define ZPOOL_HIST_IOCTL "ioctl"
#define ZPOOL_HIST_INPUT_NVL "in_nvl"
#define ZPOOL_HIST_OUTPUT_NVL "out_nvl"
#define ZPOOL_HIST_DSNAME "dsname"
#define ZPOOL_HIST_DSID "dsid"
#define ZPOOL_HIST_ERRNO "errno"
/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
#define ZFS_ONLINE_CHECKREMOVE 0x1
#define ZFS_ONLINE_UNSPARE 0x2
#define ZFS_ONLINE_FORCEFAULT 0x4
#define ZFS_ONLINE_EXPAND 0x8
#define ZFS_OFFLINE_TEMPORARY 0x1
/*
* Flags for ZFS_IOC_POOL_IMPORT
*/
#define ZFS_IMPORT_NORMAL 0x0
#define ZFS_IMPORT_VERBATIM 0x1
#define ZFS_IMPORT_ANY_HOST 0x2
#define ZFS_IMPORT_MISSING_LOG 0x4
#define ZFS_IMPORT_ONLY 0x8
/*
* Channel program argument/return nvlist keys and defaults.
*/
#define ZCP_ARG_PROGRAM "program"
#define ZCP_ARG_ARGLIST "arg"
#define ZCP_ARG_SYNC "sync"
#define ZCP_ARG_INSTRLIMIT "instrlimit"
#define ZCP_ARG_MEMLIMIT "memlimit"
#define ZCP_ARG_CLIARGV "argv"
#define ZCP_RET_ERROR "error"
#define ZCP_RET_RETURN "return"
#define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000)
#define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT)
#define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024)
#define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT)
/*
* Sysevent payload members. ZFS will generate the following sysevents with the
* given payloads:
*
* ESC_ZFS_RESILVER_START
* ESC_ZFS_RESILVER_END
* ESC_ZFS_POOL_DESTROY
* ESC_ZFS_POOL_REGUID
*
* ZFS_EV_POOL_NAME DATA_TYPE_STRING
* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
*
* ESC_ZFS_VDEV_REMOVE
* ESC_ZFS_VDEV_CLEAR
* ESC_ZFS_VDEV_CHECK
*
* ZFS_EV_POOL_NAME DATA_TYPE_STRING
* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
* ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional)
* ZFS_EV_VDEV_GUID DATA_TYPE_UINT64
*
* ESC_ZFS_HISTORY_EVENT
*
* ZFS_EV_POOL_NAME DATA_TYPE_STRING
* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
* ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional)
* ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional)
* ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional)
* ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional)
* ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional)
* ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional)
* ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional)
* ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional)
* ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional)
* ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional)
* ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional)
* ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional)
*
* The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the
* history log nvlist. The keynames will be free of any spaces or other
* characters that could be potentially unexpected to consumers of the
* sysevents.
*/
#define ZFS_EV_POOL_NAME "pool_name"
#define ZFS_EV_POOL_GUID "pool_guid"
#define ZFS_EV_VDEV_PATH "vdev_path"
#define ZFS_EV_VDEV_GUID "vdev_guid"
#define ZFS_EV_HIST_TIME "history_time"
#define ZFS_EV_HIST_CMD "history_command"
#define ZFS_EV_HIST_WHO "history_who"
#define ZFS_EV_HIST_ZONE "history_zone"
#define ZFS_EV_HIST_HOST "history_hostname"
#define ZFS_EV_HIST_TXG "history_txg"
#define ZFS_EV_HIST_INT_EVENT "history_internal_event"
#define ZFS_EV_HIST_INT_STR "history_internal_str"
#define ZFS_EV_HIST_INT_NAME "history_internal_name"
#define ZFS_EV_HIST_IOCTL "history_ioctl"
#define ZFS_EV_HIST_DSNAME "history_dsname"
#define ZFS_EV_HIST_DSID "history_dsid"
#ifdef __cplusplus
}
#endif
#endif /* _SYS_FS_ZFS_H */
Index: stable/11/sys/conf/files
===================================================================
--- stable/11/sys/conf/files (revision 332524)
+++ stable/11/sys/conf/files (revision 332525)
@@ -1,4743 +1,4747 @@
# $FreeBSD$
#
# The long compile-with and dependency lines are required because of
# limitations in config: backslash-newline doesn't work in strings, and
# dependency lines other than the first are silently ignored.
#
acpi_quirks.h optional acpi \
dependency "$S/tools/acpi_quirks2h.awk $S/dev/acpica/acpi_quirks" \
compile-with "${AWK} -f $S/tools/acpi_quirks2h.awk $S/dev/acpica/acpi_quirks" \
no-obj no-implicit-rule before-depend \
clean "acpi_quirks.h"
bhnd_nvram_map.h optional bhnd \
dependency "$S/dev/bhnd/tools/nvram_map_gen.sh $S/dev/bhnd/tools/nvram_map_gen.awk $S/dev/bhnd/nvram/nvram_map" \
compile-with "sh $S/dev/bhnd/tools/nvram_map_gen.sh $S/dev/bhnd/nvram/nvram_map -h" \
no-obj no-implicit-rule before-depend \
clean "bhnd_nvram_map.h"
bhnd_nvram_map_data.h optional bhnd \
dependency "$S/dev/bhnd/tools/nvram_map_gen.sh $S/dev/bhnd/tools/nvram_map_gen.awk $S/dev/bhnd/nvram/nvram_map" \
compile-with "sh $S/dev/bhnd/tools/nvram_map_gen.sh $S/dev/bhnd/nvram/nvram_map -d" \
no-obj no-implicit-rule before-depend \
clean "bhnd_nvram_map_data.h"
#
# The 'fdt_dtb_file' target covers an actual DTB file name, which is derived
# from the specified source (DTS) file: <platform>.dts -> <platform>.dtb
#
fdt_dtb_file optional fdt fdt_dtb_static \
compile-with "sh -c 'MACHINE=${MACHINE} $S/tools/fdt/make_dtb.sh $S ${FDT_DTS_FILE} ${.CURDIR}'" \
no-obj no-implicit-rule before-depend \
clean "${FDT_DTS_FILE:R}.dtb"
fdt_static_dtb.h optional fdt fdt_dtb_static \
compile-with "sh -c 'MACHINE=${MACHINE} $S/tools/fdt/make_dtbh.sh ${FDT_DTS_FILE} ${.CURDIR}'" \
dependency "fdt_dtb_file" \
no-obj no-implicit-rule before-depend \
clean "fdt_static_dtb.h"
feeder_eq_gen.h optional sound \
dependency "$S/tools/sound/feeder_eq_mkfilter.awk" \
compile-with "${AWK} -f $S/tools/sound/feeder_eq_mkfilter.awk -- ${FEEDER_EQ_PRESETS} > feeder_eq_gen.h" \
no-obj no-implicit-rule before-depend \
clean "feeder_eq_gen.h"
feeder_rate_gen.h optional sound \
dependency "$S/tools/sound/feeder_rate_mkfilter.awk" \
compile-with "${AWK} -f $S/tools/sound/feeder_rate_mkfilter.awk -- ${FEEDER_RATE_PRESETS} > feeder_rate_gen.h" \
no-obj no-implicit-rule before-depend \
clean "feeder_rate_gen.h"
snd_fxdiv_gen.h optional sound \
dependency "$S/tools/sound/snd_fxdiv_gen.awk" \
compile-with "${AWK} -f $S/tools/sound/snd_fxdiv_gen.awk -- > snd_fxdiv_gen.h" \
no-obj no-implicit-rule before-depend \
clean "snd_fxdiv_gen.h"
miidevs.h optional miibus | mii \
dependency "$S/tools/miidevs2h.awk $S/dev/mii/miidevs" \
compile-with "${AWK} -f $S/tools/miidevs2h.awk $S/dev/mii/miidevs" \
no-obj no-implicit-rule before-depend \
clean "miidevs.h"
pccarddevs.h standard \
dependency "$S/tools/pccarddevs2h.awk $S/dev/pccard/pccarddevs" \
compile-with "${AWK} -f $S/tools/pccarddevs2h.awk $S/dev/pccard/pccarddevs" \
no-obj no-implicit-rule before-depend \
clean "pccarddevs.h"
kbdmuxmap.h optional kbdmux_dflt_keymap \
compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${KBDMUX_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > kbdmuxmap.h" \
no-obj no-implicit-rule before-depend \
clean "kbdmuxmap.h"
teken_state.h optional sc | vt \
dependency "$S/teken/gensequences $S/teken/sequences" \
compile-with "${AWK} -f $S/teken/gensequences $S/teken/sequences > teken_state.h" \
no-obj no-implicit-rule before-depend \
clean "teken_state.h"
usbdevs.h optional usb \
dependency "$S/tools/usbdevs2h.awk $S/dev/usb/usbdevs" \
compile-with "${AWK} -f $S/tools/usbdevs2h.awk $S/dev/usb/usbdevs -h" \
no-obj no-implicit-rule before-depend \
clean "usbdevs.h"
usbdevs_data.h optional usb \
dependency "$S/tools/usbdevs2h.awk $S/dev/usb/usbdevs" \
compile-with "${AWK} -f $S/tools/usbdevs2h.awk $S/dev/usb/usbdevs -d" \
no-obj no-implicit-rule before-depend \
clean "usbdevs_data.h"
cam/cam.c optional scbus
cam/cam_compat.c optional scbus
cam/cam_iosched.c optional scbus
cam/cam_periph.c optional scbus
cam/cam_queue.c optional scbus
cam/cam_sim.c optional scbus
cam/cam_xpt.c optional scbus
cam/ata/ata_all.c optional scbus
cam/ata/ata_xpt.c optional scbus
cam/ata/ata_pmp.c optional scbus
cam/nvme/nvme_all.c optional scbus nvme
cam/nvme/nvme_da.c optional scbus nvme da
cam/nvme/nvme_xpt.c optional scbus nvme
cam/scsi/scsi_xpt.c optional scbus
cam/scsi/scsi_all.c optional scbus
cam/scsi/scsi_cd.c optional cd
cam/scsi/scsi_ch.c optional ch
cam/ata/ata_da.c optional ada | da
cam/ctl/ctl.c optional ctl
cam/ctl/ctl_backend.c optional ctl
cam/ctl/ctl_backend_block.c optional ctl
cam/ctl/ctl_backend_ramdisk.c optional ctl
cam/ctl/ctl_cmd_table.c optional ctl
cam/ctl/ctl_frontend.c optional ctl
cam/ctl/ctl_frontend_cam_sim.c optional ctl
cam/ctl/ctl_frontend_ioctl.c optional ctl
cam/ctl/ctl_frontend_iscsi.c optional ctl cfiscsi
cam/ctl/ctl_ha.c optional ctl
cam/ctl/ctl_scsi_all.c optional ctl
cam/ctl/ctl_tpc.c optional ctl
cam/ctl/ctl_tpc_local.c optional ctl
cam/ctl/ctl_error.c optional ctl
cam/ctl/ctl_util.c optional ctl
cam/ctl/scsi_ctl.c optional ctl
cam/scsi/scsi_da.c optional da
cam/scsi/scsi_low.c optional ct | ncv | nsp | stg
cam/scsi/scsi_pass.c optional pass
cam/scsi/scsi_pt.c optional pt
cam/scsi/scsi_sa.c optional sa
cam/scsi/scsi_enc.c optional ses
cam/scsi/scsi_enc_ses.c optional ses
cam/scsi/scsi_enc_safte.c optional ses
cam/scsi/scsi_sg.c optional sg
cam/scsi/scsi_targ_bh.c optional targbh
cam/scsi/scsi_target.c optional targ
cam/scsi/smp_all.c optional scbus
# shared between zfs and dtrace
cddl/compat/opensolaris/kern/opensolaris.c optional zfs | dtrace compile-with "${CDDL_C}"
cddl/compat/opensolaris/kern/opensolaris_cmn_err.c optional zfs | dtrace compile-with "${CDDL_C}"
cddl/compat/opensolaris/kern/opensolaris_kmem.c optional zfs | dtrace compile-with "${CDDL_C}"
cddl/compat/opensolaris/kern/opensolaris_misc.c optional zfs | dtrace compile-with "${CDDL_C}"
cddl/compat/opensolaris/kern/opensolaris_proc.c optional zfs | dtrace compile-with "${CDDL_C}"
cddl/compat/opensolaris/kern/opensolaris_sunddi.c optional zfs | dtrace compile-with "${CDDL_C}"
cddl/compat/opensolaris/kern/opensolaris_taskq.c optional zfs | dtrace compile-with "${CDDL_C}"
# zfs specific
cddl/compat/opensolaris/kern/opensolaris_acl.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_dtrace.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_kobj.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_kstat.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_lookup.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_policy.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_string.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_sysevent.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_uio.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_vfs.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_vm.c optional zfs compile-with "${ZFS_C}"
cddl/compat/opensolaris/kern/opensolaris_zone.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/acl/acl_common.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/avl/avl.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/unicode/u8_textprep.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zfeature_common.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zfs_comutil.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zfs_deleg.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zfs_prop.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zpool_prop.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zprop_common.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/vnode.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c optional zfs compile-with "${ZFS_C}" \
warning "kernel contains CDDL licensed ZFS filesystem"
cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c optional zfs compile-with "${ZFS_C}"
+cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}"
+cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c optional zfs compile-with "${ZFS_C}"
+cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c optional zfs compile-with "${ZFS_C}"
+cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/os/callb.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/os/fm.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/os/list.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/adler32.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/deflate.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/inffast.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/inflate.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/inftrees.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/trees.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/zmod.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/zmod_subr.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/zmod/zutil.c optional zfs compile-with "${ZFS_C}"
# zfs lua support
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c optional zfs compile-with "${ZFS_C}"
# dtrace specific
cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c optional dtrace compile-with "${DTRACE_C}" \
warning "kernel contains CDDL licensed DTRACE"
cddl/dev/dtmalloc/dtmalloc.c optional dtmalloc | dtraceall compile-with "${CDDL_C}"
cddl/dev/profile/profile.c optional dtrace_profile | dtraceall compile-with "${CDDL_C}"
cddl/dev/sdt/sdt.c optional dtrace_sdt | dtraceall compile-with "${CDDL_C}"
cddl/dev/fbt/fbt.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}"
cddl/dev/systrace/systrace.c optional dtrace_systrace | dtraceall compile-with "${CDDL_C}"
cddl/dev/prototype.c optional dtrace_prototype | dtraceall compile-with "${CDDL_C}"
fs/nfsclient/nfs_clkdtrace.c optional dtnfscl nfscl | dtraceall nfscl compile-with "${CDDL_C}"
compat/cloudabi/cloudabi_clock.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_errno.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_fd.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_file.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_futex.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_mem.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_proc.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_random.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_sock.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_thread.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi/cloudabi_vdso.c optional compat_cloudabi32 | compat_cloudabi64
compat/cloudabi32/cloudabi32_fd.c optional compat_cloudabi32
compat/cloudabi32/cloudabi32_module.c optional compat_cloudabi32
compat/cloudabi32/cloudabi32_poll.c optional compat_cloudabi32
compat/cloudabi32/cloudabi32_sock.c optional compat_cloudabi32
compat/cloudabi32/cloudabi32_syscalls.c optional compat_cloudabi32
compat/cloudabi32/cloudabi32_sysent.c optional compat_cloudabi32
compat/cloudabi32/cloudabi32_thread.c optional compat_cloudabi32
compat/cloudabi64/cloudabi64_fd.c optional compat_cloudabi64
compat/cloudabi64/cloudabi64_module.c optional compat_cloudabi64
compat/cloudabi64/cloudabi64_poll.c optional compat_cloudabi64
compat/cloudabi64/cloudabi64_sock.c optional compat_cloudabi64
compat/cloudabi64/cloudabi64_syscalls.c optional compat_cloudabi64
compat/cloudabi64/cloudabi64_sysent.c optional compat_cloudabi64
compat/cloudabi64/cloudabi64_thread.c optional compat_cloudabi64
compat/freebsd32/freebsd32_capability.c optional compat_freebsd32
compat/freebsd32/freebsd32_ioctl.c optional compat_freebsd32
compat/freebsd32/freebsd32_misc.c optional compat_freebsd32
compat/freebsd32/freebsd32_syscalls.c optional compat_freebsd32
compat/freebsd32/freebsd32_sysent.c optional compat_freebsd32
contrib/ck/src/ck_array.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_barrier_centralized.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_barrier_combining.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_barrier_dissemination.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_barrier_mcs.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_barrier_tournament.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_epoch.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_hp.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_hs.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_ht.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/ck/src/ck_rhs.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
contrib/dev/acpica/common/ahids.c optional acpi acpi_debug
contrib/dev/acpica/common/ahuuids.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbcmds.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbconvert.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbdisply.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbexec.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbhistry.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbinput.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbmethod.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbnames.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbobject.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbstats.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbtest.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbutils.c optional acpi acpi_debug
contrib/dev/acpica/components/debugger/dbxface.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmbuffer.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmcstyle.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmdeferred.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmnames.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmopcode.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmresrc.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmresrcl.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmresrcl2.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmresrcs.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmutils.c optional acpi acpi_debug
contrib/dev/acpica/components/disassembler/dmwalk.c optional acpi acpi_debug
contrib/dev/acpica/components/dispatcher/dsargs.c optional acpi
contrib/dev/acpica/components/dispatcher/dscontrol.c optional acpi
contrib/dev/acpica/components/dispatcher/dsdebug.c optional acpi
contrib/dev/acpica/components/dispatcher/dsfield.c optional acpi
contrib/dev/acpica/components/dispatcher/dsinit.c optional acpi
contrib/dev/acpica/components/dispatcher/dsmethod.c optional acpi
contrib/dev/acpica/components/dispatcher/dsmthdat.c optional acpi
contrib/dev/acpica/components/dispatcher/dsobject.c optional acpi
contrib/dev/acpica/components/dispatcher/dsopcode.c optional acpi
contrib/dev/acpica/components/dispatcher/dspkginit.c optional acpi
contrib/dev/acpica/components/dispatcher/dsutils.c optional acpi
contrib/dev/acpica/components/dispatcher/dswexec.c optional acpi
contrib/dev/acpica/components/dispatcher/dswload.c optional acpi
contrib/dev/acpica/components/dispatcher/dswload2.c optional acpi
contrib/dev/acpica/components/dispatcher/dswscope.c optional acpi
contrib/dev/acpica/components/dispatcher/dswstate.c optional acpi
contrib/dev/acpica/components/events/evevent.c optional acpi
contrib/dev/acpica/components/events/evglock.c optional acpi
contrib/dev/acpica/components/events/evgpe.c optional acpi
contrib/dev/acpica/components/events/evgpeblk.c optional acpi
contrib/dev/acpica/components/events/evgpeinit.c optional acpi
contrib/dev/acpica/components/events/evgpeutil.c optional acpi
contrib/dev/acpica/components/events/evhandler.c optional acpi
contrib/dev/acpica/components/events/evmisc.c optional acpi
contrib/dev/acpica/components/events/evregion.c optional acpi
contrib/dev/acpica/components/events/evrgnini.c optional acpi
contrib/dev/acpica/components/events/evsci.c optional acpi
contrib/dev/acpica/components/events/evxface.c optional acpi
contrib/dev/acpica/components/events/evxfevnt.c optional acpi
contrib/dev/acpica/components/events/evxfgpe.c optional acpi
contrib/dev/acpica/components/events/evxfregn.c optional acpi
contrib/dev/acpica/components/executer/exconcat.c optional acpi
contrib/dev/acpica/components/executer/exconfig.c optional acpi
contrib/dev/acpica/components/executer/exconvrt.c optional acpi
contrib/dev/acpica/components/executer/excreate.c optional acpi
contrib/dev/acpica/components/executer/exdebug.c optional acpi
contrib/dev/acpica/components/executer/exdump.c optional acpi
contrib/dev/acpica/components/executer/exfield.c optional acpi
contrib/dev/acpica/components/executer/exfldio.c optional acpi
contrib/dev/acpica/components/executer/exmisc.c optional acpi
contrib/dev/acpica/components/executer/exmutex.c optional acpi
contrib/dev/acpica/components/executer/exnames.c optional acpi
contrib/dev/acpica/components/executer/exoparg1.c optional acpi
contrib/dev/acpica/components/executer/exoparg2.c optional acpi
contrib/dev/acpica/components/executer/exoparg3.c optional acpi
contrib/dev/acpica/components/executer/exoparg6.c optional acpi
contrib/dev/acpica/components/executer/exprep.c optional acpi
contrib/dev/acpica/components/executer/exregion.c optional acpi
contrib/dev/acpica/components/executer/exresnte.c optional acpi
contrib/dev/acpica/components/executer/exresolv.c optional acpi
contrib/dev/acpica/components/executer/exresop.c optional acpi
contrib/dev/acpica/components/executer/exstore.c optional acpi
contrib/dev/acpica/components/executer/exstoren.c optional acpi
contrib/dev/acpica/components/executer/exstorob.c optional acpi
contrib/dev/acpica/components/executer/exsystem.c optional acpi
contrib/dev/acpica/components/executer/extrace.c optional acpi
contrib/dev/acpica/components/executer/exutils.c optional acpi
contrib/dev/acpica/components/hardware/hwacpi.c optional acpi
contrib/dev/acpica/components/hardware/hwesleep.c optional acpi
contrib/dev/acpica/components/hardware/hwgpe.c optional acpi
contrib/dev/acpica/components/hardware/hwpci.c optional acpi
contrib/dev/acpica/components/hardware/hwregs.c optional acpi
contrib/dev/acpica/components/hardware/hwsleep.c optional acpi
contrib/dev/acpica/components/hardware/hwtimer.c optional acpi
contrib/dev/acpica/components/hardware/hwvalid.c optional acpi
contrib/dev/acpica/components/hardware/hwxface.c optional acpi
contrib/dev/acpica/components/hardware/hwxfsleep.c optional acpi
contrib/dev/acpica/components/namespace/nsaccess.c optional acpi
contrib/dev/acpica/components/namespace/nsalloc.c optional acpi
contrib/dev/acpica/components/namespace/nsarguments.c optional acpi
contrib/dev/acpica/components/namespace/nsconvert.c optional acpi
contrib/dev/acpica/components/namespace/nsdump.c optional acpi
contrib/dev/acpica/components/namespace/nseval.c optional acpi
contrib/dev/acpica/components/namespace/nsinit.c optional acpi
contrib/dev/acpica/components/namespace/nsload.c optional acpi
contrib/dev/acpica/components/namespace/nsnames.c optional acpi
contrib/dev/acpica/components/namespace/nsobject.c optional acpi
contrib/dev/acpica/components/namespace/nsparse.c optional acpi
contrib/dev/acpica/components/namespace/nspredef.c optional acpi
contrib/dev/acpica/components/namespace/nsprepkg.c optional acpi
contrib/dev/acpica/components/namespace/nsrepair.c optional acpi
contrib/dev/acpica/components/namespace/nsrepair2.c optional acpi
contrib/dev/acpica/components/namespace/nssearch.c optional acpi
contrib/dev/acpica/components/namespace/nsutils.c optional acpi
contrib/dev/acpica/components/namespace/nswalk.c optional acpi
contrib/dev/acpica/components/namespace/nsxfeval.c optional acpi
contrib/dev/acpica/components/namespace/nsxfname.c optional acpi
contrib/dev/acpica/components/namespace/nsxfobj.c optional acpi
contrib/dev/acpica/components/parser/psargs.c optional acpi
contrib/dev/acpica/components/parser/psloop.c optional acpi
contrib/dev/acpica/components/parser/psobject.c optional acpi
contrib/dev/acpica/components/parser/psopcode.c optional acpi
contrib/dev/acpica/components/parser/psopinfo.c optional acpi
contrib/dev/acpica/components/parser/psparse.c optional acpi
contrib/dev/acpica/components/parser/psscope.c optional acpi
contrib/dev/acpica/components/parser/pstree.c optional acpi
contrib/dev/acpica/components/parser/psutils.c optional acpi
contrib/dev/acpica/components/parser/pswalk.c optional acpi
contrib/dev/acpica/components/parser/psxface.c optional acpi
contrib/dev/acpica/components/resources/rsaddr.c optional acpi
contrib/dev/acpica/components/resources/rscalc.c optional acpi
contrib/dev/acpica/components/resources/rscreate.c optional acpi
contrib/dev/acpica/components/resources/rsdump.c optional acpi acpi_debug
contrib/dev/acpica/components/resources/rsdumpinfo.c optional acpi
contrib/dev/acpica/components/resources/rsinfo.c optional acpi
contrib/dev/acpica/components/resources/rsio.c optional acpi
contrib/dev/acpica/components/resources/rsirq.c optional acpi
contrib/dev/acpica/components/resources/rslist.c optional acpi
contrib/dev/acpica/components/resources/rsmemory.c optional acpi
contrib/dev/acpica/components/resources/rsmisc.c optional acpi
contrib/dev/acpica/components/resources/rsserial.c optional acpi
contrib/dev/acpica/components/resources/rsutils.c optional acpi
contrib/dev/acpica/components/resources/rsxface.c optional acpi
contrib/dev/acpica/components/tables/tbdata.c optional acpi
contrib/dev/acpica/components/tables/tbfadt.c optional acpi
contrib/dev/acpica/components/tables/tbfind.c optional acpi
contrib/dev/acpica/components/tables/tbinstal.c optional acpi
contrib/dev/acpica/components/tables/tbprint.c optional acpi
contrib/dev/acpica/components/tables/tbutils.c optional acpi
contrib/dev/acpica/components/tables/tbxface.c optional acpi
contrib/dev/acpica/components/tables/tbxfload.c optional acpi
contrib/dev/acpica/components/tables/tbxfroot.c optional acpi
contrib/dev/acpica/components/utilities/utaddress.c optional acpi
contrib/dev/acpica/components/utilities/utalloc.c optional acpi
contrib/dev/acpica/components/utilities/utascii.c optional acpi
contrib/dev/acpica/components/utilities/utbuffer.c optional acpi
contrib/dev/acpica/components/utilities/utcache.c optional acpi
contrib/dev/acpica/components/utilities/utcopy.c optional acpi
contrib/dev/acpica/components/utilities/utdebug.c optional acpi
contrib/dev/acpica/components/utilities/utdecode.c optional acpi
contrib/dev/acpica/components/utilities/utdelete.c optional acpi
contrib/dev/acpica/components/utilities/uterror.c optional acpi
contrib/dev/acpica/components/utilities/uteval.c optional acpi
contrib/dev/acpica/components/utilities/utexcep.c optional acpi
contrib/dev/acpica/components/utilities/utglobal.c optional acpi
contrib/dev/acpica/components/utilities/uthex.c optional acpi
contrib/dev/acpica/components/utilities/utids.c optional acpi
contrib/dev/acpica/components/utilities/utinit.c optional acpi
contrib/dev/acpica/components/utilities/utlock.c optional acpi
contrib/dev/acpica/components/utilities/utmath.c optional acpi
contrib/dev/acpica/components/utilities/utmisc.c optional acpi
contrib/dev/acpica/components/utilities/utmutex.c optional acpi
contrib/dev/acpica/components/utilities/utnonansi.c optional acpi
contrib/dev/acpica/components/utilities/utobject.c optional acpi
contrib/dev/acpica/components/utilities/utosi.c optional acpi
contrib/dev/acpica/components/utilities/utownerid.c optional acpi
contrib/dev/acpica/components/utilities/utpredef.c optional acpi
contrib/dev/acpica/components/utilities/utresdecode.c optional acpi acpi_debug
contrib/dev/acpica/components/utilities/utresrc.c optional acpi
contrib/dev/acpica/components/utilities/utstate.c optional acpi
contrib/dev/acpica/components/utilities/utstring.c optional acpi
contrib/dev/acpica/components/utilities/utstrsuppt.c optional acpi
contrib/dev/acpica/components/utilities/utstrtoul64.c optional acpi
contrib/dev/acpica/components/utilities/utuuid.c optional acpi acpi_debug
contrib/dev/acpica/components/utilities/utxface.c optional acpi
contrib/dev/acpica/components/utilities/utxferror.c optional acpi
contrib/dev/acpica/components/utilities/utxfinit.c optional acpi
contrib/dev/acpica/os_specific/service_layers/osgendbg.c optional acpi acpi_debug
contrib/ipfilter/netinet/fil.c optional ipfilter inet \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_auth.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_fil_freebsd.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_frag.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_log.c optional ipfilter inet \
compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_nat.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_proxy.c optional ipfilter inet \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_state.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_lookup.c optional ipfilter inet \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -Wno-error -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_pool.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_htable.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_sync.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/mlfk_ipl.c optional ipfilter inet \
compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_nat6.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_rules.c optional ipfilter inet \
compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_scan.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_dstlist.c optional ipfilter inet \
compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/radix_ipf.c optional ipfilter inet \
compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/libfdt/fdt.c optional fdt
contrib/libfdt/fdt_ro.c optional fdt
contrib/libfdt/fdt_rw.c optional fdt
contrib/libfdt/fdt_strerror.c optional fdt
contrib/libfdt/fdt_sw.c optional fdt
contrib/libfdt/fdt_wip.c optional fdt
contrib/libnv/dnvlist.c standard
contrib/libnv/nvlist.c standard
contrib/libnv/nvpair.c standard
contrib/ngatm/netnatm/api/cc_conn.c optional ngatm_ccatm \
compile-with "${NORMAL_C_NOWERROR} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_data.c optional ngatm_ccatm \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_dump.c optional ngatm_ccatm \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_port.c optional ngatm_ccatm \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_sig.c optional ngatm_ccatm \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_user.c optional ngatm_ccatm \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/unisap.c optional ngatm_ccatm \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/misc/straddr.c optional ngatm_atmbase \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/misc/unimsg_common.c optional ngatm_atmbase \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/msg/traffic.c optional ngatm_atmbase \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/msg/uni_ie.c optional ngatm_atmbase \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/msg/uni_msg.c optional ngatm_atmbase \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/saal/saal_sscfu.c optional ngatm_sscfu \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/saal/saal_sscop.c optional ngatm_sscop \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_call.c optional ngatm_uni \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_coord.c optional ngatm_uni \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_party.c optional ngatm_uni \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_print.c optional ngatm_uni \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_reset.c optional ngatm_uni \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_uni.c optional ngatm_uni \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_unimsgcpy.c optional ngatm_uni \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_verify.c optional ngatm_uni \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
crypto/blowfish/bf_ecb.c optional ipsec | ipsec_support
crypto/blowfish/bf_skey.c optional crypto | ipsec | ipsec_support
crypto/camellia/camellia.c optional crypto | ipsec | ipsec_support
crypto/camellia/camellia-api.c optional crypto | ipsec | ipsec_support
crypto/des/des_ecb.c optional crypto | ipsec | ipsec_support | netsmb
crypto/des/des_setkey.c optional crypto | ipsec | ipsec_support | netsmb
crypto/rc4/rc4.c optional netgraph_mppc_encryption | kgssapi
crypto/rijndael/rijndael-alg-fst.c optional crypto | geom_bde | \
ipsec | ipsec_support | random !random_loadable | wlan_ccmp
crypto/rijndael/rijndael-api-fst.c optional geom_bde | random !random_loadable
crypto/rijndael/rijndael-api.c optional crypto | ipsec | ipsec_support | \
wlan_ccmp
crypto/sha1.c optional carp | crypto | ipsec | \
ipsec_support | netgraph_mppc_encryption | sctp
crypto/sha2/sha256c.c optional crypto | geom_bde | ipsec | \
ipsec_support | random !random_loadable | sctp | zfs
crypto/sha2/sha512c.c optional crypto | geom_bde | ipsec | \
ipsec_support | zfs
crypto/skein/skein.c optional crypto | zfs
crypto/skein/skein_block.c optional crypto | zfs
crypto/siphash/siphash.c optional inet | inet6
crypto/siphash/siphash_test.c optional inet | inet6
ddb/db_access.c optional ddb
ddb/db_break.c optional ddb
ddb/db_capture.c optional ddb
ddb/db_command.c optional ddb
ddb/db_examine.c optional ddb
ddb/db_expr.c optional ddb
ddb/db_input.c optional ddb
ddb/db_lex.c optional ddb
ddb/db_main.c optional ddb
ddb/db_output.c optional ddb
ddb/db_print.c optional ddb
ddb/db_ps.c optional ddb
ddb/db_run.c optional ddb
ddb/db_script.c optional ddb
ddb/db_sym.c optional ddb
ddb/db_thread.c optional ddb
ddb/db_textdump.c optional ddb
ddb/db_variables.c optional ddb
ddb/db_watch.c optional ddb
ddb/db_write_cmd.c optional ddb
dev/aac/aac.c optional aac
dev/aac/aac_cam.c optional aacp aac
dev/aac/aac_debug.c optional aac
dev/aac/aac_disk.c optional aac
dev/aac/aac_linux.c optional aac compat_linux
dev/aac/aac_pci.c optional aac pci
dev/aacraid/aacraid.c optional aacraid
dev/aacraid/aacraid_cam.c optional aacraid scbus
dev/aacraid/aacraid_debug.c optional aacraid
dev/aacraid/aacraid_linux.c optional aacraid compat_linux
dev/aacraid/aacraid_pci.c optional aacraid pci
dev/acpi_support/acpi_wmi.c optional acpi_wmi acpi
dev/acpi_support/acpi_asus.c optional acpi_asus acpi
dev/acpi_support/acpi_asus_wmi.c optional acpi_asus_wmi acpi
dev/acpi_support/acpi_fujitsu.c optional acpi_fujitsu acpi
dev/acpi_support/acpi_hp.c optional acpi_hp acpi
dev/acpi_support/acpi_ibm.c optional acpi_ibm acpi
dev/acpi_support/acpi_panasonic.c optional acpi_panasonic acpi
dev/acpi_support/acpi_sony.c optional acpi_sony acpi
dev/acpi_support/acpi_toshiba.c optional acpi_toshiba acpi
dev/acpi_support/atk0110.c optional aibs acpi
dev/acpica/Osd/OsdDebug.c optional acpi
dev/acpica/Osd/OsdHardware.c optional acpi
dev/acpica/Osd/OsdInterrupt.c optional acpi
dev/acpica/Osd/OsdMemory.c optional acpi
dev/acpica/Osd/OsdSchedule.c optional acpi
dev/acpica/Osd/OsdStream.c optional acpi
dev/acpica/Osd/OsdSynch.c optional acpi
dev/acpica/Osd/OsdTable.c optional acpi
dev/acpica/acpi.c optional acpi
dev/acpica/acpi_acad.c optional acpi
dev/acpica/acpi_battery.c optional acpi
dev/acpica/acpi_button.c optional acpi
dev/acpica/acpi_cmbat.c optional acpi
dev/acpica/acpi_cpu.c optional acpi
dev/acpica/acpi_ec.c optional acpi
dev/acpica/acpi_isab.c optional acpi isa
dev/acpica/acpi_lid.c optional acpi
dev/acpica/acpi_package.c optional acpi
dev/acpica/acpi_pci.c optional acpi pci
dev/acpica/acpi_pci_link.c optional acpi pci
dev/acpica/acpi_pcib.c optional acpi pci
dev/acpica/acpi_pcib_acpi.c optional acpi pci
dev/acpica/acpi_pcib_pci.c optional acpi pci
dev/acpica/acpi_perf.c optional acpi
dev/acpica/acpi_powerres.c optional acpi
dev/acpica/acpi_quirk.c optional acpi
dev/acpica/acpi_resource.c optional acpi
dev/acpica/acpi_container.c optional acpi
dev/acpica/acpi_smbat.c optional acpi
dev/acpica/acpi_thermal.c optional acpi
dev/acpica/acpi_throttle.c optional acpi
dev/acpica/acpi_timer.c optional acpi
dev/acpica/acpi_video.c optional acpi_video acpi
dev/acpica/acpi_dock.c optional acpi_dock acpi
dev/adlink/adlink.c optional adlink
dev/advansys/adv_eisa.c optional adv eisa
dev/advansys/adv_pci.c optional adv pci
dev/advansys/advansys.c optional adv
dev/advansys/advlib.c optional adv
dev/advansys/advmcode.c optional adv
dev/advansys/adw_pci.c optional adw pci
dev/advansys/adwcam.c optional adw
dev/advansys/adwlib.c optional adw
dev/advansys/adwmcode.c optional adw
dev/ae/if_ae.c optional ae pci
dev/age/if_age.c optional age pci
dev/agp/agp.c optional agp pci
dev/agp/agp_if.m optional agp pci
dev/aha/aha.c optional aha
dev/aha/aha_isa.c optional aha isa
dev/aha/aha_mca.c optional aha mca
dev/ahb/ahb.c optional ahb eisa
dev/ahci/ahci.c optional ahci
dev/ahci/ahciem.c optional ahci
dev/ahci/ahci_pci.c optional ahci pci
dev/aic/aic.c optional aic
dev/aic/aic_pccard.c optional aic pccard
dev/aic7xxx/ahc_eisa.c optional ahc eisa
dev/aic7xxx/ahc_isa.c optional ahc isa
dev/aic7xxx/ahc_pci.c optional ahc pci \
compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
dev/aic7xxx/ahd_pci.c optional ahd pci \
compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
dev/aic7xxx/aic7770.c optional ahc
dev/aic7xxx/aic79xx.c optional ahd pci
dev/aic7xxx/aic79xx_osm.c optional ahd pci
dev/aic7xxx/aic79xx_pci.c optional ahd pci
dev/aic7xxx/aic79xx_reg_print.c optional ahd pci ahd_reg_pretty_print
dev/aic7xxx/aic7xxx.c optional ahc
dev/aic7xxx/aic7xxx_93cx6.c optional ahc
dev/aic7xxx/aic7xxx_osm.c optional ahc
dev/aic7xxx/aic7xxx_pci.c optional ahc pci
dev/aic7xxx/aic7xxx_reg_print.c optional ahc ahc_reg_pretty_print
dev/alc/if_alc.c optional alc pci
dev/ale/if_ale.c optional ale pci
dev/alpm/alpm.c optional alpm pci
dev/altera/avgen/altera_avgen.c optional altera_avgen
dev/altera/avgen/altera_avgen_fdt.c optional altera_avgen fdt
dev/altera/avgen/altera_avgen_nexus.c optional altera_avgen
dev/altera/sdcard/altera_sdcard.c optional altera_sdcard
dev/altera/sdcard/altera_sdcard_disk.c optional altera_sdcard
dev/altera/sdcard/altera_sdcard_io.c optional altera_sdcard
dev/altera/sdcard/altera_sdcard_fdt.c optional altera_sdcard fdt
dev/altera/sdcard/altera_sdcard_nexus.c optional altera_sdcard
dev/altera/pio/pio.c optional altera_pio
dev/altera/pio/pio_if.m optional altera_pio
dev/amdpm/amdpm.c optional amdpm pci | nfpm pci
dev/amdsmb/amdsmb.c optional amdsmb pci
dev/amr/amr.c optional amr
dev/amr/amr_cam.c optional amrp amr
dev/amr/amr_disk.c optional amr
dev/amr/amr_linux.c optional amr compat_linux
dev/amr/amr_pci.c optional amr pci
dev/an/if_an.c optional an
dev/an/if_an_isa.c optional an isa
dev/an/if_an_pccard.c optional an pccard
dev/an/if_an_pci.c optional an pci
#
dev/ata/ata_if.m optional ata | atacore
dev/ata/ata-all.c optional ata | atacore
dev/ata/ata-dma.c optional ata | atacore
dev/ata/ata-lowlevel.c optional ata | atacore
dev/ata/ata-sata.c optional ata | atacore
dev/ata/ata-card.c optional ata pccard | atapccard
dev/ata/ata-cbus.c optional ata pc98 | atapc98
dev/ata/ata-isa.c optional ata isa | ataisa
dev/ata/ata-pci.c optional ata pci | atapci
dev/ata/chipsets/ata-acard.c optional ata pci | ataacard
dev/ata/chipsets/ata-acerlabs.c optional ata pci | ataacerlabs
dev/ata/chipsets/ata-amd.c optional ata pci | ataamd
dev/ata/chipsets/ata-ati.c optional ata pci | ataati
dev/ata/chipsets/ata-cenatek.c optional ata pci | atacenatek
dev/ata/chipsets/ata-cypress.c optional ata pci | atacypress
dev/ata/chipsets/ata-cyrix.c optional ata pci | atacyrix
dev/ata/chipsets/ata-highpoint.c optional ata pci | atahighpoint
dev/ata/chipsets/ata-intel.c optional ata pci | ataintel
dev/ata/chipsets/ata-ite.c optional ata pci | ataite
dev/ata/chipsets/ata-jmicron.c optional ata pci | atajmicron
dev/ata/chipsets/ata-marvell.c optional ata pci | atamarvell
dev/ata/chipsets/ata-micron.c optional ata pci | atamicron
dev/ata/chipsets/ata-national.c optional ata pci | atanational
dev/ata/chipsets/ata-netcell.c optional ata pci | atanetcell
dev/ata/chipsets/ata-nvidia.c optional ata pci | atanvidia
dev/ata/chipsets/ata-promise.c optional ata pci | atapromise
dev/ata/chipsets/ata-serverworks.c optional ata pci | ataserverworks
dev/ata/chipsets/ata-siliconimage.c optional ata pci | atasiliconimage | ataati
dev/ata/chipsets/ata-sis.c optional ata pci | atasis
dev/ata/chipsets/ata-via.c optional ata pci | atavia
#
dev/ath/if_ath_pci.c optional ath_pci pci \
compile-with "${NORMAL_C} -I$S/dev/ath"
#
dev/ath/if_ath_ahb.c optional ath_ahb \
compile-with "${NORMAL_C} -I$S/dev/ath"
#
dev/ath/if_ath.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_alq.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_beacon.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_btcoex.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_btcoex_mci.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_debug.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_descdma.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_keycache.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_ioctl.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_led.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_lna_div.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_tx.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_tx_edma.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_tx_ht.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_tdma.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_sysctl.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_rx.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_rx_edma.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_spectral.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ah_osdep.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
#
dev/ath/ath_hal/ah.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_eeprom_v1.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_eeprom_v3.c optional ath_hal | ath_ar5211 | ath_ar5212 \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_eeprom_v14.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_eeprom_v4k.c \
optional ath_hal | ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_eeprom_9287.c \
optional ath_hal | ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_regdomain.c optional ath \
compile-with "${NORMAL_C} ${NO_WSHIFT_COUNT_NEGATIVE} ${NO_WSHIFT_COUNT_OVERFLOW} -I$S/dev/ath"
# ar5210
dev/ath/ath_hal/ar5210/ar5210_attach.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_beacon.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_interrupts.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_keycache.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_misc.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_phy.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_power.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_recv.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_reset.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_xmit.c optional ath_hal | ath_ar5210 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar5211
dev/ath/ath_hal/ar5211/ar5211_attach.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_beacon.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_interrupts.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_keycache.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_misc.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_phy.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_power.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_recv.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_reset.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_xmit.c optional ath_hal | ath_ar5211 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar5212
dev/ath/ath_hal/ar5212/ar5212_ani.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_attach.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_beacon.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_eeprom.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_gpio.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_interrupts.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_keycache.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_misc.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_phy.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_power.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_recv.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_reset.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_rfgain.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_xmit.c \
optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
ath_ar9285 ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar5416 (depends on ar5212)
dev/ath/ath_hal/ar5416/ar5416_ani.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_attach.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_beacon.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_btcoex.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_cal.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_cal_iq.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_cal_adcgain.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_cal_adcdc.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_eeprom.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_gpio.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_interrupts.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_keycache.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_misc.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_phy.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_power.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_radar.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_recv.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_reset.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_spectral.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_xmit.c \
optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9130 (depends upon ar5416) - also requires AH_SUPPORT_AR9130
#
# Since this is an embedded MAC SoC, there's no need to compile it into the
# default HAL.
dev/ath/ath_hal/ar9001/ar9130_attach.c optional ath_ar9130 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9001/ar9130_phy.c optional ath_ar9130 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9001/ar9130_eeprom.c optional ath_ar9130 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9160 (depends on ar5416)
dev/ath/ath_hal/ar9001/ar9160_attach.c optional ath_hal | ath_ar9160 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9280 (depends on ar5416)
dev/ath/ath_hal/ar9002/ar9280_attach.c optional ath_hal | ath_ar9280 | \
ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9280_olc.c optional ath_hal | ath_ar9280 | \
ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9285 (depends on ar5416 and ar9280)
dev/ath/ath_hal/ar9002/ar9285_attach.c optional ath_hal | ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285_btcoex.c optional ath_hal | ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285_reset.c optional ath_hal | ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285_cal.c optional ath_hal | ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285_phy.c optional ath_hal | ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285_diversity.c optional ath_hal | ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9287 (depends on ar5416)
dev/ath/ath_hal/ar9002/ar9287_attach.c optional ath_hal | ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9287_reset.c optional ath_hal | ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9287_cal.c optional ath_hal | ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9287_olc.c optional ath_hal | ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9300
contrib/dev/ath/ath_hal/ar9300/ar9300_ani.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_attach.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_beacon.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_eeprom.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal ${NO_WCONSTANT_CONVERSION}"
contrib/dev/ath/ath_hal/ar9300/ar9300_freebsd.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_gpio.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_interrupts.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_keycache.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_mci.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_misc.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_paprd.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_phy.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_power.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_radar.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_radio.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_recv.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_recv_ds.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_reset.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal ${NO_WSOMETIMES_UNINITIALIZED} -Wno-unused-function"
contrib/dev/ath/ath_hal/ar9300/ar9300_stub.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_stub_funcs.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_spectral.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_timer.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_xmit.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
contrib/dev/ath/ath_hal/ar9300/ar9300_xmit_ds.c optional ath_hal | ath_ar9300 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
# rf backends
dev/ath/ath_hal/ar5212/ar2316.c optional ath_rf2316 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar2317.c optional ath_rf2317 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar2413.c optional ath_hal | ath_rf2413 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar2425.c optional ath_hal | ath_rf2425 | ath_rf2417 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5111.c optional ath_hal | ath_rf5111 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5112.c optional ath_hal | ath_rf5112 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5413.c optional ath_hal | ath_rf5413 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar2133.c optional ath_hal | ath_ar5416 | \
ath_ar9130 | ath_ar9160 | ath_ar9280 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9280.c optional ath_hal | ath_ar9280 | ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285.c optional ath_hal | ath_ar9285 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9287.c optional ath_hal | ath_ar9287 \
compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ath rate control algorithms
dev/ath/ath_rate/amrr/amrr.c optional ath_rate_amrr \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_rate/onoe/onoe.c optional ath_rate_onoe \
compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_rate/sample/sample.c optional ath_rate_sample \
compile-with "${NORMAL_C} -I$S/dev/ath"
# ath DFS modules
dev/ath/ath_dfs/null/dfs_null.c optional ath \
compile-with "${NORMAL_C} -I$S/dev/ath"
#
dev/bce/if_bce.c optional bce
dev/bfe/if_bfe.c optional bfe
dev/bge/if_bge.c optional bge
dev/bhnd/bhnd.c optional bhnd
dev/bhnd/bhnd_nexus.c optional bhnd siba_nexus | \
bhnd bcma_nexus
dev/bhnd/bhnd_subr.c optional bhnd
dev/bhnd/bhnd_bus_if.m optional bhnd
dev/bhnd/bhndb/bhnd_bhndb.c optional bhndb bhnd
dev/bhnd/bhndb/bhndb.c optional bhndb bhnd
dev/bhnd/bhndb/bhndb_bus_if.m optional bhndb bhnd
dev/bhnd/bhndb/bhndb_hwdata.c optional bhndb bhnd
dev/bhnd/bhndb/bhndb_if.m optional bhndb bhnd
dev/bhnd/bhndb/bhndb_pci.c optional bhndb bhnd pci
dev/bhnd/bhndb/bhndb_pci_hwdata.c optional bhndb bhnd pci
dev/bhnd/bhndb/bhndb_pci_sprom.c optional bhndb bhnd pci
dev/bhnd/bhndb/bhndb_subr.c optional bhndb bhnd
dev/bhnd/bcma/bcma.c optional bcma bhnd
dev/bhnd/bcma/bcma_bhndb.c optional bcma bhnd bhndb
dev/bhnd/bcma/bcma_erom.c optional bcma bhnd
dev/bhnd/bcma/bcma_nexus.c optional bcma_nexus bcma bhnd
dev/bhnd/bcma/bcma_subr.c optional bcma bhnd
dev/bhnd/cores/chipc/chipc.c optional bhnd
dev/bhnd/cores/chipc/chipc_cfi.c optional bhnd cfi
dev/bhnd/cores/chipc/chipc_slicer.c optional bhnd cfi | bhnd spibus
dev/bhnd/cores/chipc/chipc_spi.c optional bhnd spibus
dev/bhnd/cores/chipc/chipc_subr.c optional bhnd
dev/bhnd/cores/chipc/bhnd_chipc_if.m optional bhnd
dev/bhnd/cores/chipc/bhnd_sprom_chipc.c optional bhnd
dev/bhnd/cores/pci/bhnd_pci.c optional bhnd pci
dev/bhnd/cores/pci/bhnd_pci_hostb.c optional bhndb bhnd pci
dev/bhnd/cores/pci/bhnd_pcib.c optional bhnd_pcib bhnd pci
dev/bhnd/cores/pcie2/bhnd_pcie2.c optional bhnd pci
dev/bhnd/cores/pcie2/bhnd_pcie2_hostb.c optional bhndb bhnd pci
dev/bhnd/cores/pcie2/bhnd_pcie2b.c optional bhnd_pcie2b bhnd pci
dev/bhnd/nvram/bhnd_nvram_if.m optional bhnd
dev/bhnd/nvram/bhnd_sprom.c optional bhnd
dev/bhnd/nvram/bhnd_sprom_subr.c optional bhnd
dev/bhnd/nvram/nvram_subr.c optional bhnd
dev/bhnd/siba/siba.c optional siba bhnd
dev/bhnd/siba/siba_bhndb.c optional siba bhnd bhndb
dev/bhnd/siba/siba_nexus.c optional siba_nexus siba bhnd
dev/bhnd/siba/siba_subr.c optional siba bhnd
#
dev/bktr/bktr_audio.c optional bktr pci
dev/bktr/bktr_card.c optional bktr pci
dev/bktr/bktr_core.c optional bktr pci
dev/bktr/bktr_i2c.c optional bktr pci smbus
dev/bktr/bktr_os.c optional bktr pci
dev/bktr/bktr_tuner.c optional bktr pci
dev/bktr/msp34xx.c optional bktr pci
dev/bnxt/bnxt_hwrm.c optional bnxt iflib pci
dev/bnxt/bnxt_sysctl.c optional bnxt iflib pci
dev/bnxt/bnxt_txrx.c optional bnxt iflib pci
dev/bnxt/if_bnxt.c optional bnxt iflib pci
dev/buslogic/bt.c optional bt
dev/buslogic/bt_eisa.c optional bt eisa
dev/buslogic/bt_isa.c optional bt isa
dev/buslogic/bt_mca.c optional bt mca
dev/buslogic/bt_pci.c optional bt pci
dev/bwi/bwimac.c optional bwi
dev/bwi/bwiphy.c optional bwi
dev/bwi/bwirf.c optional bwi
dev/bwi/if_bwi.c optional bwi
dev/bwi/if_bwi_pci.c optional bwi pci
# XXX Work around clang warnings, until maintainer approves fix.
dev/bwn/if_bwn.c optional bwn siba_bwn \
compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
dev/bwn/if_bwn_pci.c optional bwn pci bhnd
dev/bwn/if_bwn_phy_common.c optional bwn siba_bwn
dev/bwn/if_bwn_phy_g.c optional bwn siba_bwn \
compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED} ${NO_WCONSTANT_CONVERSION}"
dev/bwn/if_bwn_phy_lp.c optional bwn siba_bwn \
compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
dev/bwn/if_bwn_phy_n.c optional bwn siba_bwn
dev/bwn/if_bwn_util.c optional bwn siba_bwn
dev/bwn/bwn_mac.c optional bwn bhnd
dev/cardbus/cardbus.c optional cardbus
dev/cardbus/cardbus_cis.c optional cardbus
dev/cardbus/cardbus_device.c optional cardbus
dev/cas/if_cas.c optional cas
dev/cfi/cfi_bus_fdt.c optional cfi fdt
dev/cfi/cfi_bus_nexus.c optional cfi
dev/cfi/cfi_core.c optional cfi
dev/cfi/cfi_dev.c optional cfi
dev/cfi/cfi_disk.c optional cfid
dev/chromebook_platform/chromebook_platform.c optional chromebook_platform
dev/ciss/ciss.c optional ciss
dev/cm/smc90cx6.c optional cm
dev/cmx/cmx.c optional cmx
dev/cmx/cmx_pccard.c optional cmx pccard
dev/cpufreq/ichss.c optional cpufreq
dev/cs/if_cs.c optional cs
dev/cs/if_cs_isa.c optional cs isa
dev/cs/if_cs_pccard.c optional cs pccard
dev/cxgb/cxgb_main.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/cxgb_sge.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_mc5.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_vsc7323.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_vsc8211.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_ael1002.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_aq100x.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_mv88e1xxx.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_xgmac.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_t3_hw.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_tn1010.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/sys/uipc_mvec.c optional cxgb pci \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/cxgb_t3fw.c optional cxgb cxgb_t3fw \
compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgbe/t4_if.m optional cxgbe pci
dev/cxgbe/t4_iov.c optional cxgbe pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_mp_ring.c optional cxgbe pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_main.c optional cxgbe pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_netmap.c optional cxgbe pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_sched.c optional cxgbe pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_sge.c optional cxgbe pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_l2t.c optional cxgbe pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_tracer.c optional cxgbe pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_vf.c optional cxgbev pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/common/t4_hw.c optional cxgbe pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/common/t4vf_hw.c optional cxgbev pci \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/cudbg/cudbg_common.c optional cxgbe \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/cudbg/cudbg_flash_utils.c optional cxgbe \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/cudbg/cudbg_lib.c optional cxgbe \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/cudbg/cudbg_wtp.c optional cxgbe \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/cudbg/fastlz.c optional cxgbe \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/cudbg/fastlz_api.c optional cxgbe \
compile-with "${NORMAL_C} -I$S/dev/cxgbe"
t4fw_cfg.c optional cxgbe \
compile-with "${AWK} -f $S/tools/fw_stub.awk t4fw_cfg.fw:t4fw_cfg t4fw_cfg_uwire.fw:t4fw_cfg_uwire t4fw.fw:t4fw -mt4fw_cfg -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "t4fw_cfg.c"
t4fw_cfg.fwo optional cxgbe \
dependency "t4fw_cfg.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "t4fw_cfg.fwo"
t4fw_cfg.fw optional cxgbe \
dependency "$S/dev/cxgbe/firmware/t4fw_cfg.txt" \
compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
no-obj no-implicit-rule \
clean "t4fw_cfg.fw"
t4fw_cfg_uwire.fwo optional cxgbe \
dependency "t4fw_cfg_uwire.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "t4fw_cfg_uwire.fwo"
t4fw_cfg_uwire.fw optional cxgbe \
dependency "$S/dev/cxgbe/firmware/t4fw_cfg_uwire.txt" \
compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
no-obj no-implicit-rule \
clean "t4fw_cfg_uwire.fw"
t4fw.fwo optional cxgbe \
dependency "t4fw.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "t4fw.fwo"
t4fw.fw optional cxgbe \
dependency "$S/dev/cxgbe/firmware/t4fw-1.16.63.0.bin.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "t4fw.fw"
t5fw_cfg.c optional cxgbe \
compile-with "${AWK} -f $S/tools/fw_stub.awk t5fw_cfg.fw:t5fw_cfg t5fw_cfg_uwire.fw:t5fw_cfg_uwire t5fw.fw:t5fw -mt5fw_cfg -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "t5fw_cfg.c"
t5fw_cfg.fwo optional cxgbe \
dependency "t5fw_cfg.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "t5fw_cfg.fwo"
t5fw_cfg.fw optional cxgbe \
dependency "$S/dev/cxgbe/firmware/t5fw_cfg.txt" \
compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
no-obj no-implicit-rule \
clean "t5fw_cfg.fw"
t5fw_cfg_uwire.fwo optional cxgbe \
dependency "t5fw_cfg_uwire.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "t5fw_cfg_uwire.fwo"
t5fw_cfg_uwire.fw optional cxgbe \
dependency "$S/dev/cxgbe/firmware/t5fw_cfg_uwire.txt" \
compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
no-obj no-implicit-rule \
clean "t5fw_cfg_uwire.fw"
t5fw.fwo optional cxgbe \
dependency "t5fw.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "t5fw.fwo"
t5fw.fw optional cxgbe \
dependency "$S/dev/cxgbe/firmware/t5fw-1.16.63.0.bin.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "t5fw.fw"
t6fw_cfg.c optional cxgbe \
compile-with "${AWK} -f $S/tools/fw_stub.awk t6fw_cfg.fw:t6fw_cfg t6fw_cfg_uwire.fw:t6fw_cfg_uwire t6fw.fw:t6fw -mt6fw_cfg -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "t6fw_cfg.c"
t6fw_cfg.fwo optional cxgbe \
dependency "t6fw_cfg.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "t6fw_cfg.fwo"
t6fw_cfg.fw optional cxgbe \
dependency "$S/dev/cxgbe/firmware/t6fw_cfg.txt" \
compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
no-obj no-implicit-rule \
clean "t6fw_cfg.fw"
t6fw_cfg_uwire.fwo optional cxgbe \
dependency "t6fw_cfg_uwire.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "t6fw_cfg_uwire.fwo"
t6fw_cfg_uwire.fw optional cxgbe \
dependency "$S/dev/cxgbe/firmware/t6fw_cfg_uwire.txt" \
compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
no-obj no-implicit-rule \
clean "t6fw_cfg_uwire.fw"
t6fw.fwo optional cxgbe \
dependency "t6fw.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "t6fw.fwo"
t6fw.fw optional cxgbe \
dependency "$S/dev/cxgbe/firmware/t6fw-1.16.63.0.bin.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "t6fw.fw"
dev/cy/cy.c optional cy
dev/cy/cy_isa.c optional cy isa
dev/cy/cy_pci.c optional cy pci
dev/cyapa/cyapa.c optional cyapa iicbus
dev/dc/if_dc.c optional dc pci
dev/dc/dcphy.c optional dc pci
dev/dc/pnphy.c optional dc pci
dev/dcons/dcons.c optional dcons
dev/dcons/dcons_crom.c optional dcons_crom
dev/dcons/dcons_os.c optional dcons
dev/de/if_de.c optional de pci
dev/digi/CX.c optional digi_CX
dev/digi/CX_PCI.c optional digi_CX_PCI
dev/digi/EPCX.c optional digi_EPCX
dev/digi/EPCX_PCI.c optional digi_EPCX_PCI
dev/digi/Xe.c optional digi_Xe
dev/digi/Xem.c optional digi_Xem
dev/digi/Xr.c optional digi_Xr
dev/digi/digi.c optional digi
dev/digi/digi_isa.c optional digi isa
dev/digi/digi_pci.c optional digi pci
dev/dpt/dpt_eisa.c optional dpt eisa
dev/dpt/dpt_pci.c optional dpt pci
dev/dpt/dpt_scsi.c optional dpt
dev/drm/ati_pcigart.c optional drm
dev/drm/drm_agpsupport.c optional drm
dev/drm/drm_auth.c optional drm
dev/drm/drm_bufs.c optional drm
dev/drm/drm_context.c optional drm
dev/drm/drm_dma.c optional drm
dev/drm/drm_drawable.c optional drm
dev/drm/drm_drv.c optional drm
dev/drm/drm_fops.c optional drm
dev/drm/drm_hashtab.c optional drm
dev/drm/drm_ioctl.c optional drm
dev/drm/drm_irq.c optional drm
dev/drm/drm_lock.c optional drm
dev/drm/drm_memory.c optional drm
dev/drm/drm_mm.c optional drm
dev/drm/drm_pci.c optional drm
dev/drm/drm_scatter.c optional drm
dev/drm/drm_sman.c optional drm
dev/drm/drm_sysctl.c optional drm
dev/drm/drm_vm.c optional drm
dev/drm/i915_dma.c optional i915drm
dev/drm/i915_drv.c optional i915drm
dev/drm/i915_irq.c optional i915drm
dev/drm/i915_mem.c optional i915drm
dev/drm/i915_suspend.c optional i915drm
dev/drm/mach64_dma.c optional mach64drm
dev/drm/mach64_drv.c optional mach64drm
dev/drm/mach64_irq.c optional mach64drm
dev/drm/mach64_state.c optional mach64drm
dev/drm/mga_dma.c optional mgadrm
dev/drm/mga_drv.c optional mgadrm
dev/drm/mga_irq.c optional mgadrm
dev/drm/mga_state.c optional mgadrm
dev/drm/mga_warp.c optional mgadrm
dev/drm/r128_cce.c optional r128drm \
compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
dev/drm/r128_drv.c optional r128drm
dev/drm/r128_irq.c optional r128drm
dev/drm/r128_state.c optional r128drm
dev/drm/r300_cmdbuf.c optional radeondrm
dev/drm/r600_blit.c optional radeondrm
dev/drm/r600_cp.c optional radeondrm \
compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
dev/drm/radeon_cp.c optional radeondrm \
compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
dev/drm/radeon_cs.c optional radeondrm
dev/drm/radeon_drv.c optional radeondrm
dev/drm/radeon_irq.c optional radeondrm
dev/drm/radeon_mem.c optional radeondrm
dev/drm/radeon_state.c optional radeondrm
dev/drm/savage_bci.c optional savagedrm
dev/drm/savage_drv.c optional savagedrm
dev/drm/savage_state.c optional savagedrm
dev/drm/sis_drv.c optional sisdrm
dev/drm/sis_ds.c optional sisdrm
dev/drm/sis_mm.c optional sisdrm
dev/drm/tdfx_drv.c optional tdfxdrm
dev/drm/via_dma.c optional viadrm
dev/drm/via_dmablit.c optional viadrm
dev/drm/via_drv.c optional viadrm
dev/drm/via_irq.c optional viadrm
dev/drm/via_map.c optional viadrm
dev/drm/via_mm.c optional viadrm
dev/drm/via_verifier.c optional viadrm
dev/drm/via_video.c optional viadrm
dev/drm2/drm_agpsupport.c optional drm2
dev/drm2/drm_auth.c optional drm2
dev/drm2/drm_bufs.c optional drm2
dev/drm2/drm_buffer.c optional drm2
dev/drm2/drm_context.c optional drm2
dev/drm2/drm_crtc.c optional drm2
dev/drm2/drm_crtc_helper.c optional drm2
dev/drm2/drm_dma.c optional drm2
dev/drm2/drm_dp_helper.c optional drm2
dev/drm2/drm_dp_iic_helper.c optional drm2
dev/drm2/drm_drv.c optional drm2
dev/drm2/drm_edid.c optional drm2
dev/drm2/drm_fb_helper.c optional drm2
dev/drm2/drm_fops.c optional drm2
dev/drm2/drm_gem.c optional drm2
dev/drm2/drm_gem_names.c optional drm2
dev/drm2/drm_global.c optional drm2
dev/drm2/drm_hashtab.c optional drm2
dev/drm2/drm_ioctl.c optional drm2
dev/drm2/drm_irq.c optional drm2
dev/drm2/drm_linux_list_sort.c optional drm2
dev/drm2/drm_lock.c optional drm2
dev/drm2/drm_memory.c optional drm2
dev/drm2/drm_mm.c optional drm2
dev/drm2/drm_modes.c optional drm2
dev/drm2/drm_pci.c optional drm2
dev/drm2/drm_platform.c optional drm2
dev/drm2/drm_scatter.c optional drm2
dev/drm2/drm_stub.c optional drm2
dev/drm2/drm_sysctl.c optional drm2
dev/drm2/drm_vm.c optional drm2
dev/drm2/drm_os_freebsd.c optional drm2
dev/drm2/ttm/ttm_agp_backend.c optional drm2
dev/drm2/ttm/ttm_lock.c optional drm2
dev/drm2/ttm/ttm_object.c optional drm2
dev/drm2/ttm/ttm_tt.c optional drm2
dev/drm2/ttm/ttm_bo_util.c optional drm2
dev/drm2/ttm/ttm_bo.c optional drm2
dev/drm2/ttm/ttm_bo_manager.c optional drm2
dev/drm2/ttm/ttm_execbuf_util.c optional drm2
dev/drm2/ttm/ttm_memory.c optional drm2
dev/drm2/ttm/ttm_page_alloc.c optional drm2
dev/drm2/ttm/ttm_bo_vm.c optional drm2
dev/drm2/ati_pcigart.c optional drm2 agp pci
dev/ed/if_ed.c optional ed
dev/ed/if_ed_novell.c optional ed
dev/ed/if_ed_rtl80x9.c optional ed
dev/ed/if_ed_pccard.c optional ed pccard
dev/ed/if_ed_pci.c optional ed pci
dev/efidev/efidev.c optional efirt
dev/efidev/efirt.c optional efirt
dev/efidev/efirtc.c optional efirt
dev/eisa/eisa_if.m standard
dev/eisa/eisaconf.c optional eisa
dev/e1000/if_em.c optional em \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/if_lem.c optional em \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/if_igb.c optional igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_80003es2lan.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82540.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82541.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82542.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82543.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82571.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82575.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_ich8lan.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_i210.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_api.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_mac.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_manage.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_nvm.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_phy.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_vf.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_mbx.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_osdep.c optional em | igb \
compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/et/if_et.c optional et
dev/ena/ena.c optional ena \
compile-with "${NORMAL_C} -I$S/contrib"
dev/ena/ena_sysctl.c optional ena \
compile-with "${NORMAL_C} -I$S/contrib"
contrib/ena-com/ena_com.c optional ena
contrib/ena-com/ena_eth_com.c optional ena
dev/en/if_en_pci.c optional en pci
dev/en/midway.c optional en
dev/ep/if_ep.c optional ep
dev/ep/if_ep_eisa.c optional ep eisa
dev/ep/if_ep_isa.c optional ep isa
dev/ep/if_ep_mca.c optional ep mca
dev/ep/if_ep_pccard.c optional ep pccard
dev/esp/esp_pci.c optional esp pci
dev/esp/ncr53c9x.c optional esp
dev/etherswitch/arswitch/arswitch.c optional arswitch
dev/etherswitch/arswitch/arswitch_reg.c optional arswitch
dev/etherswitch/arswitch/arswitch_phy.c optional arswitch
dev/etherswitch/arswitch/arswitch_8216.c optional arswitch
dev/etherswitch/arswitch/arswitch_8226.c optional arswitch
dev/etherswitch/arswitch/arswitch_8316.c optional arswitch
dev/etherswitch/arswitch/arswitch_8327.c optional arswitch
dev/etherswitch/arswitch/arswitch_7240.c optional arswitch
dev/etherswitch/arswitch/arswitch_9340.c optional arswitch
dev/etherswitch/arswitch/arswitch_vlans.c optional arswitch
dev/etherswitch/etherswitch.c optional etherswitch
dev/etherswitch/etherswitch_if.m optional etherswitch
dev/etherswitch/ip17x/ip17x.c optional ip17x
dev/etherswitch/ip17x/ip175c.c optional ip17x
dev/etherswitch/ip17x/ip175d.c optional ip17x
dev/etherswitch/ip17x/ip17x_phy.c optional ip17x
dev/etherswitch/ip17x/ip17x_vlans.c optional ip17x
dev/etherswitch/miiproxy.c optional miiproxy
dev/etherswitch/rtl8366/rtl8366rb.c optional rtl8366rb
dev/etherswitch/ukswitch/ukswitch.c optional ukswitch
dev/evdev/cdev.c optional evdev
dev/evdev/evdev.c optional evdev
dev/evdev/evdev_mt.c optional evdev
dev/evdev/evdev_utils.c optional evdev
dev/evdev/uinput.c optional evdev uinput
dev/ex/if_ex.c optional ex
dev/ex/if_ex_isa.c optional ex isa
dev/ex/if_ex_pccard.c optional ex pccard
dev/exca/exca.c optional cbb
dev/extres/clk/clk.c optional ext_resources clk
dev/extres/clk/clkdev_if.m optional ext_resources clk
dev/extres/clk/clknode_if.m optional ext_resources clk
dev/extres/clk/clk_bus.c optional ext_resources clk fdt
dev/extres/clk/clk_div.c optional ext_resources clk
dev/extres/clk/clk_fixed.c optional ext_resources clk
dev/extres/clk/clk_gate.c optional ext_resources clk
dev/extres/clk/clk_mux.c optional ext_resources clk
dev/extres/phy/phy.c optional ext_resources phy
dev/extres/phy/phydev_if.m optional ext_resources phy
dev/extres/phy/phynode_if.m optional ext_resources phy
dev/extres/hwreset/hwreset.c optional ext_resources hwreset
dev/extres/hwreset/hwreset_if.m optional ext_resources hwreset
dev/extres/regulator/regdev_if.m optional ext_resources regulator
dev/extres/regulator/regnode_if.m optional ext_resources regulator
dev/extres/regulator/regulator.c optional ext_resources regulator
dev/extres/regulator/regulator_bus.c optional ext_resources regulator fdt
dev/extres/regulator/regulator_fixed.c optional ext_resources regulator
dev/fatm/if_fatm.c optional fatm pci
dev/fb/fbd.c optional fbd | vt
dev/fb/fb_if.m standard
dev/fb/splash.c optional sc splash
dev/fdt/fdt_clock.c optional fdt fdt_clock
dev/fdt/fdt_clock_if.m optional fdt fdt_clock
dev/fdt/fdt_common.c optional fdt
dev/fdt/fdt_pinctrl.c optional fdt fdt_pinctrl
dev/fdt/fdt_pinctrl_if.m optional fdt fdt_pinctrl
dev/fdt/fdt_slicer.c optional fdt cfi | fdt nand | fdt mx25l
dev/fdt/fdt_static_dtb.S optional fdt fdt_dtb_static \
dependency "fdt_dtb_file"
dev/fdt/simplebus.c optional fdt
dev/fe/if_fe.c optional fe
dev/fe/if_fe_pccard.c optional fe pccard
dev/filemon/filemon.c optional filemon
dev/firewire/firewire.c optional firewire
dev/firewire/fwcrom.c optional firewire
dev/firewire/fwdev.c optional firewire
dev/firewire/fwdma.c optional firewire
dev/firewire/fwmem.c optional firewire
dev/firewire/fwohci.c optional firewire
dev/firewire/fwohci_pci.c optional firewire pci
dev/firewire/if_fwe.c optional fwe
dev/firewire/if_fwip.c optional fwip
dev/firewire/sbp.c optional sbp
dev/firewire/sbp_targ.c optional sbp_targ
dev/flash/at45d.c optional at45d
dev/flash/mx25l.c optional mx25l
dev/fxp/if_fxp.c optional fxp
dev/fxp/inphy.c optional fxp
dev/gem/if_gem.c optional gem
dev/gem/if_gem_pci.c optional gem pci
dev/gem/if_gem_sbus.c optional gem sbus
dev/gpio/gpiobacklight.c optional gpiobacklight fdt
dev/gpio/gpiokeys.c optional gpiokeys fdt
dev/gpio/gpiokeys_codes.c optional gpiokeys fdt
dev/gpio/gpiobus.c optional gpio \
dependency "gpiobus_if.h"
dev/gpio/gpioc.c optional gpio \
dependency "gpio_if.h"
dev/gpio/gpioiic.c optional gpioiic
dev/gpio/gpioled.c optional gpioled !fdt
dev/gpio/gpioled_fdt.c optional gpioled fdt
dev/gpio/gpiospi.c optional gpiospi
dev/gpio/gpio_if.m optional gpio
dev/gpio/gpiobus_if.m optional gpio
dev/gpio/gpiopps.c optional gpiopps
dev/gpio/ofw_gpiobus.c optional fdt gpio
dev/hatm/if_hatm.c optional hatm pci
dev/hatm/if_hatm_intr.c optional hatm pci
dev/hatm/if_hatm_ioctl.c optional hatm pci
dev/hatm/if_hatm_rx.c optional hatm pci
dev/hatm/if_hatm_tx.c optional hatm pci
dev/hifn/hifn7751.c optional hifn
dev/hme/if_hme.c optional hme
dev/hme/if_hme_pci.c optional hme pci
dev/hme/if_hme_sbus.c optional hme sbus
dev/hptiop/hptiop.c optional hptiop scbus
dev/hwpmc/hwpmc_logging.c optional hwpmc
dev/hwpmc/hwpmc_mod.c optional hwpmc
dev/hwpmc/hwpmc_soft.c optional hwpmc
dev/ichiic/ig4_acpi.c optional ig4 acpi iicbus
dev/ichiic/ig4_iic.c optional ig4 iicbus
dev/ichiic/ig4_pci.c optional ig4 pci iicbus
dev/ichsmb/ichsmb.c optional ichsmb
dev/ichsmb/ichsmb_pci.c optional ichsmb pci
dev/ida/ida.c optional ida
dev/ida/ida_disk.c optional ida
dev/ida/ida_eisa.c optional ida eisa
dev/ida/ida_pci.c optional ida pci
dev/ie/if_ie.c optional ie isa nowerror
dev/ie/if_ie_isa.c optional ie isa
dev/iicbus/ad7418.c optional ad7418
dev/iicbus/ds1307.c optional ds1307
dev/iicbus/ds13rtc.c optional ds13rtc | ds133x | ds1374
dev/iicbus/ds1672.c optional ds1672
dev/iicbus/ds3231.c optional ds3231
dev/iicbus/rtc8583.c optional rtc8583
dev/iicbus/icee.c optional icee
dev/iicbus/if_ic.c optional ic
dev/iicbus/iic.c optional iic
dev/iicbus/iic_recover_bus.c optional iicbus
dev/iicbus/iicbb.c optional iicbb
dev/iicbus/iicbb_if.m optional iicbb
dev/iicbus/iicbus.c optional iicbus
dev/iicbus/iicbus_if.m optional iicbus
dev/iicbus/iiconf.c optional iicbus
dev/iicbus/iicsmb.c optional iicsmb \
dependency "iicbus_if.h"
dev/iicbus/iicoc.c optional iicoc
dev/iicbus/isl12xx.c optional isl12xx
dev/iicbus/lm75.c optional lm75
dev/iicbus/nxprtc.c optional nxprtc | pcf8563
dev/iicbus/ofw_iicbus.c optional fdt iicbus
dev/iicbus/s35390a.c optional s35390a
dev/iir/iir.c optional iir
dev/iir/iir_ctrl.c optional iir
dev/iir/iir_pci.c optional iir pci
dev/intpm/intpm.c optional intpm pci
# XXX Work around clang warning, until maintainer approves fix.
dev/ips/ips.c optional ips \
compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
dev/ips/ips_commands.c optional ips
dev/ips/ips_disk.c optional ips
dev/ips/ips_ioctl.c optional ips
dev/ips/ips_pci.c optional ips pci
dev/ipw/if_ipw.c optional ipw
ipwbssfw.c optional ipwbssfw | ipwfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk ipw_bss.fw:ipw_bss:130 -lintel_ipw -mipw_bss -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "ipwbssfw.c"
ipw_bss.fwo optional ipwbssfw | ipwfw \
dependency "ipw_bss.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "ipw_bss.fwo"
ipw_bss.fw optional ipwbssfw | ipwfw \
dependency "$S/contrib/dev/ipw/ipw2100-1.3.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "ipw_bss.fw"
ipwibssfw.c optional ipwibssfw | ipwfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk ipw_ibss.fw:ipw_ibss:130 -lintel_ipw -mipw_ibss -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "ipwibssfw.c"
ipw_ibss.fwo optional ipwibssfw | ipwfw \
dependency "ipw_ibss.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "ipw_ibss.fwo"
ipw_ibss.fw optional ipwibssfw | ipwfw \
dependency "$S/contrib/dev/ipw/ipw2100-1.3-i.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "ipw_ibss.fw"
ipwmonitorfw.c optional ipwmonitorfw | ipwfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk ipw_monitor.fw:ipw_monitor:130 -lintel_ipw -mipw_monitor -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "ipwmonitorfw.c"
ipw_monitor.fwo optional ipwmonitorfw | ipwfw \
dependency "ipw_monitor.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "ipw_monitor.fwo"
ipw_monitor.fw optional ipwmonitorfw | ipwfw \
dependency "$S/contrib/dev/ipw/ipw2100-1.3-p.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "ipw_monitor.fw"
dev/iscsi/icl.c optional iscsi
dev/iscsi/icl_conn_if.m optional cfiscsi | iscsi
dev/iscsi/icl_soft.c optional iscsi
dev/iscsi/icl_soft_proxy.c optional iscsi
dev/iscsi/iscsi.c optional iscsi scbus
dev/iscsi_initiator/iscsi.c optional iscsi_initiator scbus
dev/iscsi_initiator/iscsi_subr.c optional iscsi_initiator scbus
dev/iscsi_initiator/isc_cam.c optional iscsi_initiator scbus
dev/iscsi_initiator/isc_soc.c optional iscsi_initiator scbus
dev/iscsi_initiator/isc_sm.c optional iscsi_initiator scbus
dev/iscsi_initiator/isc_subr.c optional iscsi_initiator scbus
dev/ismt/ismt.c optional ismt
dev/isl/isl.c optional isl iicbus
dev/isp/isp.c optional isp
dev/isp/isp_freebsd.c optional isp
dev/isp/isp_library.c optional isp
dev/isp/isp_pci.c optional isp pci
dev/isp/isp_sbus.c optional isp sbus
dev/isp/isp_target.c optional isp
dev/ispfw/ispfw.c optional ispfw
dev/iwi/if_iwi.c optional iwi
iwibssfw.c optional iwibssfw | iwifw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwi_bss.fw:iwi_bss:300 -lintel_iwi -miwi_bss -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwibssfw.c"
iwi_bss.fwo optional iwibssfw | iwifw \
dependency "iwi_bss.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwi_bss.fwo"
iwi_bss.fw optional iwibssfw | iwifw \
dependency "$S/contrib/dev/iwi/ipw2200-bss.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwi_bss.fw"
iwiibssfw.c optional iwiibssfw | iwifw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwi_ibss.fw:iwi_ibss:300 -lintel_iwi -miwi_ibss -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwiibssfw.c"
iwi_ibss.fwo optional iwiibssfw | iwifw \
dependency "iwi_ibss.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwi_ibss.fwo"
iwi_ibss.fw optional iwiibssfw | iwifw \
dependency "$S/contrib/dev/iwi/ipw2200-ibss.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwi_ibss.fw"
iwimonitorfw.c optional iwimonitorfw | iwifw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwi_monitor.fw:iwi_monitor:300 -lintel_iwi -miwi_monitor -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwimonitorfw.c"
iwi_monitor.fwo optional iwimonitorfw | iwifw \
dependency "iwi_monitor.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwi_monitor.fwo"
iwi_monitor.fw optional iwimonitorfw | iwifw \
dependency "$S/contrib/dev/iwi/ipw2200-sniffer.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwi_monitor.fw"
dev/iwm/if_iwm.c optional iwm
dev/iwm/if_iwm_7000.c optional iwm
dev/iwm/if_iwm_8000.c optional iwm
dev/iwm/if_iwm_binding.c optional iwm
dev/iwm/if_iwm_fw.c optional iwm
dev/iwm/if_iwm_led.c optional iwm
dev/iwm/if_iwm_mac_ctxt.c optional iwm
dev/iwm/if_iwm_notif_wait.c optional iwm
dev/iwm/if_iwm_pcie_trans.c optional iwm
dev/iwm/if_iwm_phy_ctxt.c optional iwm
dev/iwm/if_iwm_phy_db.c optional iwm
dev/iwm/if_iwm_power.c optional iwm
dev/iwm/if_iwm_scan.c optional iwm
dev/iwm/if_iwm_sta.c optional iwm
dev/iwm/if_iwm_time_event.c optional iwm
dev/iwm/if_iwm_util.c optional iwm
iwm3160fw.c optional iwm3160fw | iwmfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwm3160.fw:iwm3160fw -miwm3160fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwm3160fw.c"
iwm3160fw.fwo optional iwm3160fw | iwmfw \
dependency "iwm3160.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwm3160fw.fwo"
iwm3160.fw optional iwm3160fw | iwmfw \
dependency "$S/contrib/dev/iwm/iwm-3160-17.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwm3160.fw"
iwm7260fw.c optional iwm7260fw | iwmfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwm7260.fw:iwm7260fw -miwm7260fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwm7260fw.c"
iwm7260fw.fwo optional iwm7260fw | iwmfw \
dependency "iwm7260.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwm7260fw.fwo"
iwm7260.fw optional iwm7260fw | iwmfw \
dependency "$S/contrib/dev/iwm/iwm-7260-17.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwm7260.fw"
iwm7265fw.c optional iwm7265fw | iwmfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwm7265.fw:iwm7265fw -miwm7265fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwm7265fw.c"
iwm7265fw.fwo optional iwm7265fw | iwmfw \
dependency "iwm7265.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwm7265fw.fwo"
iwm7265.fw optional iwm7265fw | iwmfw \
dependency "$S/contrib/dev/iwm/iwm-7265-17.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwm7265.fw"
iwm7265Dfw.c optional iwm7265Dfw | iwmfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwm7265D.fw:iwm7265Dfw -miwm7265Dfw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwm7265Dfw.c"
iwm7265Dfw.fwo optional iwm7265Dfw | iwmfw \
dependency "iwm7265D.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwm7265Dfw.fwo"
iwm7265D.fw optional iwm7265Dfw | iwmfw \
dependency "$S/contrib/dev/iwm/iwm-7265D-17.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwm7265D.fw"
iwm8000Cfw.c optional iwm8000Cfw | iwmfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwm8000C.fw:iwm8000Cfw -miwm8000Cfw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwm8000Cfw.c"
iwm8000Cfw.fwo optional iwm8000Cfw | iwmfw \
dependency "iwm8000C.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwm8000Cfw.fwo"
iwm8000C.fw optional iwm8000Cfw | iwmfw \
dependency "$S/contrib/dev/iwm/iwm-8000C-17.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwm8000C.fw"
iwm8265.fw optional iwm8265fw | iwmfw \
dependency "$S/contrib/dev/iwm/iwm-8265-22.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwm8265.fw"
iwm8265fw.c optional iwm8265fw | iwmfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwm8265.fw:iwm8265fw -miwm8265fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwm8265fw.c"
iwm8265fw.fwo optional iwm8265fw | iwmfw \
dependency "iwm8265.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwm8265fw.fwo"
dev/iwn/if_iwn.c optional iwn
iwn1000fw.c optional iwn1000fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn1000.fw:iwn1000fw -miwn1000fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn1000fw.c"
iwn1000fw.fwo optional iwn1000fw | iwnfw \
dependency "iwn1000.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn1000fw.fwo"
iwn1000.fw optional iwn1000fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-1000-39.31.5.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn1000.fw"
iwn100fw.c optional iwn100fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn100.fw:iwn100fw -miwn100fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn100fw.c"
iwn100fw.fwo optional iwn100fw | iwnfw \
dependency "iwn100.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn100fw.fwo"
iwn100.fw optional iwn100fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-100-39.31.5.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn100.fw"
iwn105fw.c optional iwn105fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn105.fw:iwn105fw -miwn105fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn105fw.c"
iwn105fw.fwo optional iwn105fw | iwnfw \
dependency "iwn105.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn105fw.fwo"
iwn105.fw optional iwn105fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-105-6-18.168.6.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn105.fw"
iwn135fw.c optional iwn135fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn135.fw:iwn135fw -miwn135fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn135fw.c"
iwn135fw.fwo optional iwn135fw | iwnfw \
dependency "iwn135.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn135fw.fwo"
iwn135.fw optional iwn135fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-135-6-18.168.6.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn135.fw"
iwn2000fw.c optional iwn2000fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn2000.fw:iwn2000fw -miwn2000fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn2000fw.c"
iwn2000fw.fwo optional iwn2000fw | iwnfw \
dependency "iwn2000.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn2000fw.fwo"
iwn2000.fw optional iwn2000fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-2000-18.168.6.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn2000.fw"
iwn2030fw.c optional iwn2030fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn2030.fw:iwn2030fw -miwn2030fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn2030fw.c"
iwn2030fw.fwo optional iwn2030fw | iwnfw \
dependency "iwn2030.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn2030fw.fwo"
iwn2030.fw optional iwn2030fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwnwifi-2030-18.168.6.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn2030.fw"
iwn4965fw.c optional iwn4965fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn4965.fw:iwn4965fw -miwn4965fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn4965fw.c"
iwn4965fw.fwo optional iwn4965fw | iwnfw \
dependency "iwn4965.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn4965fw.fwo"
iwn4965.fw optional iwn4965fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-4965-228.61.2.24.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn4965.fw"
iwn5000fw.c optional iwn5000fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn5000.fw:iwn5000fw -miwn5000fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn5000fw.c"
iwn5000fw.fwo optional iwn5000fw | iwnfw \
dependency "iwn5000.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn5000fw.fwo"
iwn5000.fw optional iwn5000fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-5000-8.83.5.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn5000.fw"
iwn5150fw.c optional iwn5150fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn5150.fw:iwn5150fw -miwn5150fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn5150fw.c"
iwn5150fw.fwo optional iwn5150fw | iwnfw \
dependency "iwn5150.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn5150fw.fwo"
iwn5150.fw optional iwn5150fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-5150-8.24.2.2.fw.uu"\
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn5150.fw"
iwn6000fw.c optional iwn6000fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6000.fw:iwn6000fw -miwn6000fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn6000fw.c"
iwn6000fw.fwo optional iwn6000fw | iwnfw \
dependency "iwn6000.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn6000fw.fwo"
iwn6000.fw optional iwn6000fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-6000-9.221.4.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn6000.fw"
iwn6000g2afw.c optional iwn6000g2afw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6000g2a.fw:iwn6000g2afw -miwn6000g2afw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn6000g2afw.c"
iwn6000g2afw.fwo optional iwn6000g2afw | iwnfw \
dependency "iwn6000g2a.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn6000g2afw.fwo"
iwn6000g2a.fw optional iwn6000g2afw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-6000g2a-18.168.6.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn6000g2a.fw"
iwn6000g2bfw.c optional iwn6000g2bfw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6000g2b.fw:iwn6000g2bfw -miwn6000g2bfw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn6000g2bfw.c"
iwn6000g2bfw.fwo optional iwn6000g2bfw | iwnfw \
dependency "iwn6000g2b.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn6000g2bfw.fwo"
iwn6000g2b.fw optional iwn6000g2bfw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-6000g2b-18.168.6.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn6000g2b.fw"
iwn6050fw.c optional iwn6050fw | iwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6050.fw:iwn6050fw -miwn6050fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "iwn6050fw.c"
iwn6050fw.fwo optional iwn6050fw | iwnfw \
dependency "iwn6050.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "iwn6050fw.fwo"
iwn6050.fw optional iwn6050fw | iwnfw \
dependency "$S/contrib/dev/iwn/iwlwifi-6050-41.28.5.1.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "iwn6050.fw"
dev/ixgb/if_ixgb.c optional ixgb
dev/ixgb/ixgb_ee.c optional ixgb
dev/ixgb/ixgb_hw.c optional ixgb
dev/ixgbe/if_ix.c optional ix inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe -DSMP"
dev/ixgbe/if_ixv.c optional ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe -DSMP"
dev/ixgbe/if_bypass.c optional ix inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_netmap.c optional ix inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/if_fdir.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/if_sriov.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ix_txrx.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_osdep.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_phy.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_api.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_common.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_mbx.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_vf.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_82598.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_82599.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_x540.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_x550.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_dcb.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_dcb_82598.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_dcb_82599.c optional ix inet | ixv inet \
compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/jedec_dimm/jedec_dimm.c optional jedec_dimm smbus
dev/jedec_ts/jedec_ts.c optional jedec_ts smbus
dev/jme/if_jme.c optional jme pci
dev/joy/joy.c optional joy
dev/joy/joy_isa.c optional joy isa
dev/kbd/kbd.c optional atkbd | pckbd | sc | ukbd | vt
dev/kbdmux/kbdmux.c optional kbdmux
dev/ksyms/ksyms.c optional ksyms
dev/le/am7990.c optional le
dev/le/am79900.c optional le
dev/le/if_le_pci.c optional le pci
dev/le/lance.c optional le
dev/led/led.c standard
dev/lge/if_lge.c optional lge
dev/liquidio/base/cn23xx_pf_device.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/base/lio_console.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/base/lio_ctrl.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/base/lio_device.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/base/lio_droq.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/base/lio_mem_ops.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/base/lio_request_manager.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/base/lio_response_manager.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/lio_core.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/lio_ioctl.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/lio_main.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/lio_rss.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/lio_rxtx.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
dev/liquidio/lio_sysctl.c optional lio \
compile-with "${NORMAL_C} \
-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
lio.c optional lio \
compile-with "${AWK} -f $S/tools/fw_stub.awk lio_23xx_nic.bin.fw:lio_23xx_nic.bin -mlio_23xx_nic.bin -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "lio.c"
lio_23xx_nic.bin.fw.fwo optional lio \
dependency "lio_23xx_nic.bin.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "lio_23xx_nic.bin.fw.fwo"
lio_23xx_nic.bin.fw optional lio \
dependency "$S/contrib/dev/liquidio/lio_23xx_nic.bin.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "lio_23xx_nic.bin.fw"
dev/lmc/if_lmc.c optional lmc
dev/malo/if_malo.c optional malo
dev/malo/if_malohal.c optional malo
dev/malo/if_malo_pci.c optional malo pci
dev/mc146818/mc146818.c optional mc146818
dev/mca/mca_bus.c optional mca
dev/mcd/mcd.c optional mcd isa nowerror
dev/mcd/mcd_isa.c optional mcd isa nowerror
dev/md/md.c optional md
dev/mdio/mdio_if.m optional miiproxy | mdio
dev/mdio/mdio.c optional miiproxy | mdio
dev/mem/memdev.c optional mem
dev/mem/memutil.c optional mem
dev/mfi/mfi.c optional mfi
dev/mfi/mfi_debug.c optional mfi
dev/mfi/mfi_pci.c optional mfi pci
dev/mfi/mfi_disk.c optional mfi
dev/mfi/mfi_syspd.c optional mfi
dev/mfi/mfi_tbolt.c optional mfi
dev/mfi/mfi_linux.c optional mfi compat_linux
dev/mfi/mfi_cam.c optional mfip scbus
dev/mii/acphy.c optional miibus | acphy
dev/mii/amphy.c optional miibus | amphy
dev/mii/atphy.c optional miibus | atphy
dev/mii/axphy.c optional miibus | axphy
dev/mii/bmtphy.c optional miibus | bmtphy
dev/mii/brgphy.c optional miibus | brgphy
dev/mii/ciphy.c optional miibus | ciphy
dev/mii/e1000phy.c optional miibus | e1000phy
dev/mii/gentbi.c optional miibus | gentbi
dev/mii/icsphy.c optional miibus | icsphy
dev/mii/ip1000phy.c optional miibus | ip1000phy
dev/mii/jmphy.c optional miibus | jmphy
dev/mii/lxtphy.c optional miibus | lxtphy
dev/mii/micphy.c optional miibus fdt | micphy fdt
dev/mii/mii.c optional miibus | mii
dev/mii/mii_bitbang.c optional miibus | mii_bitbang
dev/mii/mii_physubr.c optional miibus | mii
dev/mii/mii_fdt.c optional miibus fdt | mii fdt
dev/mii/miibus_if.m optional miibus | mii
dev/mii/mlphy.c optional miibus | mlphy
dev/mii/nsgphy.c optional miibus | nsgphy
dev/mii/nsphy.c optional miibus | nsphy
dev/mii/nsphyter.c optional miibus | nsphyter
dev/mii/pnaphy.c optional miibus | pnaphy
dev/mii/qsphy.c optional miibus | qsphy
dev/mii/rdcphy.c optional miibus | rdcphy
dev/mii/rgephy.c optional miibus | rgephy
dev/mii/rlphy.c optional miibus | rlphy
dev/mii/rlswitch.c optional rlswitch
dev/mii/smcphy.c optional miibus | smcphy
dev/mii/smscphy.c optional miibus | smscphy
dev/mii/tdkphy.c optional miibus | tdkphy
dev/mii/tlphy.c optional miibus | tlphy
dev/mii/truephy.c optional miibus | truephy
dev/mii/ukphy.c optional miibus | mii
dev/mii/ukphy_subr.c optional miibus | mii
dev/mii/vscphy.c optional miibus | vscphy
dev/mii/xmphy.c optional miibus | xmphy
dev/mk48txx/mk48txx.c optional mk48txx
dev/mlx/mlx.c optional mlx
dev/mlx/mlx_disk.c optional mlx
dev/mlx/mlx_pci.c optional mlx pci
dev/mly/mly.c optional mly
dev/mmc/mmc_subr.c optional mmc | mmcsd
dev/mmc/mmc.c optional mmc
dev/mmc/mmcbr_if.m standard
dev/mmc/mmcbus_if.m standard
dev/mmc/mmcsd.c optional mmcsd
dev/mn/if_mn.c optional mn pci
dev/mpr/mpr.c optional mpr
dev/mpr/mpr_config.c optional mpr
# XXX Work around clang warning, until maintainer approves fix.
dev/mpr/mpr_mapping.c optional mpr \
compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
dev/mpr/mpr_pci.c optional mpr pci
dev/mpr/mpr_sas.c optional mpr \
compile-with "${NORMAL_C} ${NO_WUNNEEDED_INTERNAL_DECL}"
dev/mpr/mpr_sas_lsi.c optional mpr
dev/mpr/mpr_table.c optional mpr
dev/mpr/mpr_user.c optional mpr
dev/mps/mps.c optional mps
dev/mps/mps_config.c optional mps
# XXX Work around clang warning, until maintainer approves fix.
dev/mps/mps_mapping.c optional mps \
compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
dev/mps/mps_pci.c optional mps pci
dev/mps/mps_sas.c optional mps \
compile-with "${NORMAL_C} ${NO_WUNNEEDED_INTERNAL_DECL}"
dev/mps/mps_sas_lsi.c optional mps
dev/mps/mps_table.c optional mps
dev/mps/mps_user.c optional mps
dev/mpt/mpt.c optional mpt
dev/mpt/mpt_cam.c optional mpt
dev/mpt/mpt_debug.c optional mpt
dev/mpt/mpt_pci.c optional mpt pci
dev/mpt/mpt_raid.c optional mpt
dev/mpt/mpt_user.c optional mpt
dev/mrsas/mrsas.c optional mrsas
dev/mrsas/mrsas_cam.c optional mrsas
dev/mrsas/mrsas_ioctl.c optional mrsas
dev/mrsas/mrsas_fp.c optional mrsas
dev/msk/if_msk.c optional msk
dev/mvs/mvs.c optional mvs
dev/mvs/mvs_if.m optional mvs
dev/mvs/mvs_pci.c optional mvs pci
dev/mwl/if_mwl.c optional mwl
dev/mwl/if_mwl_pci.c optional mwl pci
dev/mwl/mwlhal.c optional mwl
mwlfw.c optional mwlfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk mw88W8363.fw:mw88W8363fw mwlboot.fw:mwlboot -mmwl -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "mwlfw.c"
mw88W8363.fwo optional mwlfw \
dependency "mw88W8363.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "mw88W8363.fwo"
mw88W8363.fw optional mwlfw \
dependency "$S/contrib/dev/mwl/mw88W8363.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "mw88W8363.fw"
mwlboot.fwo optional mwlfw \
dependency "mwlboot.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "mwlboot.fwo"
mwlboot.fw optional mwlfw \
dependency "$S/contrib/dev/mwl/mwlboot.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "mwlboot.fw"
dev/mxge/if_mxge.c optional mxge pci
dev/mxge/mxge_eth_z8e.c optional mxge pci
dev/mxge/mxge_ethp_z8e.c optional mxge pci
dev/mxge/mxge_rss_eth_z8e.c optional mxge pci
dev/mxge/mxge_rss_ethp_z8e.c optional mxge pci
dev/my/if_my.c optional my
dev/nand/nand.c optional nand
dev/nand/nand_bbt.c optional nand
dev/nand/nand_cdev.c optional nand
dev/nand/nand_generic.c optional nand
dev/nand/nand_geom.c optional nand
dev/nand/nand_id.c optional nand
dev/nand/nandbus.c optional nand
dev/nand/nandbus_if.m optional nand
dev/nand/nand_if.m optional nand
dev/nand/nandsim.c optional nandsim nand
dev/nand/nandsim_chip.c optional nandsim nand
dev/nand/nandsim_ctrl.c optional nandsim nand
dev/nand/nandsim_log.c optional nandsim nand
dev/nand/nandsim_swap.c optional nandsim nand
dev/nand/nfc_if.m optional nand
dev/ncr/ncr.c optional ncr pci
dev/ncv/ncr53c500.c optional ncv
dev/ncv/ncr53c500_pccard.c optional ncv pccard
dev/netmap/netmap.c optional netmap
dev/netmap/netmap_freebsd.c optional netmap
dev/netmap/netmap_generic.c optional netmap
dev/netmap/netmap_mbq.c optional netmap
dev/netmap/netmap_mem2.c optional netmap
dev/netmap/netmap_monitor.c optional netmap
dev/netmap/netmap_offloadings.c optional netmap
dev/netmap/netmap_pipe.c optional netmap
dev/netmap/netmap_vale.c optional netmap
# compile-with "${NORMAL_C} -Wconversion -Wextra"
dev/nfsmb/nfsmb.c optional nfsmb pci
dev/nge/if_nge.c optional nge
dev/nxge/if_nxge.c optional nxge \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-device.c optional nxge \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-mm.c optional nxge
dev/nxge/xgehal/xge-queue.c optional nxge
dev/nxge/xgehal/xgehal-driver.c optional nxge \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-ring.c optional nxge \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-channel.c optional nxge \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-fifo.c optional nxge \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-stats.c optional nxge \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-config.c optional nxge
dev/nxge/xgehal/xgehal-mgmt.c optional nxge \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nmdm/nmdm.c optional nmdm
dev/nsp/nsp.c optional nsp
dev/nsp/nsp_pccard.c optional nsp pccard
dev/null/null.c standard
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci
dev/oce/oce_queue.c optional oce pci
dev/oce/oce_sysctl.c optional oce pci
dev/oce/oce_util.c optional oce pci
dev/ocs_fc/ocs_pci.c optional ocs_fc pci
dev/ocs_fc/ocs_ioctl.c optional ocs_fc pci
dev/ocs_fc/ocs_os.c optional ocs_fc pci
dev/ocs_fc/ocs_utils.c optional ocs_fc pci
dev/ocs_fc/ocs_hw.c optional ocs_fc pci
dev/ocs_fc/ocs_hw_queues.c optional ocs_fc pci
dev/ocs_fc/sli4.c optional ocs_fc pci
dev/ocs_fc/ocs_sm.c optional ocs_fc pci
dev/ocs_fc/ocs_device.c optional ocs_fc pci
dev/ocs_fc/ocs_xport.c optional ocs_fc pci
dev/ocs_fc/ocs_domain.c optional ocs_fc pci
dev/ocs_fc/ocs_sport.c optional ocs_fc pci
dev/ocs_fc/ocs_els.c optional ocs_fc pci
dev/ocs_fc/ocs_fabric.c optional ocs_fc pci
dev/ocs_fc/ocs_io.c optional ocs_fc pci
dev/ocs_fc/ocs_node.c optional ocs_fc pci
dev/ocs_fc/ocs_scsi.c optional ocs_fc pci
dev/ocs_fc/ocs_unsol.c optional ocs_fc pci
dev/ocs_fc/ocs_ddump.c optional ocs_fc pci
dev/ocs_fc/ocs_mgmt.c optional ocs_fc pci
dev/ocs_fc/ocs_cam.c optional ocs_fc pci
dev/ofw/ofw_bus_if.m optional fdt
dev/ofw/ofw_bus_subr.c optional fdt
dev/ofw/ofw_fdt.c optional fdt
dev/ofw/ofw_if.m optional fdt
dev/ofw/ofw_subr.c optional fdt
dev/ofw/ofwbus.c optional fdt
dev/ofw/openfirm.c optional fdt
dev/ofw/openfirmio.c optional fdt
dev/ow/ow.c optional ow \
dependency "owll_if.h" \
dependency "own_if.h"
dev/ow/owll_if.m optional ow
dev/ow/own_if.m optional ow
dev/ow/ow_temp.c optional ow_temp
dev/ow/owc_gpiobus.c optional owc gpio
dev/patm/if_patm.c optional patm pci
dev/patm/if_patm_attach.c optional patm pci
dev/patm/if_patm_intr.c optional patm pci
dev/patm/if_patm_ioctl.c optional patm pci
dev/patm/if_patm_rtables.c optional patm pci
dev/patm/if_patm_rx.c optional patm pci
dev/patm/if_patm_tx.c optional patm pci
dev/pbio/pbio.c optional pbio isa
dev/pccard/card_if.m standard
dev/pccard/pccard.c optional pccard
dev/pccard/pccard_cis.c optional pccard
dev/pccard/pccard_cis_quirks.c optional pccard
dev/pccard/pccard_device.c optional pccard
dev/pccard/power_if.m standard
dev/pccbb/pccbb.c optional cbb
dev/pccbb/pccbb_isa.c optional cbb isa
dev/pccbb/pccbb_pci.c optional cbb pci
dev/pcf/pcf.c optional pcf
dev/pci/eisa_pci.c optional pci eisa
dev/pci/fixup_pci.c optional pci
dev/pci/hostb_pci.c optional pci
dev/pci/ignore_pci.c optional pci
dev/pci/isa_pci.c optional pci isa
dev/pci/pci.c optional pci
dev/pci/pci_if.m standard
dev/pci/pci_iov.c optional pci pci_iov
dev/pci/pci_iov_if.m standard
dev/pci/pci_iov_schema.c optional pci pci_iov
dev/pci/pci_pci.c optional pci
dev/pci/pci_subr.c optional pci
dev/pci/pci_user.c optional pci
dev/pci/pcib_if.m standard
dev/pci/pcib_support.c standard
dev/pci/vga_pci.c optional pci
dev/pcn/if_pcn.c optional pcn pci
dev/pdq/if_fea.c optional fea eisa
dev/pdq/if_fpa.c optional fpa pci
dev/pdq/pdq.c optional nowerror fea eisa | fpa pci
dev/pdq/pdq_ifsubr.c optional nowerror fea eisa | fpa pci
dev/pms/freebsd/driver/ini/src/agtiapi.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/sadisc.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/mpi.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/saframe.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/sahw.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/sainit.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/saint.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/sampicmd.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/sampirsp.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/saphy.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/saport.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/sasata.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/sasmp.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/sassp.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/satimer.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/sautil.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/saioctlcmd.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sallsdk/spc/mpidebug.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/discovery/dm/dminit.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/discovery/dm/dmsmp.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/discovery/dm/dmdisc.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/discovery/dm/dmport.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/discovery/dm/dmtimer.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/discovery/dm/dmmisc.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sat/src/sminit.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sat/src/smmisc.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sat/src/smsat.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sat/src/smsatcb.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sat/src/smsathw.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/sat/src/smtimer.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tdinit.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tdmisc.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tdesgl.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tdport.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tdint.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tdioctl.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tdhw.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/ossacmnapi.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tddmcmnapi.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tdsmcmnapi.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/common/tdtimers.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/sas/ini/itdio.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/sas/ini/itdcb.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/sas/ini/itdinit.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/sas/ini/itddisc.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/sata/host/sat.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/sata/host/ossasat.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/pms/RefTisa/tisa/sassata/sata/host/sathw.c optional pmspcv \
compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
dev/ppbus/if_plip.c optional plip
dev/ppbus/immio.c optional vpo
dev/ppbus/lpbb.c optional lpbb
dev/ppbus/lpt.c optional lpt
dev/ppbus/pcfclock.c optional pcfclock
dev/ppbus/ppb_1284.c optional ppbus
dev/ppbus/ppb_base.c optional ppbus
dev/ppbus/ppb_msq.c optional ppbus
dev/ppbus/ppbconf.c optional ppbus
dev/ppbus/ppbus_if.m optional ppbus
dev/ppbus/ppi.c optional ppi
dev/ppbus/pps.c optional pps
dev/ppbus/vpo.c optional vpo
dev/ppbus/vpoio.c optional vpo
dev/ppc/ppc.c optional ppc
dev/ppc/ppc_acpi.c optional ppc acpi
dev/ppc/ppc_isa.c optional ppc isa
dev/ppc/ppc_pci.c optional ppc pci
dev/ppc/ppc_puc.c optional ppc puc
dev/proto/proto_bus_isa.c optional proto acpi | proto isa
dev/proto/proto_bus_pci.c optional proto pci
dev/proto/proto_busdma.c optional proto
dev/proto/proto_core.c optional proto
dev/pst/pst-iop.c optional pst
dev/pst/pst-pci.c optional pst pci
dev/pst/pst-raid.c optional pst
dev/pty/pty.c optional pty
dev/puc/puc.c optional puc
dev/puc/puc_cfg.c optional puc
dev/puc/puc_pccard.c optional puc pccard
dev/puc/puc_pci.c optional puc pci
dev/puc/pucdata.c optional puc pci
dev/quicc/quicc_core.c optional quicc
dev/ral/rt2560.c optional ral
dev/ral/rt2661.c optional ral
dev/ral/rt2860.c optional ral
dev/ral/if_ral_pci.c optional ral pci
rt2561fw.c optional rt2561fw | ralfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk rt2561.fw:rt2561fw -mrt2561 -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "rt2561fw.c"
rt2561fw.fwo optional rt2561fw | ralfw \
dependency "rt2561.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "rt2561fw.fwo"
rt2561.fw optional rt2561fw | ralfw \
dependency "$S/contrib/dev/ral/rt2561.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "rt2561.fw"
rt2561sfw.c optional rt2561sfw | ralfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk rt2561s.fw:rt2561sfw -mrt2561s -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "rt2561sfw.c"
rt2561sfw.fwo optional rt2561sfw | ralfw \
dependency "rt2561s.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "rt2561sfw.fwo"
rt2561s.fw optional rt2561sfw | ralfw \
dependency "$S/contrib/dev/ral/rt2561s.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "rt2561s.fw"
rt2661fw.c optional rt2661fw | ralfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk rt2661.fw:rt2661fw -mrt2661 -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "rt2661fw.c"
rt2661fw.fwo optional rt2661fw | ralfw \
dependency "rt2661.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "rt2661fw.fwo"
rt2661.fw optional rt2661fw | ralfw \
dependency "$S/contrib/dev/ral/rt2661.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "rt2661.fw"
rt2860fw.c optional rt2860fw | ralfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk rt2860.fw:rt2860fw -mrt2860 -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "rt2860fw.c"
rt2860fw.fwo optional rt2860fw | ralfw \
dependency "rt2860.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "rt2860fw.fwo"
rt2860.fw optional rt2860fw | ralfw \
dependency "$S/contrib/dev/ral/rt2860.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "rt2860.fw"
dev/random/random_infra.c optional random
dev/random/random_harvestq.c optional random
dev/random/randomdev.c optional random random_yarrow | \
random !random_yarrow !random_loadable
dev/random/yarrow.c optional random random_yarrow
dev/random/fortuna.c optional random !random_yarrow !random_loadable
dev/random/hash.c optional random random_yarrow | \
random !random_yarrow !random_loadable
dev/rc/rc.c optional rc
dev/rccgpio/rccgpio.c optional rccgpio gpio
dev/re/if_re.c optional re
dev/rl/if_rl.c optional rl pci
dev/rndtest/rndtest.c optional rndtest
dev/rp/rp.c optional rp
dev/rp/rp_isa.c optional rp isa
dev/rp/rp_pci.c optional rp pci
dev/rtwn/if_rtwn.c optional rtwn
rtwn-rtl8192cfwU.c optional rtwn-rtl8192cfwU | rtwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk rtwn-rtl8192cfwU.fw:rtwn-rtl8192cfwU:111 -mrtwn-rtl8192cfwU -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "rtwn-rtl8192cfwU.c"
rtwn-rtl8192cfwU.fwo optional rtwn-rtl8192cfwU | rtwnfw \
dependency "rtwn-rtl8192cfwU.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "rtwn-rtl8192cfwU.fwo"
rtwn-rtl8192cfwU.fw optional rtwn-rtl8192cfwU | rtwnfw \
dependency "$S/contrib/dev/rtwn/rtwn-rtl8192cfwU.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "rtwn-rtl8192cfwU.fw"
rtwn-rtl8192cfwU_B.c optional rtwn-rtl8192cfwU_B | rtwnfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk rtwn-rtl8192cfwU_B.fw:rtwn-rtl8192cfwU_B:111 -mrtwn-rtl8192cfwU_B -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "rtwn-rtl8192cfwU_B.c"
rtwn-rtl8192cfwU_B.fwo optional rtwn-rtl8192cfwU_B | rtwnfw \
dependency "rtwn-rtl8192cfwU_B.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "rtwn-rtl8192cfwU_B.fwo"
rtwn-rtl8192cfwU_B.fw optional rtwn-rtl8192cfwU_B | rtwnfw \
dependency "$S/contrib/dev/rtwn/rtwn-rtl8192cfwU_B.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "rtwn-rtl8192cfwU_B.fw"
dev/safe/safe.c optional safe
dev/scc/scc_if.m optional scc
dev/scc/scc_bfe_ebus.c optional scc ebus
dev/scc/scc_bfe_quicc.c optional scc quicc
dev/scc/scc_bfe_sbus.c optional scc fhc | scc sbus
dev/scc/scc_core.c optional scc
dev/scc/scc_dev_quicc.c optional scc quicc
dev/scc/scc_dev_sab82532.c optional scc
dev/scc/scc_dev_z8530.c optional scc
dev/scd/scd.c optional scd isa
dev/scd/scd_isa.c optional scd isa
dev/sdhci/sdhci.c optional sdhci
dev/sdhci/sdhci_fdt_gpio.c optional sdhci fdt gpio
dev/sdhci/sdhci_if.m optional sdhci
dev/sdhci/sdhci_acpi.c optional sdhci acpi
dev/sdhci/sdhci_pci.c optional sdhci pci
dev/sf/if_sf.c optional sf pci
dev/sge/if_sge.c optional sge pci
dev/si/si.c optional si \
compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
dev/si/si2_z280.c optional si
dev/si/si3_t225.c optional si
dev/si/si_eisa.c optional si eisa
dev/si/si_isa.c optional si isa
dev/si/si_pci.c optional si pci
dev/siba/siba_bwn.c optional siba_bwn pci
dev/siba/siba_core.c optional siba_bwn pci
dev/siis/siis.c optional siis pci
dev/sis/if_sis.c optional sis pci
dev/sk/if_sk.c optional sk pci
dev/smbus/smb.c optional smb
dev/smbus/smbconf.c optional smbus
dev/smbus/smbus.c optional smbus
dev/smbus/smbus_if.m optional smbus
dev/smc/if_smc.c optional smc
dev/smc/if_smc_fdt.c optional smc fdt
dev/sn/if_sn.c optional sn
dev/sn/if_sn_isa.c optional sn isa
dev/sn/if_sn_pccard.c optional sn pccard
dev/snp/snp.c optional snp
dev/sound/clone.c optional sound
dev/sound/unit.c optional sound
dev/sound/isa/ad1816.c optional snd_ad1816 isa
dev/sound/isa/ess.c optional snd_ess isa
dev/sound/isa/gusc.c optional snd_gusc isa
dev/sound/isa/mss.c optional snd_mss isa
dev/sound/isa/sb16.c optional snd_sb16 isa
dev/sound/isa/sb8.c optional snd_sb8 isa
dev/sound/isa/sbc.c optional snd_sbc isa
dev/sound/isa/sndbuf_dma.c optional sound isa
dev/sound/pci/als4000.c optional snd_als4000 pci
dev/sound/pci/atiixp.c optional snd_atiixp pci
dev/sound/pci/cmi.c optional snd_cmi pci
dev/sound/pci/cs4281.c optional snd_cs4281 pci
dev/sound/pci/csa.c optional snd_csa pci
dev/sound/pci/csapcm.c optional snd_csa pci
dev/sound/pci/ds1.c optional snd_ds1 pci
dev/sound/pci/emu10k1.c optional snd_emu10k1 pci
dev/sound/pci/emu10kx.c optional snd_emu10kx pci
dev/sound/pci/emu10kx-pcm.c optional snd_emu10kx pci
dev/sound/pci/emu10kx-midi.c optional snd_emu10kx pci
dev/sound/pci/envy24.c optional snd_envy24 pci
dev/sound/pci/envy24ht.c optional snd_envy24ht pci
dev/sound/pci/es137x.c optional snd_es137x pci
dev/sound/pci/fm801.c optional snd_fm801 pci
dev/sound/pci/ich.c optional snd_ich pci
dev/sound/pci/maestro.c optional snd_maestro pci
dev/sound/pci/maestro3.c optional snd_maestro3 pci
dev/sound/pci/neomagic.c optional snd_neomagic pci
dev/sound/pci/solo.c optional snd_solo pci
dev/sound/pci/spicds.c optional snd_spicds pci
dev/sound/pci/t4dwave.c optional snd_t4dwave pci
dev/sound/pci/via8233.c optional snd_via8233 pci
dev/sound/pci/via82c686.c optional snd_via82c686 pci
dev/sound/pci/vibes.c optional snd_vibes pci
dev/sound/pci/hda/hdaa.c optional snd_hda pci
dev/sound/pci/hda/hdaa_patches.c optional snd_hda pci
dev/sound/pci/hda/hdac.c optional snd_hda pci
dev/sound/pci/hda/hdac_if.m optional snd_hda pci
dev/sound/pci/hda/hdacc.c optional snd_hda pci
dev/sound/pci/hdspe.c optional snd_hdspe pci
dev/sound/pci/hdspe-pcm.c optional snd_hdspe pci
dev/sound/pcm/ac97.c optional sound
dev/sound/pcm/ac97_if.m optional sound
dev/sound/pcm/ac97_patch.c optional sound
dev/sound/pcm/buffer.c optional sound \
dependency "snd_fxdiv_gen.h"
dev/sound/pcm/channel.c optional sound
dev/sound/pcm/channel_if.m optional sound
dev/sound/pcm/dsp.c optional sound
dev/sound/pcm/feeder.c optional sound
dev/sound/pcm/feeder_chain.c optional sound
dev/sound/pcm/feeder_eq.c optional sound \
dependency "feeder_eq_gen.h" \
dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_if.m optional sound
dev/sound/pcm/feeder_format.c optional sound \
dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_matrix.c optional sound \
dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_mixer.c optional sound \
dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_rate.c optional sound \
dependency "feeder_rate_gen.h" \
dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_volume.c optional sound \
dependency "snd_fxdiv_gen.h"
dev/sound/pcm/mixer.c optional sound
dev/sound/pcm/mixer_if.m optional sound
dev/sound/pcm/sndstat.c optional sound
dev/sound/pcm/sound.c optional sound
dev/sound/pcm/vchan.c optional sound
dev/sound/usb/uaudio.c optional snd_uaudio usb
dev/sound/usb/uaudio_pcm.c optional snd_uaudio usb
dev/sound/midi/midi.c optional sound
dev/sound/midi/mpu401.c optional sound
dev/sound/midi/mpu_if.m optional sound
dev/sound/midi/mpufoi_if.m optional sound
dev/sound/midi/sequencer.c optional sound
dev/sound/midi/synth_if.m optional sound
dev/spibus/ofw_spibus.c optional fdt spibus
dev/spibus/spibus.c optional spibus \
dependency "spibus_if.h"
dev/spibus/spigen.c optional spigen
dev/spibus/spibus_if.m optional spibus
dev/ste/if_ste.c optional ste pci
dev/stg/tmc18c30.c optional stg
dev/stg/tmc18c30_isa.c optional stg isa
dev/stg/tmc18c30_pccard.c optional stg pccard
dev/stg/tmc18c30_pci.c optional stg pci
dev/stg/tmc18c30_subr.c optional stg
dev/stge/if_stge.c optional stge
dev/streams/streams.c optional streams
dev/sym/sym_hipd.c optional sym \
dependency "$S/dev/sym/sym_{conf,defs}.h"
dev/syscons/blank/blank_saver.c optional blank_saver
dev/syscons/daemon/daemon_saver.c optional daemon_saver
dev/syscons/dragon/dragon_saver.c optional dragon_saver
dev/syscons/fade/fade_saver.c optional fade_saver
dev/syscons/fire/fire_saver.c optional fire_saver
dev/syscons/green/green_saver.c optional green_saver
dev/syscons/logo/logo.c optional logo_saver
dev/syscons/logo/logo_saver.c optional logo_saver
dev/syscons/rain/rain_saver.c optional rain_saver
dev/syscons/schistory.c optional sc
dev/syscons/scmouse.c optional sc
dev/syscons/scterm.c optional sc
dev/syscons/scvidctl.c optional sc
dev/syscons/snake/snake_saver.c optional snake_saver
dev/syscons/star/star_saver.c optional star_saver
dev/syscons/syscons.c optional sc
dev/syscons/sysmouse.c optional sc
dev/syscons/warp/warp_saver.c optional warp_saver
dev/tdfx/tdfx_linux.c optional tdfx_linux tdfx compat_linux
dev/tdfx/tdfx_pci.c optional tdfx pci
dev/ti/if_ti.c optional ti pci
dev/tl/if_tl.c optional tl pci
dev/trm/trm.c optional trm
dev/twa/tw_cl_init.c optional twa \
compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_cl_intr.c optional twa \
compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_cl_io.c optional twa \
compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_cl_misc.c optional twa \
compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_osl_cam.c optional twa \
compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_osl_freebsd.c optional twa \
compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twe/twe.c optional twe
dev/twe/twe_freebsd.c optional twe
dev/tws/tws.c optional tws
dev/tws/tws_cam.c optional tws
dev/tws/tws_hdm.c optional tws
dev/tws/tws_services.c optional tws
dev/tws/tws_user.c optional tws
dev/tx/if_tx.c optional tx
dev/txp/if_txp.c optional txp
dev/uart/uart_bus_acpi.c optional uart acpi
dev/uart/uart_bus_ebus.c optional uart ebus
dev/uart/uart_bus_fdt.c optional uart fdt
dev/uart/uart_bus_isa.c optional uart isa
dev/uart/uart_bus_pccard.c optional uart pccard
dev/uart/uart_bus_pci.c optional uart pci
dev/uart/uart_bus_puc.c optional uart puc
dev/uart/uart_bus_scc.c optional uart scc
dev/uart/uart_core.c optional uart
dev/uart/uart_dbg.c optional uart gdb
dev/uart/uart_dev_ns8250.c optional uart uart_ns8250 | uart uart_snps
dev/uart/uart_dev_pl011.c optional uart pl011
dev/uart/uart_dev_quicc.c optional uart quicc
dev/uart/uart_dev_sab82532.c optional uart uart_sab82532
dev/uart/uart_dev_sab82532.c optional uart scc
dev/uart/uart_dev_snps.c optional uart uart_snps
dev/uart/uart_dev_z8530.c optional uart uart_z8530
dev/uart/uart_dev_z8530.c optional uart scc
dev/uart/uart_if.m optional uart
dev/uart/uart_subr.c optional uart
dev/uart/uart_tty.c optional uart
dev/ubsec/ubsec.c optional ubsec
#
# USB controller drivers
#
dev/usb/controller/at91dci.c optional at91dci
dev/usb/controller/at91dci_atmelarm.c optional at91dci at91rm9200
dev/usb/controller/musb_otg.c optional musb
dev/usb/controller/musb_otg_atmelarm.c optional musb at91rm9200
dev/usb/controller/dwc_otg.c optional dwcotg
dev/usb/controller/dwc_otg_fdt.c optional dwcotg fdt
dev/usb/controller/ehci.c optional ehci
dev/usb/controller/ehci_pci.c optional ehci pci
dev/usb/controller/ohci.c optional ohci
dev/usb/controller/ohci_pci.c optional ohci pci
dev/usb/controller/uhci.c optional uhci
dev/usb/controller/uhci_pci.c optional uhci pci
dev/usb/controller/xhci.c optional xhci
dev/usb/controller/xhci_pci.c optional xhci pci
dev/usb/controller/saf1761_otg.c optional saf1761otg
dev/usb/controller/saf1761_otg_fdt.c optional saf1761otg fdt
dev/usb/controller/uss820dci.c optional uss820dci
dev/usb/controller/uss820dci_atmelarm.c optional uss820dci at91rm9200
dev/usb/controller/usb_controller.c optional usb
#
# USB storage drivers
#
dev/usb/storage/cfumass.c optional cfumass ctl
dev/usb/storage/umass.c optional umass
dev/usb/storage/urio.c optional urio
dev/usb/storage/ustorage_fs.c optional usfs
#
# USB core
#
dev/usb/usb_busdma.c optional usb
dev/usb/usb_core.c optional usb
dev/usb/usb_debug.c optional usb
dev/usb/usb_dev.c optional usb
dev/usb/usb_device.c optional usb
dev/usb/usb_dynamic.c optional usb
dev/usb/usb_error.c optional usb
dev/usb/usb_generic.c optional usb
dev/usb/usb_handle_request.c optional usb
dev/usb/usb_hid.c optional usb
dev/usb/usb_hub.c optional usb
dev/usb/usb_if.m optional usb
dev/usb/usb_lookup.c optional usb
dev/usb/usb_mbuf.c optional usb
dev/usb/usb_msctest.c optional usb
dev/usb/usb_parse.c optional usb
dev/usb/usb_pf.c optional usb
dev/usb/usb_process.c optional usb
dev/usb/usb_request.c optional usb
dev/usb/usb_transfer.c optional usb
dev/usb/usb_util.c optional usb
#
# USB network drivers
#
dev/usb/net/if_aue.c optional aue
dev/usb/net/if_axe.c optional axe
dev/usb/net/if_axge.c optional axge
dev/usb/net/if_cdce.c optional cdce
dev/usb/net/if_cue.c optional cue
dev/usb/net/if_ipheth.c optional ipheth
dev/usb/net/if_kue.c optional kue
dev/usb/net/if_mos.c optional mos
dev/usb/net/if_rue.c optional rue
dev/usb/net/if_smsc.c optional smsc
dev/usb/net/if_udav.c optional udav
dev/usb/net/if_ure.c optional ure
dev/usb/net/if_usie.c optional usie
dev/usb/net/if_urndis.c optional urndis
dev/usb/net/ruephy.c optional rue
dev/usb/net/usb_ethernet.c optional uether | aue | axe | axge | cdce | \
cue | ipheth | kue | mos | rue | \
smsc | udav | ure | urndis
dev/usb/net/uhso.c optional uhso
#
# USB WLAN drivers
#
dev/usb/wlan/if_rsu.c optional rsu
rsu-rtl8712fw.c optional rsu-rtl8712fw | rsufw \
compile-with "${AWK} -f $S/tools/fw_stub.awk rsu-rtl8712fw.fw:rsu-rtl8712fw:120 -mrsu-rtl8712fw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "rsu-rtl8712fw.c"
rsu-rtl8712fw.fwo optional rsu-rtl8712fw | rsufw \
dependency "rsu-rtl8712fw.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "rsu-rtl8712fw.fwo"
rsu-rtl8712fw.fw optional rsu-rtl8712.fw | rsufw \
dependency "$S/contrib/dev/rsu/rsu-rtl8712fw.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "rsu-rtl8712fw.fw"
dev/usb/wlan/if_rum.c optional rum
dev/usb/wlan/if_run.c optional run
runfw.c optional runfw \
compile-with "${AWK} -f $S/tools/fw_stub.awk run.fw:runfw -mrunfw -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "runfw.c"
runfw.fwo optional runfw \
dependency "run.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "runfw.fwo"
run.fw optional runfw \
dependency "$S/contrib/dev/run/rt2870.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "run.fw"
dev/usb/wlan/if_uath.c optional uath
dev/usb/wlan/if_upgt.c optional upgt
dev/usb/wlan/if_ural.c optional ural
dev/usb/wlan/if_urtw.c optional urtw
dev/usb/wlan/if_zyd.c optional zyd
#
# USB serial and parallel port drivers
#
dev/usb/serial/u3g.c optional u3g
dev/usb/serial/uark.c optional uark
dev/usb/serial/ubsa.c optional ubsa
dev/usb/serial/ubser.c optional ubser
dev/usb/serial/uchcom.c optional uchcom
dev/usb/serial/ucycom.c optional ucycom
dev/usb/serial/ufoma.c optional ufoma
dev/usb/serial/uftdi.c optional uftdi
dev/usb/serial/ugensa.c optional ugensa
dev/usb/serial/uipaq.c optional uipaq
dev/usb/serial/ulpt.c optional ulpt
dev/usb/serial/umcs.c optional umcs
dev/usb/serial/umct.c optional umct
dev/usb/serial/umodem.c optional umodem
dev/usb/serial/umoscom.c optional umoscom
dev/usb/serial/uplcom.c optional uplcom
dev/usb/serial/uslcom.c optional uslcom
dev/usb/serial/uvisor.c optional uvisor
dev/usb/serial/uvscom.c optional uvscom
dev/usb/serial/usb_serial.c optional ucom | u3g | uark | ubsa | ubser | \
uchcom | ucycom | ufoma | uftdi | \
ugensa | uipaq | umcs | umct | \
umodem | umoscom | uplcom | usie | \
uslcom | uvisor | uvscom
#
# USB misc drivers
#
dev/usb/misc/ufm.c optional ufm
dev/usb/misc/udbp.c optional udbp
dev/usb/misc/ugold.c optional ugold
dev/usb/misc/uled.c optional uled
#
# USB input drivers
#
dev/usb/input/atp.c optional atp
dev/usb/input/uep.c optional uep
dev/usb/input/uhid.c optional uhid
dev/usb/input/ukbd.c optional ukbd
dev/usb/input/ums.c optional ums
dev/usb/input/wmt.c optional wmt
dev/usb/input/wsp.c optional wsp
#
# USB quirks
#
dev/usb/quirk/usb_quirk.c optional usb
#
# USB templates
#
dev/usb/template/usb_template.c optional usb_template
dev/usb/template/usb_template_audio.c optional usb_template
dev/usb/template/usb_template_cdce.c optional usb_template
dev/usb/template/usb_template_kbd.c optional usb_template
dev/usb/template/usb_template_modem.c optional usb_template
dev/usb/template/usb_template_mouse.c optional usb_template
dev/usb/template/usb_template_msc.c optional usb_template
dev/usb/template/usb_template_mtp.c optional usb_template
dev/usb/template/usb_template_phone.c optional usb_template
dev/usb/template/usb_template_serialnet.c optional usb_template
dev/usb/template/usb_template_midi.c optional usb_template
#
# USB video drivers
#
dev/usb/video/udl.c optional udl
#
# USB END
#
dev/videomode/videomode.c optional videomode
dev/videomode/edid.c optional videomode
dev/videomode/pickmode.c optional videomode
dev/videomode/vesagtf.c optional videomode
dev/utopia/idtphy.c optional utopia
dev/utopia/suni.c optional utopia
dev/utopia/utopia.c optional utopia
dev/vge/if_vge.c optional vge
dev/viapm/viapm.c optional viapm pci
dev/virtio/virtio.c optional virtio
dev/virtio/virtqueue.c optional virtio
dev/virtio/virtio_bus_if.m optional virtio
dev/virtio/virtio_if.m optional virtio
dev/virtio/pci/virtio_pci.c optional virtio_pci
dev/virtio/mmio/virtio_mmio.c optional virtio_mmio
dev/virtio/mmio/virtio_mmio_if.m optional virtio_mmio
dev/virtio/network/if_vtnet.c optional vtnet
dev/virtio/block/virtio_blk.c optional virtio_blk
dev/virtio/balloon/virtio_balloon.c optional virtio_balloon
dev/virtio/scsi/virtio_scsi.c optional virtio_scsi
dev/virtio/random/virtio_random.c optional virtio_random
dev/virtio/console/virtio_console.c optional virtio_console
dev/vkbd/vkbd.c optional vkbd
dev/vr/if_vr.c optional vr pci
dev/vt/colors/vt_termcolors.c optional vt
dev/vt/font/vt_font_default.c optional vt
dev/vt/font/vt_mouse_cursor.c optional vt
dev/vt/hw/efifb/efifb.c optional vt_efifb
dev/vt/hw/fb/vt_fb.c optional vt
dev/vt/hw/vga/vt_vga.c optional vt vt_vga
dev/vt/logo/logo_freebsd.c optional vt splash
dev/vt/logo/logo_beastie.c optional vt splash
dev/vt/vt_buf.c optional vt
dev/vt/vt_consolectl.c optional vt
dev/vt/vt_core.c optional vt
dev/vt/vt_cpulogos.c optional vt splash
dev/vt/vt_font.c optional vt
dev/vt/vt_sysmouse.c optional vt
dev/vte/if_vte.c optional vte pci
dev/vx/if_vx.c optional vx
dev/vx/if_vx_eisa.c optional vx eisa
dev/vx/if_vx_pci.c optional vx pci
dev/vxge/vxge.c optional vxge
dev/vxge/vxgehal/vxgehal-ifmsg.c optional vxge
dev/vxge/vxgehal/vxgehal-mrpcim.c optional vxge
dev/vxge/vxgehal/vxge-queue.c optional vxge
dev/vxge/vxgehal/vxgehal-ring.c optional vxge
dev/vxge/vxgehal/vxgehal-swapper.c optional vxge
dev/vxge/vxgehal/vxgehal-mgmt.c optional vxge
dev/vxge/vxgehal/vxgehal-srpcim.c optional vxge
dev/vxge/vxgehal/vxgehal-config.c optional vxge
dev/vxge/vxgehal/vxgehal-blockpool.c optional vxge
dev/vxge/vxgehal/vxgehal-doorbells.c optional vxge
dev/vxge/vxgehal/vxgehal-mgmtaux.c optional vxge
dev/vxge/vxgehal/vxgehal-device.c optional vxge
dev/vxge/vxgehal/vxgehal-mm.c optional vxge
dev/vxge/vxgehal/vxgehal-driver.c optional vxge
dev/vxge/vxgehal/vxgehal-virtualpath.c optional vxge
dev/vxge/vxgehal/vxgehal-channel.c optional vxge
dev/vxge/vxgehal/vxgehal-fifo.c optional vxge
dev/watchdog/watchdog.c standard
dev/wb/if_wb.c optional wb pci
dev/wds/wd7000.c optional wds isa
dev/wi/if_wi.c optional wi
dev/wi/if_wi_pccard.c optional wi pccard
dev/wi/if_wi_pci.c optional wi pci
dev/wl/if_wl.c optional wl isa
dev/wpi/if_wpi.c optional wpi pci
wpifw.c optional wpifw \
compile-with "${AWK} -f $S/tools/fw_stub.awk wpi.fw:wpifw:153229 -mwpi -c${.TARGET}" \
no-implicit-rule before-depend local \
clean "wpifw.c"
wpifw.fwo optional wpifw \
dependency "wpi.fw" \
compile-with "${NORMAL_FWO}" \
no-implicit-rule \
clean "wpifw.fwo"
wpi.fw optional wpifw \
dependency "$S/contrib/dev/wpi/iwlwifi-3945-15.32.2.9.fw.uu" \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "wpi.fw"
dev/xe/if_xe.c optional xe
dev/xe/if_xe_pccard.c optional xe pccard
dev/xen/balloon/balloon.c optional xenhvm
dev/xen/blkfront/blkfront.c optional xenhvm
dev/xen/blkback/blkback.c optional xenhvm
dev/xen/console/xen_console.c optional xenhvm
dev/xen/control/control.c optional xenhvm
dev/xen/grant_table/grant_table.c optional xenhvm
dev/xen/netback/netback.c optional xenhvm
dev/xen/netfront/netfront.c optional xenhvm
dev/xen/xenpci/xenpci.c optional xenpci
dev/xen/timer/timer.c optional xenhvm
dev/xen/pvcpu/pvcpu.c optional xenhvm
dev/xen/xenstore/xenstore.c optional xenhvm
dev/xen/xenstore/xenstore_dev.c optional xenhvm
dev/xen/xenstore/xenstored_dev.c optional xenhvm
dev/xen/evtchn/evtchn_dev.c optional xenhvm
dev/xen/privcmd/privcmd.c optional xenhvm
dev/xen/debug/debug.c optional xenhvm
dev/xl/if_xl.c optional xl pci
dev/xl/xlphy.c optional xl pci
fs/autofs/autofs.c optional autofs
fs/autofs/autofs_vfsops.c optional autofs
fs/autofs/autofs_vnops.c optional autofs
fs/deadfs/dead_vnops.c standard
fs/devfs/devfs_devs.c standard
fs/devfs/devfs_dir.c standard
fs/devfs/devfs_rule.c standard
fs/devfs/devfs_vfsops.c standard
fs/devfs/devfs_vnops.c standard
fs/fdescfs/fdesc_vfsops.c optional fdescfs
fs/fdescfs/fdesc_vnops.c optional fdescfs
fs/fifofs/fifo_vnops.c standard
fs/cuse/cuse.c optional cuse
fs/fuse/fuse_device.c optional fuse
fs/fuse/fuse_file.c optional fuse
fs/fuse/fuse_internal.c optional fuse
fs/fuse/fuse_io.c optional fuse
fs/fuse/fuse_ipc.c optional fuse
fs/fuse/fuse_main.c optional fuse
fs/fuse/fuse_node.c optional fuse
fs/fuse/fuse_vfsops.c optional fuse
fs/fuse/fuse_vnops.c optional fuse
fs/msdosfs/msdosfs_conv.c optional msdosfs
fs/msdosfs/msdosfs_denode.c optional msdosfs
fs/msdosfs/msdosfs_fat.c optional msdosfs
fs/msdosfs/msdosfs_fileno.c optional msdosfs
fs/msdosfs/msdosfs_iconv.c optional msdosfs_iconv
fs/msdosfs/msdosfs_lookup.c optional msdosfs
fs/msdosfs/msdosfs_vfsops.c optional msdosfs
fs/msdosfs/msdosfs_vnops.c optional msdosfs
fs/nandfs/bmap.c optional nandfs
fs/nandfs/nandfs_alloc.c optional nandfs
fs/nandfs/nandfs_bmap.c optional nandfs
fs/nandfs/nandfs_buffer.c optional nandfs
fs/nandfs/nandfs_cleaner.c optional nandfs
fs/nandfs/nandfs_cpfile.c optional nandfs
fs/nandfs/nandfs_dat.c optional nandfs
fs/nandfs/nandfs_dir.c optional nandfs
fs/nandfs/nandfs_ifile.c optional nandfs
fs/nandfs/nandfs_segment.c optional nandfs
fs/nandfs/nandfs_subr.c optional nandfs
fs/nandfs/nandfs_sufile.c optional nandfs
fs/nandfs/nandfs_vfsops.c optional nandfs
fs/nandfs/nandfs_vnops.c optional nandfs
fs/nfs/nfs_commonkrpc.c optional nfscl | nfsd
fs/nfs/nfs_commonsubs.c optional nfscl | nfsd
fs/nfs/nfs_commonport.c optional nfscl | nfsd
fs/nfs/nfs_commonacl.c optional nfscl | nfsd
fs/nfsclient/nfs_clcomsubs.c optional nfscl
fs/nfsclient/nfs_clsubs.c optional nfscl
fs/nfsclient/nfs_clstate.c optional nfscl
fs/nfsclient/nfs_clkrpc.c optional nfscl
fs/nfsclient/nfs_clrpcops.c optional nfscl
fs/nfsclient/nfs_clvnops.c optional nfscl
fs/nfsclient/nfs_clnode.c optional nfscl
fs/nfsclient/nfs_clvfsops.c optional nfscl
fs/nfsclient/nfs_clport.c optional nfscl
fs/nfsclient/nfs_clbio.c optional nfscl
fs/nfsclient/nfs_clnfsiod.c optional nfscl
fs/nfsserver/nfs_fha_new.c optional nfsd inet
fs/nfsserver/nfs_nfsdsocket.c optional nfsd inet
fs/nfsserver/nfs_nfsdsubs.c optional nfsd inet
fs/nfsserver/nfs_nfsdstate.c optional nfsd inet
fs/nfsserver/nfs_nfsdkrpc.c optional nfsd inet
fs/nfsserver/nfs_nfsdserv.c optional nfsd inet
fs/nfsserver/nfs_nfsdport.c optional nfsd inet
fs/nfsserver/nfs_nfsdcache.c optional nfsd inet
fs/nullfs/null_subr.c optional nullfs
fs/nullfs/null_vfsops.c optional nullfs
fs/nullfs/null_vnops.c optional nullfs
fs/procfs/procfs.c optional procfs
fs/procfs/procfs_ctl.c optional procfs
fs/procfs/procfs_dbregs.c optional procfs
fs/procfs/procfs_fpregs.c optional procfs
fs/procfs/procfs_ioctl.c optional procfs
fs/procfs/procfs_map.c optional procfs
fs/procfs/procfs_mem.c optional procfs
fs/procfs/procfs_note.c optional procfs
fs/procfs/procfs_osrel.c optional procfs
fs/procfs/procfs_regs.c optional procfs
fs/procfs/procfs_rlimit.c optional procfs
fs/procfs/procfs_status.c optional procfs
fs/procfs/procfs_type.c optional procfs
fs/pseudofs/pseudofs.c optional pseudofs
fs/pseudofs/pseudofs_fileno.c optional pseudofs
fs/pseudofs/pseudofs_vncache.c optional pseudofs
fs/pseudofs/pseudofs_vnops.c optional pseudofs
fs/smbfs/smbfs_io.c optional smbfs
fs/smbfs/smbfs_node.c optional smbfs
fs/smbfs/smbfs_smb.c optional smbfs
fs/smbfs/smbfs_subr.c optional smbfs
fs/smbfs/smbfs_vfsops.c optional smbfs
fs/smbfs/smbfs_vnops.c optional smbfs
fs/udf/osta.c optional udf
fs/udf/udf_iconv.c optional udf_iconv
fs/udf/udf_vfsops.c optional udf
fs/udf/udf_vnops.c optional udf
fs/unionfs/union_subr.c optional unionfs
fs/unionfs/union_vfsops.c optional unionfs
fs/unionfs/union_vnops.c optional unionfs
fs/tmpfs/tmpfs_vnops.c optional tmpfs
fs/tmpfs/tmpfs_fifoops.c optional tmpfs
fs/tmpfs/tmpfs_vfsops.c optional tmpfs
fs/tmpfs/tmpfs_subr.c optional tmpfs
gdb/gdb_cons.c optional gdb
gdb/gdb_main.c optional gdb
gdb/gdb_packet.c optional gdb
geom/bde/g_bde.c optional geom_bde
geom/bde/g_bde_crypt.c optional geom_bde
geom/bde/g_bde_lock.c optional geom_bde
geom/bde/g_bde_work.c optional geom_bde
geom/cache/g_cache.c optional geom_cache
geom/concat/g_concat.c optional geom_concat
geom/eli/g_eli.c optional geom_eli
geom/eli/g_eli_crypto.c optional geom_eli
geom/eli/g_eli_ctl.c optional geom_eli
geom/eli/g_eli_hmac.c optional geom_eli
geom/eli/g_eli_integrity.c optional geom_eli
geom/eli/g_eli_key.c optional geom_eli
geom/eli/g_eli_key_cache.c optional geom_eli
geom/eli/g_eli_privacy.c optional geom_eli
geom/eli/pkcs5v2.c optional geom_eli
geom/gate/g_gate.c optional geom_gate
geom/geom_aes.c optional geom_aes
geom/geom_bsd.c optional geom_bsd
geom/geom_bsd_enc.c optional geom_bsd | geom_part_bsd
geom/geom_ccd.c optional ccd | geom_ccd
geom/geom_ctl.c standard
geom/geom_dev.c standard
geom/geom_disk.c standard
geom/geom_dump.c standard
geom/geom_event.c standard
geom/geom_fox.c optional geom_fox
geom/geom_flashmap.c optional fdt cfi | fdt nand | fdt mx25l | mmcsd
geom/geom_io.c standard
geom/geom_kern.c standard
geom/geom_map.c optional geom_map
geom/geom_mbr.c optional geom_mbr
geom/geom_mbr_enc.c optional geom_mbr
geom/geom_pc98.c optional geom_pc98
geom/geom_pc98_enc.c optional geom_pc98
geom/geom_redboot.c optional geom_redboot
geom/geom_slice.c standard
geom/geom_subr.c standard
geom/geom_sunlabel.c optional geom_sunlabel
geom/geom_sunlabel_enc.c optional geom_sunlabel
geom/geom_vfs.c standard
geom/geom_vol_ffs.c optional geom_vol
geom/journal/g_journal.c optional geom_journal
geom/journal/g_journal_ufs.c optional geom_journal
geom/label/g_label.c optional geom_label | geom_label_gpt
geom/label/g_label_ext2fs.c optional geom_label
geom/label/g_label_iso9660.c optional geom_label
geom/label/g_label_msdosfs.c optional geom_label
geom/label/g_label_ntfs.c optional geom_label
geom/label/g_label_reiserfs.c optional geom_label
geom/label/g_label_ufs.c optional geom_label
geom/label/g_label_gpt.c optional geom_label | geom_label_gpt
geom/label/g_label_disk_ident.c optional geom_label
geom/linux_lvm/g_linux_lvm.c optional geom_linux_lvm
geom/mirror/g_mirror.c optional geom_mirror
geom/mirror/g_mirror_ctl.c optional geom_mirror
geom/mountver/g_mountver.c optional geom_mountver
geom/multipath/g_multipath.c optional geom_multipath
geom/nop/g_nop.c optional geom_nop
geom/part/g_part.c standard
geom/part/g_part_if.m standard
geom/part/g_part_apm.c optional geom_part_apm
geom/part/g_part_bsd.c optional geom_part_bsd
geom/part/g_part_bsd64.c optional geom_part_bsd64
geom/part/g_part_ebr.c optional geom_part_ebr
geom/part/g_part_gpt.c optional geom_part_gpt
geom/part/g_part_ldm.c optional geom_part_ldm
geom/part/g_part_mbr.c optional geom_part_mbr
geom/part/g_part_pc98.c optional geom_part_pc98
geom/part/g_part_vtoc8.c optional geom_part_vtoc8
geom/raid/g_raid.c optional geom_raid
geom/raid/g_raid_ctl.c optional geom_raid
geom/raid/g_raid_md_if.m optional geom_raid
geom/raid/g_raid_tr_if.m optional geom_raid
geom/raid/md_ddf.c optional geom_raid
geom/raid/md_intel.c optional geom_raid
geom/raid/md_jmicron.c optional geom_raid
geom/raid/md_nvidia.c optional geom_raid
geom/raid/md_promise.c optional geom_raid
geom/raid/md_sii.c optional geom_raid
geom/raid/tr_concat.c optional geom_raid
geom/raid/tr_raid0.c optional geom_raid
geom/raid/tr_raid1.c optional geom_raid
geom/raid/tr_raid1e.c optional geom_raid
geom/raid/tr_raid5.c optional geom_raid
geom/raid3/g_raid3.c optional geom_raid3
geom/raid3/g_raid3_ctl.c optional geom_raid3
geom/shsec/g_shsec.c optional geom_shsec
geom/stripe/g_stripe.c optional geom_stripe
contrib/xz-embedded/freebsd/xz_malloc.c \
optional xz_embedded | geom_uzip \
compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
contrib/xz-embedded/linux/lib/xz/xz_crc32.c \
optional xz_embedded | geom_uzip \
compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
contrib/xz-embedded/linux/lib/xz/xz_dec_bcj.c \
optional xz_embedded | geom_uzip \
compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
contrib/xz-embedded/linux/lib/xz/xz_dec_lzma2.c \
optional xz_embedded | geom_uzip \
compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
contrib/xz-embedded/linux/lib/xz/xz_dec_stream.c \
optional xz_embedded | geom_uzip \
compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
geom/uzip/g_uzip.c optional geom_uzip
geom/uzip/g_uzip_lzma.c optional geom_uzip
geom/uzip/g_uzip_wrkthr.c optional geom_uzip
geom/uzip/g_uzip_zlib.c optional geom_uzip
geom/vinum/geom_vinum.c optional geom_vinum
geom/vinum/geom_vinum_create.c optional geom_vinum
geom/vinum/geom_vinum_drive.c optional geom_vinum
geom/vinum/geom_vinum_plex.c optional geom_vinum
geom/vinum/geom_vinum_volume.c optional geom_vinum
geom/vinum/geom_vinum_subr.c optional geom_vinum
geom/vinum/geom_vinum_raid5.c optional geom_vinum
geom/vinum/geom_vinum_share.c optional geom_vinum
geom/vinum/geom_vinum_list.c optional geom_vinum
geom/vinum/geom_vinum_rm.c optional geom_vinum
geom/vinum/geom_vinum_init.c optional geom_vinum
geom/vinum/geom_vinum_state.c optional geom_vinum
geom/vinum/geom_vinum_rename.c optional geom_vinum
geom/vinum/geom_vinum_move.c optional geom_vinum
geom/vinum/geom_vinum_events.c optional geom_vinum
geom/virstor/binstream.c optional geom_virstor
geom/virstor/g_virstor.c optional geom_virstor
geom/virstor/g_virstor_md.c optional geom_virstor
geom/zero/g_zero.c optional geom_zero
fs/ext2fs/ext2_acl.c optional ext2fs
fs/ext2fs/ext2_alloc.c optional ext2fs
fs/ext2fs/ext2_balloc.c optional ext2fs
fs/ext2fs/ext2_bmap.c optional ext2fs
fs/ext2fs/ext2_csum.c optional ext2fs
fs/ext2fs/ext2_extattr.c optional ext2fs
fs/ext2fs/ext2_extents.c optional ext2fs
fs/ext2fs/ext2_inode.c optional ext2fs
fs/ext2fs/ext2_inode_cnv.c optional ext2fs
fs/ext2fs/ext2_hash.c optional ext2fs
fs/ext2fs/ext2_htree.c optional ext2fs
fs/ext2fs/ext2_lookup.c optional ext2fs
fs/ext2fs/ext2_subr.c optional ext2fs
fs/ext2fs/ext2_vfsops.c optional ext2fs
fs/ext2fs/ext2_vnops.c optional ext2fs
#
isa/isa_if.m standard
isa/isa_common.c optional isa
isa/isahint.c optional isa
isa/pnp.c optional isa isapnp
isa/pnpparse.c optional isa isapnp
fs/cd9660/cd9660_bmap.c optional cd9660
fs/cd9660/cd9660_lookup.c optional cd9660
fs/cd9660/cd9660_node.c optional cd9660
fs/cd9660/cd9660_rrip.c optional cd9660
fs/cd9660/cd9660_util.c optional cd9660
fs/cd9660/cd9660_vfsops.c optional cd9660
fs/cd9660/cd9660_vnops.c optional cd9660
fs/cd9660/cd9660_iconv.c optional cd9660_iconv
kern/bus_if.m standard
kern/clock_if.m standard
kern/cpufreq_if.m standard
kern/device_if.m standard
kern/imgact_binmisc.c optional imagact_binmisc
kern/imgact_elf.c standard
kern/imgact_elf32.c optional compat_freebsd32
kern/imgact_shell.c standard
kern/inflate.c optional gzip
kern/init_main.c standard
kern/init_sysent.c standard
kern/ksched.c optional _kposix_priority_scheduling
kern/kern_acct.c standard
kern/kern_alq.c optional alq
kern/kern_clock.c standard
kern/kern_condvar.c standard
kern/kern_conf.c standard
kern/kern_cons.c standard
kern/kern_cpu.c standard
kern/kern_cpuset.c standard
kern/kern_context.c standard
kern/kern_descrip.c standard
kern/kern_dtrace.c optional kdtrace_hooks
kern/kern_dump.c standard
kern/kern_environment.c standard
kern/kern_et.c standard
kern/kern_event.c standard
kern/kern_exec.c standard
kern/kern_exit.c standard
kern/kern_fail.c standard
kern/kern_ffclock.c standard
kern/kern_fork.c standard
kern/kern_gzio.c optional gzio
kern/kern_hhook.c standard
kern/kern_idle.c standard
kern/kern_intr.c standard
kern/kern_jail.c standard
kern/kern_khelp.c standard
kern/kern_kthread.c standard
kern/kern_ktr.c optional ktr
kern/kern_ktrace.c standard
kern/kern_linker.c standard
kern/kern_lock.c standard
kern/kern_lockf.c standard
kern/kern_lockstat.c optional kdtrace_hooks
kern/kern_loginclass.c standard
kern/kern_malloc.c standard
kern/kern_mbuf.c standard
kern/kern_mib.c standard
kern/kern_module.c standard
kern/kern_mtxpool.c standard
kern/kern_mutex.c standard
kern/kern_ntptime.c standard
kern/kern_numa.c standard
kern/kern_osd.c standard
kern/kern_physio.c standard
kern/kern_pmc.c standard
kern/kern_poll.c optional device_polling
kern/kern_priv.c standard
kern/kern_proc.c standard
kern/kern_procctl.c standard
kern/kern_prot.c standard
kern/kern_racct.c standard
kern/kern_rangelock.c standard
kern/kern_rctl.c standard
kern/kern_resource.c standard
kern/kern_rmlock.c standard
kern/kern_rwlock.c standard
kern/kern_sdt.c optional kdtrace_hooks
kern/kern_sema.c standard
kern/kern_sendfile.c standard
kern/kern_sharedpage.c standard
kern/kern_shutdown.c standard
kern/kern_sig.c standard
kern/kern_switch.c standard
kern/kern_sx.c standard
kern/kern_synch.c standard
kern/kern_syscalls.c standard
kern/kern_sysctl.c standard
kern/kern_tc.c standard
kern/kern_thr.c standard
kern/kern_thread.c standard
kern/kern_time.c standard
kern/kern_timeout.c standard
kern/kern_umtx.c standard
kern/kern_uuid.c standard
kern/kern_xxx.c standard
kern/link_elf.c standard
kern/linker_if.m standard
kern/md4c.c optional netsmb
kern/md5c.c standard
kern/p1003_1b.c standard
kern/posix4_mib.c standard
kern/sched_4bsd.c optional sched_4bsd
kern/sched_ule.c optional sched_ule
kern/serdev_if.m standard
kern/stack_protector.c standard \
compile-with "${NORMAL_C:N-fstack-protector*}"
kern/subr_acl_nfs4.c optional ufs_acl | zfs
kern/subr_acl_posix1e.c optional ufs_acl
kern/subr_autoconf.c standard
kern/subr_blist.c standard
kern/subr_bus.c standard
kern/subr_bus_dma.c standard
kern/subr_bufring.c standard
kern/subr_capability.c standard
kern/subr_clock.c standard
kern/subr_counter.c standard
kern/subr_devstat.c standard
kern/subr_disk.c standard
kern/subr_eventhandler.c standard
kern/subr_fattime.c standard
kern/subr_firmware.c optional firmware
kern/subr_gtaskqueue.c standard
kern/subr_hash.c standard
kern/subr_hints.c standard
kern/subr_kdb.c standard
kern/subr_kobj.c standard
kern/subr_lock.c standard
kern/subr_log.c standard
kern/subr_mbpool.c optional libmbpool
kern/subr_mchain.c optional libmchain
kern/subr_module.c standard
kern/subr_msgbuf.c standard
kern/subr_param.c standard
kern/subr_pcpu.c standard
kern/subr_pctrie.c standard
kern/subr_power.c standard
kern/subr_prf.c standard
kern/subr_prof.c standard
kern/subr_rman.c standard
kern/subr_rtc.c standard
kern/subr_sbuf.c standard
kern/subr_scanf.c standard
kern/subr_sglist.c standard
kern/subr_sleepqueue.c standard
kern/subr_smp.c standard
kern/subr_stack.c optional ddb | stack | ktr
kern/subr_taskqueue.c standard
kern/subr_terminal.c optional vt
kern/subr_trap.c standard
kern/subr_turnstile.c standard
kern/subr_uio.c standard
kern/subr_unit.c standard
kern/subr_vmem.c standard
kern/subr_witness.c optional witness
kern/sys_capability.c standard
kern/sys_generic.c standard
kern/sys_pipe.c standard
kern/sys_procdesc.c standard
kern/sys_process.c standard
kern/sys_socket.c standard
kern/syscalls.c standard
kern/sysv_ipc.c standard
kern/sysv_msg.c optional sysvmsg
kern/sysv_sem.c optional sysvsem
kern/sysv_shm.c optional sysvshm
kern/tty.c standard
kern/tty_compat.c optional compat_43tty
kern/tty_info.c standard
kern/tty_inq.c standard
kern/tty_outq.c standard
kern/tty_pts.c standard
kern/tty_tty.c standard
kern/tty_ttydisc.c standard
kern/uipc_accf.c standard
kern/uipc_debug.c optional ddb
kern/uipc_domain.c standard
kern/uipc_mbuf.c standard
kern/uipc_mbuf2.c standard
kern/uipc_mbufhash.c standard
kern/uipc_mqueue.c optional p1003_1b_mqueue
kern/uipc_sem.c optional p1003_1b_semaphores
kern/uipc_shm.c standard
kern/uipc_sockbuf.c standard
kern/uipc_socket.c standard
kern/uipc_syscalls.c standard
kern/uipc_usrreq.c standard
kern/vfs_acl.c standard
kern/vfs_aio.c standard
kern/vfs_bio.c standard
kern/vfs_cache.c standard
kern/vfs_cluster.c standard
kern/vfs_default.c standard
kern/vfs_export.c standard
kern/vfs_extattr.c standard
kern/vfs_hash.c standard
kern/vfs_init.c standard
kern/vfs_lookup.c standard
kern/vfs_mount.c standard
kern/vfs_mountroot.c standard
kern/vfs_subr.c standard
kern/vfs_syscalls.c standard
kern/vfs_vnops.c standard
#
# Kernel GSS-API
#
gssd.h optional kgssapi \
dependency "$S/kgssapi/gssd.x" \
compile-with "RPCGEN_CPP='${CPP}' rpcgen -hM $S/kgssapi/gssd.x | grep -v pthread.h > gssd.h" \
no-obj no-implicit-rule before-depend local \
clean "gssd.h"
gssd_xdr.c optional kgssapi \
dependency "$S/kgssapi/gssd.x gssd.h" \
compile-with "RPCGEN_CPP='${CPP}' rpcgen -c $S/kgssapi/gssd.x -o gssd_xdr.c" \
no-implicit-rule before-depend local \
clean "gssd_xdr.c"
gssd_clnt.c optional kgssapi \
dependency "$S/kgssapi/gssd.x gssd.h" \
compile-with "RPCGEN_CPP='${CPP}' rpcgen -lM $S/kgssapi/gssd.x | grep -v string.h > gssd_clnt.c" \
no-implicit-rule before-depend local \
clean "gssd_clnt.c"
kgssapi/gss_accept_sec_context.c optional kgssapi
kgssapi/gss_add_oid_set_member.c optional kgssapi
kgssapi/gss_acquire_cred.c optional kgssapi
kgssapi/gss_canonicalize_name.c optional kgssapi
kgssapi/gss_create_empty_oid_set.c optional kgssapi
kgssapi/gss_delete_sec_context.c optional kgssapi
kgssapi/gss_display_status.c optional kgssapi
kgssapi/gss_export_name.c optional kgssapi
kgssapi/gss_get_mic.c optional kgssapi
kgssapi/gss_init_sec_context.c optional kgssapi
kgssapi/gss_impl.c optional kgssapi
kgssapi/gss_import_name.c optional kgssapi
kgssapi/gss_names.c optional kgssapi
kgssapi/gss_pname_to_uid.c optional kgssapi
kgssapi/gss_release_buffer.c optional kgssapi
kgssapi/gss_release_cred.c optional kgssapi
kgssapi/gss_release_name.c optional kgssapi
kgssapi/gss_release_oid_set.c optional kgssapi
kgssapi/gss_set_cred_option.c optional kgssapi
kgssapi/gss_test_oid_set_member.c optional kgssapi
kgssapi/gss_unwrap.c optional kgssapi
kgssapi/gss_verify_mic.c optional kgssapi
kgssapi/gss_wrap.c optional kgssapi
kgssapi/gss_wrap_size_limit.c optional kgssapi
kgssapi/gssd_prot.c optional kgssapi
kgssapi/krb5/krb5_mech.c optional kgssapi
kgssapi/krb5/kcrypto.c optional kgssapi
kgssapi/krb5/kcrypto_aes.c optional kgssapi
kgssapi/krb5/kcrypto_arcfour.c optional kgssapi
kgssapi/krb5/kcrypto_des.c optional kgssapi
kgssapi/krb5/kcrypto_des3.c optional kgssapi
kgssapi/kgss_if.m optional kgssapi
kgssapi/gsstest.c optional kgssapi_debug
# These files in libkern/ are those needed by all architectures. Some
# of the files in libkern/ are only needed on some architectures, e.g.,
# libkern/divdi3.c is needed by i386 but not alpha. Also, some of these
# routines may be optimized for a particular platform. In either case,
# the file should be moved to conf/files.<arch> from here.
#
libkern/arc4random.c standard
libkern/asprintf.c standard
libkern/bcd.c standard
libkern/bsearch.c standard
libkern/crc32.c standard
libkern/explicit_bzero.c standard
libkern/fnmatch.c standard
libkern/iconv.c optional libiconv
libkern/iconv_converter_if.m optional libiconv
libkern/iconv_ucs.c optional libiconv
libkern/iconv_xlat.c optional libiconv
libkern/iconv_xlat16.c optional libiconv
libkern/inet_aton.c standard
libkern/inet_ntoa.c standard
libkern/inet_ntop.c standard
libkern/inet_pton.c standard
libkern/jenkins_hash.c standard
libkern/murmur3_32.c standard
libkern/mcount.c optional profiling-routine
libkern/memcchr.c standard
libkern/memchr.c standard
libkern/memcmp.c standard
libkern/memmem.c optional gdb
libkern/qsort.c standard
libkern/qsort_r.c standard
libkern/random.c standard
libkern/scanc.c standard
libkern/strcasecmp.c standard
libkern/strcat.c standard
libkern/strchr.c standard
libkern/strcmp.c standard
libkern/strcpy.c standard
libkern/strcspn.c standard
libkern/strdup.c standard
libkern/strndup.c standard
libkern/strlcat.c standard
libkern/strlcpy.c standard
libkern/strlen.c standard
libkern/strncat.c standard
libkern/strncmp.c standard
libkern/strncpy.c standard
libkern/strnlen.c standard
libkern/strrchr.c standard
libkern/strsep.c standard
libkern/strspn.c standard
libkern/strstr.c standard
libkern/strtol.c standard
libkern/strtoq.c standard
libkern/strtoul.c standard
libkern/strtouq.c standard
libkern/strvalid.c standard
libkern/timingsafe_bcmp.c standard
libkern/zlib.c optional crypto | geom_uzip | ipsec | \
ipsec_support | mxge | netgraph_deflate | ddb_ctf | gzio
net/altq/altq_cbq.c optional altq
net/altq/altq_cdnr.c optional altq
net/altq/altq_codel.c optional altq
net/altq/altq_hfsc.c optional altq
net/altq/altq_fairq.c optional altq
net/altq/altq_priq.c optional altq
net/altq/altq_red.c optional altq
net/altq/altq_rio.c optional altq
net/altq/altq_rmclass.c optional altq
net/altq/altq_subr.c optional altq
net/bpf.c standard
net/bpf_buffer.c optional bpf
net/bpf_jitter.c optional bpf_jitter
net/bpf_filter.c optional bpf | netgraph_bpf
net/bpf_zerocopy.c optional bpf
net/bridgestp.c optional bridge | if_bridge
net/flowtable.c optional flowtable inet | flowtable inet6
net/ieee8023ad_lacp.c optional lagg
net/if.c standard
net/if_arcsubr.c optional arcnet
net/if_atmsubr.c optional atm
net/if_bridge.c optional bridge inet | if_bridge inet
net/if_clone.c standard
net/if_dead.c standard
net/if_debug.c optional ddb
net/if_disc.c optional disc
net/if_edsc.c optional edsc
net/if_enc.c optional enc inet | enc inet6
net/if_epair.c optional epair
net/if_ethersubr.c optional ether
net/if_fddisubr.c optional fddi
net/if_fwsubr.c optional fwip
net/if_gif.c optional gif inet | gif inet6 | \
netgraph_gif inet | netgraph_gif inet6
net/if_gre.c optional gre inet | gre inet6
net/if_ipsec.c optional inet ipsec | inet6 ipsec
net/if_iso88025subr.c optional token
net/if_lagg.c optional lagg
net/if_loop.c optional loop
net/if_llatbl.c standard
net/if_me.c optional me inet
net/if_media.c standard
net/if_mib.c standard
net/if_spppfr.c optional sppp | netgraph_sppp
net/if_spppsubr.c optional sppp | netgraph_sppp
net/if_stf.c optional stf inet inet6
net/if_tun.c optional tun
net/if_tap.c optional tap
net/if_vlan.c optional vlan
net/if_vxlan.c optional vxlan inet | vxlan inet6
net/ifdi_if.m optional ether pci
net/iflib.c optional ether pci
net/mp_ring.c optional ether
net/mppcc.c optional netgraph_mppc_compression
net/mppcd.c optional netgraph_mppc_compression
net/netisr.c standard
net/pfil.c optional ether | inet
net/radix.c standard
net/radix_mpath.c standard
net/raw_cb.c standard
net/raw_usrreq.c standard
net/route.c standard
net/rss_config.c optional inet rss | inet6 rss
net/rtsock.c standard
net/slcompress.c optional netgraph_vjc | sppp | \
netgraph_sppp
net/toeplitz.c optional inet rss | inet6 rss
net/vnet.c optional vimage
net80211/ieee80211.c optional wlan
net80211/ieee80211_acl.c optional wlan wlan_acl
net80211/ieee80211_action.c optional wlan
net80211/ieee80211_ageq.c optional wlan
net80211/ieee80211_adhoc.c optional wlan \
compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_ageq.c optional wlan
net80211/ieee80211_amrr.c optional wlan | wlan_amrr
net80211/ieee80211_crypto.c optional wlan \
compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_crypto_ccmp.c optional wlan wlan_ccmp
net80211/ieee80211_crypto_none.c optional wlan
net80211/ieee80211_crypto_tkip.c optional wlan wlan_tkip
net80211/ieee80211_crypto_wep.c optional wlan wlan_wep
net80211/ieee80211_ddb.c optional wlan ddb
net80211/ieee80211_dfs.c optional wlan
net80211/ieee80211_freebsd.c optional wlan
net80211/ieee80211_hostap.c optional wlan \
compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_ht.c optional wlan
net80211/ieee80211_hwmp.c optional wlan ieee80211_support_mesh
net80211/ieee80211_input.c optional wlan
net80211/ieee80211_ioctl.c optional wlan
net80211/ieee80211_mesh.c optional wlan ieee80211_support_mesh \
compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_monitor.c optional wlan
net80211/ieee80211_node.c optional wlan
net80211/ieee80211_output.c optional wlan
net80211/ieee80211_phy.c optional wlan
net80211/ieee80211_power.c optional wlan
net80211/ieee80211_proto.c optional wlan
net80211/ieee80211_radiotap.c optional wlan
net80211/ieee80211_ratectl.c optional wlan
net80211/ieee80211_ratectl_none.c optional wlan
net80211/ieee80211_regdomain.c optional wlan
net80211/ieee80211_rssadapt.c optional wlan wlan_rssadapt
net80211/ieee80211_scan.c optional wlan
net80211/ieee80211_scan_sta.c optional wlan
net80211/ieee80211_sta.c optional wlan \
compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_superg.c optional wlan ieee80211_support_superg
net80211/ieee80211_scan_sw.c optional wlan
net80211/ieee80211_tdma.c optional wlan ieee80211_support_tdma
net80211/ieee80211_wds.c optional wlan
net80211/ieee80211_xauth.c optional wlan wlan_xauth
net80211/ieee80211_alq.c optional wlan ieee80211_alq
netgraph/atm/ccatm/ng_ccatm.c optional ngatm_ccatm \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/atm/ng_atm.c optional ngatm_atm
netgraph/atm/ngatmbase.c optional ngatm_atmbase \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/atm/sscfu/ng_sscfu.c optional ngatm_sscfu \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/atm/sscop/ng_sscop.c optional ngatm_sscop \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/atm/uni/ng_uni.c optional ngatm_uni \
compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/bluetooth/common/ng_bluetooth.c optional netgraph_bluetooth
netgraph/bluetooth/drivers/bt3c/ng_bt3c_pccard.c optional netgraph_bluetooth_bt3c
netgraph/bluetooth/drivers/h4/ng_h4.c optional netgraph_bluetooth_h4
netgraph/bluetooth/drivers/ubt/ng_ubt.c optional netgraph_bluetooth_ubt usb
netgraph/bluetooth/drivers/ubtbcmfw/ubtbcmfw.c optional netgraph_bluetooth_ubtbcmfw usb
netgraph/bluetooth/hci/ng_hci_cmds.c optional netgraph_bluetooth_hci
netgraph/bluetooth/hci/ng_hci_evnt.c optional netgraph_bluetooth_hci
netgraph/bluetooth/hci/ng_hci_main.c optional netgraph_bluetooth_hci
netgraph/bluetooth/hci/ng_hci_misc.c optional netgraph_bluetooth_hci
netgraph/bluetooth/hci/ng_hci_ulpi.c optional netgraph_bluetooth_hci
netgraph/bluetooth/l2cap/ng_l2cap_cmds.c optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_evnt.c optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_llpi.c optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_main.c optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_misc.c optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_ulpi.c optional netgraph_bluetooth_l2cap
netgraph/bluetooth/socket/ng_btsocket.c optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_hci_raw.c optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_l2cap.c optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_l2cap_raw.c optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_rfcomm.c optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_sco.c optional netgraph_bluetooth_socket
netgraph/netflow/netflow.c optional netgraph_netflow
netgraph/netflow/netflow_v9.c optional netgraph_netflow
netgraph/netflow/ng_netflow.c optional netgraph_netflow
netgraph/ng_UI.c optional netgraph_UI
netgraph/ng_async.c optional netgraph_async
netgraph/ng_atmllc.c optional netgraph_atmllc
netgraph/ng_base.c optional netgraph
netgraph/ng_bpf.c optional netgraph_bpf
netgraph/ng_bridge.c optional netgraph_bridge
netgraph/ng_car.c optional netgraph_car
netgraph/ng_cisco.c optional netgraph_cisco
netgraph/ng_deflate.c optional netgraph_deflate
netgraph/ng_device.c optional netgraph_device
netgraph/ng_echo.c optional netgraph_echo
netgraph/ng_eiface.c optional netgraph_eiface
netgraph/ng_ether.c optional netgraph_ether
netgraph/ng_ether_echo.c optional netgraph_ether_echo
netgraph/ng_frame_relay.c optional netgraph_frame_relay
netgraph/ng_gif.c optional netgraph_gif inet6 | netgraph_gif inet
netgraph/ng_gif_demux.c optional netgraph_gif_demux
netgraph/ng_hole.c optional netgraph_hole
netgraph/ng_iface.c optional netgraph_iface
netgraph/ng_ip_input.c optional netgraph_ip_input
netgraph/ng_ipfw.c optional netgraph_ipfw inet ipfirewall
netgraph/ng_ksocket.c optional netgraph_ksocket
netgraph/ng_l2tp.c optional netgraph_l2tp
netgraph/ng_lmi.c optional netgraph_lmi
netgraph/ng_mppc.c optional netgraph_mppc_compression | \
netgraph_mppc_encryption
netgraph/ng_nat.c optional netgraph_nat inet libalias
netgraph/ng_one2many.c optional netgraph_one2many
netgraph/ng_parse.c optional netgraph
netgraph/ng_patch.c optional netgraph_patch
netgraph/ng_pipe.c optional netgraph_pipe
netgraph/ng_ppp.c optional netgraph_ppp
netgraph/ng_pppoe.c optional netgraph_pppoe
netgraph/ng_pptpgre.c optional netgraph_pptpgre
netgraph/ng_pred1.c optional netgraph_pred1
netgraph/ng_rfc1490.c optional netgraph_rfc1490
netgraph/ng_socket.c optional netgraph_socket
netgraph/ng_split.c optional netgraph_split
netgraph/ng_sppp.c optional netgraph_sppp
netgraph/ng_tag.c optional netgraph_tag
netgraph/ng_tcpmss.c optional netgraph_tcpmss
netgraph/ng_tee.c optional netgraph_tee
netgraph/ng_tty.c optional netgraph_tty
netgraph/ng_vjc.c optional netgraph_vjc
netgraph/ng_vlan.c optional netgraph_vlan
netinet/accf_data.c optional accept_filter_data inet
netinet/accf_dns.c optional accept_filter_dns inet
netinet/accf_http.c optional accept_filter_http inet
netinet/if_atm.c optional atm
netinet/if_ether.c optional inet ether
netinet/igmp.c optional inet
netinet/in.c optional inet
netinet/in_debug.c optional inet ddb
netinet/in_kdtrace.c optional inet | inet6
netinet/ip_carp.c optional inet carp | inet6 carp
netinet/in_fib.c optional inet
netinet/in_gif.c optional gif inet | netgraph_gif inet
netinet/ip_gre.c optional gre inet
netinet/ip_id.c optional inet
netinet/in_jail.c optional inet
netinet/in_mcast.c optional inet
netinet/in_pcb.c optional inet | inet6
netinet/in_pcbgroup.c optional inet pcbgroup | inet6 pcbgroup
netinet/in_proto.c optional inet | inet6
netinet/in_rmx.c optional inet
netinet/in_rss.c optional inet rss
netinet/ip_divert.c optional inet ipdivert ipfirewall
netinet/ip_ecn.c optional inet | inet6
netinet/ip_encap.c optional inet | inet6
netinet/ip_fastfwd.c optional inet
netinet/ip_icmp.c optional inet | inet6
netinet/ip_input.c optional inet
netinet/ip_mroute.c optional mrouting inet
netinet/ip_options.c optional inet
netinet/ip_output.c optional inet
netinet/ip_reass.c optional inet
netinet/raw_ip.c optional inet | inet6
netinet/cc/cc.c optional inet | inet6
netinet/cc/cc_newreno.c optional inet | inet6
netinet/sctp_asconf.c optional inet sctp | inet6 sctp
netinet/sctp_auth.c optional inet sctp | inet6 sctp
netinet/sctp_bsd_addr.c optional inet sctp | inet6 sctp
netinet/sctp_cc_functions.c optional inet sctp | inet6 sctp
netinet/sctp_crc32.c optional inet | inet6
netinet/sctp_indata.c optional inet sctp | inet6 sctp
netinet/sctp_input.c optional inet sctp | inet6 sctp
netinet/sctp_output.c optional inet sctp | inet6 sctp
netinet/sctp_pcb.c optional inet sctp | inet6 sctp
netinet/sctp_peeloff.c optional inet sctp | inet6 sctp
netinet/sctp_ss_functions.c optional inet sctp | inet6 sctp
netinet/sctp_syscalls.c optional inet sctp | inet6 sctp
netinet/sctp_sysctl.c optional inet sctp | inet6 sctp
netinet/sctp_timer.c optional inet sctp | inet6 sctp
netinet/sctp_usrreq.c optional inet sctp | inet6 sctp
netinet/sctputil.c optional inet sctp | inet6 sctp
netinet/siftr.c optional inet siftr alq | inet6 siftr alq
netinet/tcp_debug.c optional tcpdebug
netinet/tcp_fastopen.c optional inet tcp_rfc7413 | inet6 tcp_rfc7413
netinet/tcp_hostcache.c optional inet | inet6
netinet/tcp_input.c optional inet | inet6
netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
netinet/tcp_subr.c optional inet | inet6
netinet/tcp_syncache.c optional inet | inet6
netinet/tcp_timer.c optional inet | inet6
netinet/tcp_timewait.c optional inet | inet6
netinet/tcp_usrreq.c optional inet | inet6
netinet/udp_usrreq.c optional inet | inet6
netinet/libalias/alias.c optional libalias inet | netgraph_nat inet
netinet/libalias/alias_db.c optional libalias inet | netgraph_nat inet
netinet/libalias/alias_mod.c optional libalias | netgraph_nat
netinet/libalias/alias_proxy.c optional libalias inet | netgraph_nat inet
netinet/libalias/alias_util.c optional libalias inet | netgraph_nat inet
netinet/libalias/alias_sctp.c optional libalias inet | netgraph_nat inet
netinet6/dest6.c optional inet6
netinet6/frag6.c optional inet6
netinet6/icmp6.c optional inet6
netinet6/in6.c optional inet6
netinet6/in6_cksum.c optional inet6
netinet6/in6_fib.c optional inet6
netinet6/in6_gif.c optional gif inet6 | netgraph_gif inet6
netinet6/in6_ifattach.c optional inet6
netinet6/in6_jail.c optional inet6
netinet6/in6_mcast.c optional inet6
netinet6/in6_pcb.c optional inet6
netinet6/in6_pcbgroup.c optional inet6 pcbgroup
netinet6/in6_proto.c optional inet6
netinet6/in6_rmx.c optional inet6
netinet6/in6_rss.c optional inet6 rss
netinet6/in6_src.c optional inet6
netinet6/ip6_fastfwd.c optional inet6
netinet6/ip6_forward.c optional inet6
netinet6/ip6_gre.c optional gre inet6
netinet6/ip6_id.c optional inet6
netinet6/ip6_input.c optional inet6
netinet6/ip6_mroute.c optional mrouting inet6
netinet6/ip6_output.c optional inet6
netinet6/mld6.c optional inet6
netinet6/nd6.c optional inet6
netinet6/nd6_nbr.c optional inet6
netinet6/nd6_rtr.c optional inet6
netinet6/raw_ip6.c optional inet6
netinet6/route6.c optional inet6
netinet6/scope6.c optional inet6
netinet6/sctp6_usrreq.c optional inet6 sctp
netinet6/udp6_usrreq.c optional inet6
netipsec/ipsec.c optional ipsec inet | ipsec inet6
netipsec/ipsec_input.c optional ipsec inet | ipsec inet6
netipsec/ipsec_mbuf.c optional ipsec inet | ipsec inet6
netipsec/ipsec_mod.c optional ipsec inet | ipsec inet6
netipsec/ipsec_output.c optional ipsec inet | ipsec inet6
netipsec/ipsec_pcb.c optional ipsec inet | ipsec inet6 | \
ipsec_support inet | ipsec_support inet6
netipsec/key.c optional ipsec inet | ipsec inet6 | \
ipsec_support inet | ipsec_support inet6
netipsec/key_debug.c optional ipsec inet | ipsec inet6 | \
ipsec_support inet | ipsec_support inet6
netipsec/keysock.c optional ipsec inet | ipsec inet6 | \
ipsec_support inet | ipsec_support inet6
netipsec/subr_ipsec.c optional ipsec inet | ipsec inet6 | \
ipsec_support inet | ipsec_support inet6
netipsec/udpencap.c optional ipsec inet
netipsec/xform_ah.c optional ipsec inet | ipsec inet6
netipsec/xform_esp.c optional ipsec inet | ipsec inet6
netipsec/xform_ipcomp.c optional ipsec inet | ipsec inet6
netipsec/xform_tcp.c optional ipsec inet tcp_signature | \
ipsec inet6 tcp_signature | ipsec_support inet tcp_signature | \
ipsec_support inet6 tcp_signature
netnatm/natm.c optional natm
netnatm/natm_pcb.c optional natm
netnatm/natm_proto.c optional natm
netpfil/ipfw/dn_aqm_codel.c optional inet dummynet
netpfil/ipfw/dn_aqm_pie.c optional inet dummynet
netpfil/ipfw/dn_heap.c optional inet dummynet
netpfil/ipfw/dn_sched_fifo.c optional inet dummynet
netpfil/ipfw/dn_sched_fq_codel.c optional inet dummynet
netpfil/ipfw/dn_sched_fq_pie.c optional inet dummynet
netpfil/ipfw/dn_sched_prio.c optional inet dummynet
netpfil/ipfw/dn_sched_qfq.c optional inet dummynet
netpfil/ipfw/dn_sched_rr.c optional inet dummynet
netpfil/ipfw/dn_sched_wf2q.c optional inet dummynet
netpfil/ipfw/ip_dummynet.c optional inet dummynet
netpfil/ipfw/ip_dn_io.c optional inet dummynet
netpfil/ipfw/ip_dn_glue.c optional inet dummynet
netpfil/ipfw/ip_fw2.c optional inet ipfirewall
netpfil/ipfw/ip_fw_bpf.c optional inet ipfirewall
netpfil/ipfw/ip_fw_dynamic.c optional inet ipfirewall \
compile-with "${NORMAL_C} -I$S/contrib/ck/include"
netpfil/ipfw/ip_fw_eaction.c optional inet ipfirewall
netpfil/ipfw/ip_fw_log.c optional inet ipfirewall
netpfil/ipfw/ip_fw_pfil.c optional inet ipfirewall
netpfil/ipfw/ip_fw_sockopt.c optional inet ipfirewall
netpfil/ipfw/ip_fw_table.c optional inet ipfirewall
netpfil/ipfw/ip_fw_table_algo.c optional inet ipfirewall
netpfil/ipfw/ip_fw_table_value.c optional inet ipfirewall
netpfil/ipfw/ip_fw_iface.c optional inet ipfirewall
netpfil/ipfw/ip_fw_nat.c optional inet ipfirewall_nat
netpfil/ipfw/nat64/ip_fw_nat64.c optional inet inet6 ipfirewall \
ipfirewall_nat64
netpfil/ipfw/nat64/nat64lsn.c optional inet inet6 ipfirewall \
ipfirewall_nat64
netpfil/ipfw/nat64/nat64lsn_control.c optional inet inet6 ipfirewall \
ipfirewall_nat64
netpfil/ipfw/nat64/nat64stl.c optional inet inet6 ipfirewall \
ipfirewall_nat64
netpfil/ipfw/nat64/nat64stl_control.c optional inet inet6 ipfirewall \
ipfirewall_nat64
netpfil/ipfw/nat64/nat64_translate.c optional inet inet6 ipfirewall \
ipfirewall_nat64
netpfil/ipfw/nptv6/ip_fw_nptv6.c optional inet inet6 ipfirewall \
ipfirewall_nptv6
netpfil/ipfw/nptv6/nptv6.c optional inet inet6 ipfirewall \
ipfirewall_nptv6
netpfil/ipfw/pmod/ip_fw_pmod.c optional inet ipfirewall_pmod
netpfil/ipfw/pmod/tcpmod.c optional inet ipfirewall_pmod
netpfil/pf/if_pflog.c optional pflog pf inet
netpfil/pf/if_pfsync.c optional pfsync pf inet
netpfil/pf/pf.c optional pf inet
netpfil/pf/pf_if.c optional pf inet
netpfil/pf/pf_ioctl.c optional pf inet
netpfil/pf/pf_lb.c optional pf inet
netpfil/pf/pf_norm.c optional pf inet
netpfil/pf/pf_osfp.c optional pf inet
netpfil/pf/pf_ruleset.c optional pf inet
netpfil/pf/pf_table.c optional pf inet
netpfil/pf/in4_cksum.c optional pf inet
netsmb/smb_conn.c optional netsmb
netsmb/smb_crypt.c optional netsmb
netsmb/smb_dev.c optional netsmb
netsmb/smb_iod.c optional netsmb
netsmb/smb_rq.c optional netsmb
netsmb/smb_smb.c optional netsmb
netsmb/smb_subr.c optional netsmb
netsmb/smb_trantcp.c optional netsmb
netsmb/smb_usr.c optional netsmb
nfs/bootp_subr.c optional bootp nfscl
nfs/krpc_subr.c optional bootp nfscl
nfs/nfs_diskless.c optional nfscl nfs_root
nfs/nfs_fha.c optional nfsd
nfs/nfs_lock.c optional nfscl | nfslockd | nfsd
nfs/nfs_nfssvc.c optional nfscl | nfsd
nlm/nlm_advlock.c optional nfslockd | nfsd
nlm/nlm_prot_clnt.c optional nfslockd | nfsd
nlm/nlm_prot_impl.c optional nfslockd | nfsd
nlm/nlm_prot_server.c optional nfslockd | nfsd
nlm/nlm_prot_svc.c optional nfslockd | nfsd
nlm/nlm_prot_xdr.c optional nfslockd | nfsd
nlm/sm_inter_xdr.c optional nfslockd | nfsd
# Linux Kernel Programming Interface
compat/linuxkpi/common/src/linux_kmod.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_compat.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_current.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_hrtimer.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_kthread.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_lock.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_page.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_pci.c optional compat_linuxkpi pci \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_tasklet.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_idr.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_radix.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_rcu.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C} -I$S/contrib/ck/include"
compat/linuxkpi/common/src/linux_schedule.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_slab.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_usb.c optional compat_linuxkpi usb \
compile-with "${LINUXKPI_C}"
compat/linuxkpi/common/src/linux_work.c optional compat_linuxkpi \
compile-with "${LINUXKPI_C}"
# OpenFabrics Enterprise Distribution (Infiniband)
ofed/drivers/infiniband/core/ib_addr.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_agent.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_cache.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_cm.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_cma.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_cq.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_device.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_fmr_pool.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_iwcm.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_iwpm_msg.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_iwpm_util.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_mad.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_mad_rmpp.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_multicast.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_packer.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_sa_query.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_smi.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_sysfs.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_ucm.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_ucma.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_ud_header.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_umem.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_user_mad.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_uverbs_cmd.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_uverbs_main.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_uverbs_marshall.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/core/ib_verbs.c optional ofed \
compile-with "${OFED_C}"
ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c optional ipoib \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
#ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c optional ipoib \
# compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c optional ipoib \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c optional ipoib \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c optional ipoib \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c optional ipoib \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
#ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c optional ipoib \
# compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c optional sdp inet \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"
ofed/drivers/infiniband/ulp/sdp/sdp_main.c optional sdp inet \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"
ofed/drivers/infiniband/ulp/sdp/sdp_rx.c optional sdp inet \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"
ofed/drivers/infiniband/ulp/sdp/sdp_cma.c optional sdp inet \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"
ofed/drivers/infiniband/ulp/sdp/sdp_tx.c optional sdp inet \
compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"
dev/mthca/mthca_allocator.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_av.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_catas.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_cmd.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_cq.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_eq.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_mad.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_main.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_mcg.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_memfree.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_mr.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_pd.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_profile.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_provider.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_qp.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_reset.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_srq.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mthca/mthca_uar.c optional mthca pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_alias_GUID.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_mcg.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_sysfs.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_cm.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_ah.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_cq.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_doorbell.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_mad.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_main.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_mr.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_qp.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_srq.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_ib/mlx4_ib_wc.c optional mlx4ib pci ofed \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_alloc.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_catas.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_cmd.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_cq.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_eq.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_fw.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_fw_qos.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_icm.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_intf.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_main.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_mcg.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_mr.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_pd.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_port.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_profile.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_qp.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_reset.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_sense.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_srq.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_core/mlx4_resource_tracker.c optional mlx4 pci \
compile-with "${OFED_C}"
dev/mlx4/mlx4_en/mlx4_en_cq.c optional mlx4en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx4/mlx4_en/mlx4_en_main.c optional mlx4en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx4/mlx4_en/mlx4_en_netdev.c optional mlx4en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx4/mlx4_en/mlx4_en_port.c optional mlx4en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx4/mlx4_en/mlx4_en_resources.c optional mlx4en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx4/mlx4_en/mlx4_en_rx.c optional mlx4en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx4/mlx4_en/mlx4_en_tx.c optional mlx4en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_ah.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_cong.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_cq.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_gsi.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_mad.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_main.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_mem.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_mr.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_qp.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_srq.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_ib/mlx5_ib_virt.c optional mlx5ib pci ofed \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_alloc.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_cmd.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_cq.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_crspace.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_diagnostics.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_eq.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_fs_cmd.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_fs_tree.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_fw.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_fwdump.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_fwdump_regmaps.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_health.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_mad.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_main.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_mcg.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_mr.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_pagealloc.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_pd.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_port.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_qp.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_srq.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_transobj.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_uar.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_vport.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_vsc.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_core/mlx5_wq.c optional mlx5 pci \
compile-with "${OFED_C}"
dev/mlx5/mlx5_en/mlx5_en_ethtool.c optional mlx5en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx5/mlx5_en/mlx5_en_main.c optional mlx5en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx5/mlx5_en/mlx5_en_tx.c optional mlx5en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx5/mlx5_en/mlx5_en_flow_table.c optional mlx5en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx5/mlx5_en/mlx5_en_rx.c optional mlx5en pci inet inet6 \
compile-with "${OFED_C}"
dev/mlx5/mlx5_en/mlx5_en_txrx.c optional mlx5en pci inet inet6 \
compile-with "${OFED_C}"
# crypto support
opencrypto/cast.c optional crypto | ipsec | ipsec_support
opencrypto/criov.c optional crypto | ipsec | ipsec_support
opencrypto/crypto.c optional crypto | ipsec | ipsec_support
opencrypto/cryptodev.c optional cryptodev
opencrypto/cryptodev_if.m optional crypto | ipsec | ipsec_support
opencrypto/cryptosoft.c optional crypto | ipsec | ipsec_support
opencrypto/cryptodeflate.c optional crypto | ipsec | ipsec_support
opencrypto/gmac.c optional crypto | ipsec | ipsec_support
opencrypto/gfmult.c optional crypto | ipsec | ipsec_support
opencrypto/rmd160.c optional crypto | ipsec | ipsec_support
opencrypto/skipjack.c optional crypto | ipsec | ipsec_support
opencrypto/xform.c optional crypto | ipsec | ipsec_support
rpc/auth_none.c optional krpc | nfslockd | nfscl | nfsd
rpc/auth_unix.c optional krpc | nfslockd | nfscl | nfsd
rpc/authunix_prot.c optional krpc | nfslockd | nfscl | nfsd
rpc/clnt_bck.c optional krpc | nfslockd | nfscl | nfsd
rpc/clnt_dg.c optional krpc | nfslockd | nfscl | nfsd
rpc/clnt_rc.c optional krpc | nfslockd | nfscl | nfsd
rpc/clnt_vc.c optional krpc | nfslockd | nfscl | nfsd
rpc/getnetconfig.c optional krpc | nfslockd | nfscl | nfsd
rpc/replay.c optional krpc | nfslockd | nfscl | nfsd
rpc/rpc_callmsg.c optional krpc | nfslockd | nfscl | nfsd
rpc/rpc_generic.c optional krpc | nfslockd | nfscl | nfsd
rpc/rpc_prot.c optional krpc | nfslockd | nfscl | nfsd
rpc/rpcb_clnt.c optional krpc | nfslockd | nfscl | nfsd
rpc/rpcb_prot.c optional krpc | nfslockd | nfscl | nfsd
rpc/svc.c optional krpc | nfslockd | nfscl | nfsd
rpc/svc_auth.c optional krpc | nfslockd | nfscl | nfsd
rpc/svc_auth_unix.c optional krpc | nfslockd | nfscl | nfsd
rpc/svc_dg.c optional krpc | nfslockd | nfscl | nfsd
rpc/svc_generic.c optional krpc | nfslockd | nfscl | nfsd
rpc/svc_vc.c optional krpc | nfslockd | nfscl | nfsd
rpc/rpcsec_gss/rpcsec_gss.c optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
rpc/rpcsec_gss/rpcsec_gss_conf.c optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
rpc/rpcsec_gss/rpcsec_gss_misc.c optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
rpc/rpcsec_gss/rpcsec_gss_prot.c optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
rpc/rpcsec_gss/svc_rpcsec_gss.c optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
security/audit/audit.c optional audit
security/audit/audit_arg.c optional audit
security/audit/audit_bsm.c optional audit
security/audit/audit_bsm_klib.c optional audit
security/audit/audit_pipe.c optional audit
security/audit/audit_syscalls.c standard
security/audit/audit_trigger.c optional audit
security/audit/audit_worker.c optional audit
security/audit/bsm_domain.c optional audit
security/audit/bsm_errno.c optional audit
security/audit/bsm_fcntl.c optional audit
security/audit/bsm_socket_type.c optional audit
security/audit/bsm_token.c optional audit
security/mac/mac_audit.c optional mac audit
security/mac/mac_cred.c optional mac
security/mac/mac_framework.c optional mac
security/mac/mac_inet.c optional mac inet | mac inet6
security/mac/mac_inet6.c optional mac inet6
security/mac/mac_label.c optional mac
security/mac/mac_net.c optional mac
security/mac/mac_pipe.c optional mac
security/mac/mac_posix_sem.c optional mac
security/mac/mac_posix_shm.c optional mac
security/mac/mac_priv.c optional mac
security/mac/mac_process.c optional mac
security/mac/mac_socket.c optional mac
security/mac/mac_syscalls.c standard
security/mac/mac_system.c optional mac
security/mac/mac_sysv_msg.c optional mac
security/mac/mac_sysv_sem.c optional mac
security/mac/mac_sysv_shm.c optional mac
security/mac/mac_vfs.c optional mac
security/mac_biba/mac_biba.c optional mac_biba
security/mac_bsdextended/mac_bsdextended.c optional mac_bsdextended
security/mac_bsdextended/ugidfw_system.c optional mac_bsdextended
security/mac_bsdextended/ugidfw_vnode.c optional mac_bsdextended
security/mac_ifoff/mac_ifoff.c optional mac_ifoff
security/mac_lomac/mac_lomac.c optional mac_lomac
security/mac_mls/mac_mls.c optional mac_mls
security/mac_none/mac_none.c optional mac_none
security/mac_partition/mac_partition.c optional mac_partition
security/mac_portacl/mac_portacl.c optional mac_portacl
security/mac_seeotheruids/mac_seeotheruids.c optional mac_seeotheruids
security/mac_stub/mac_stub.c optional mac_stub
security/mac_test/mac_test.c optional mac_test
teken/teken.c optional sc | vt
ufs/ffs/ffs_alloc.c optional ffs
ufs/ffs/ffs_balloc.c optional ffs
ufs/ffs/ffs_inode.c optional ffs
ufs/ffs/ffs_snapshot.c optional ffs
ufs/ffs/ffs_softdep.c optional ffs
ufs/ffs/ffs_subr.c optional ffs
ufs/ffs/ffs_tables.c optional ffs
ufs/ffs/ffs_vfsops.c optional ffs
ufs/ffs/ffs_vnops.c optional ffs
ufs/ffs/ffs_rawread.c optional ffs directio
ufs/ffs/ffs_suspend.c optional ffs
ufs/ufs/ufs_acl.c optional ffs
ufs/ufs/ufs_bmap.c optional ffs
ufs/ufs/ufs_dirhash.c optional ffs
ufs/ufs/ufs_extattr.c optional ffs
ufs/ufs/ufs_gjournal.c optional ffs UFS_GJOURNAL
ufs/ufs/ufs_inode.c optional ffs
ufs/ufs/ufs_lookup.c optional ffs
ufs/ufs/ufs_quota.c optional ffs
ufs/ufs/ufs_vfsops.c optional ffs
ufs/ufs/ufs_vnops.c optional ffs
vm/default_pager.c standard
vm/device_pager.c standard
vm/phys_pager.c standard
vm/redzone.c optional DEBUG_REDZONE
vm/sg_pager.c standard
vm/swap_pager.c standard
vm/uma_core.c standard
vm/uma_dbg.c standard
vm/memguard.c optional DEBUG_MEMGUARD
vm/vm_domain.c standard
vm/vm_fault.c standard
vm/vm_glue.c standard
vm/vm_init.c standard
vm/vm_kern.c standard
vm/vm_map.c standard
vm/vm_meter.c standard
vm/vm_mmap.c standard
vm/vm_object.c standard
vm/vm_page.c standard
vm/vm_pageout.c standard
vm/vm_pager.c standard
vm/vm_phys.c standard
vm/vm_radix.c standard
vm/vm_reserv.c standard
vm/vm_swapout.c optional !NO_SWAPPING
vm/vm_swapout_dummy.c optional NO_SWAPPING
vm/vm_unix.c standard
vm/vm_zeroidle.c standard
vm/vnode_pager.c standard
xen/features.c optional xenhvm
xen/xenbus/xenbus_if.m optional xenhvm
xen/xenbus/xenbus.c optional xenhvm
xen/xenbus/xenbusb_if.m optional xenhvm
xen/xenbus/xenbusb.c optional xenhvm
xen/xenbus/xenbusb_front.c optional xenhvm
xen/xenbus/xenbusb_back.c optional xenhvm
xen/xenmem/xenmem_if.m optional xenhvm
xdr/xdr.c optional krpc | nfslockd | nfscl | nfsd
xdr/xdr_array.c optional krpc | nfslockd | nfscl | nfsd
xdr/xdr_mbuf.c optional krpc | nfslockd | nfscl | nfsd
xdr/xdr_mem.c optional krpc | nfslockd | nfscl | nfsd
xdr/xdr_reference.c optional krpc | nfslockd | nfscl | nfsd
xdr/xdr_sizeof.c optional krpc | nfslockd | nfscl | nfsd
Index: stable/11
===================================================================
--- stable/11 (revision 332524)
+++ stable/11 (revision 332525)
Property changes on: stable/11
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head:r329732

File Metadata

Mime Type
application/octet-stream
Expires
Mon, May 6, 10:42 AM (2 d)
Storage Engine
chunks
Storage Format
Chunks
Storage Handle
2LpEwBq4K3Gd
Default Alt Text
(4 MB)

Event Timeline