Index: usr.sbin/makefs/Makefile
===================================================================
--- usr.sbin/makefs/Makefile
+++ usr.sbin/makefs/Makefile
@@ -13,7 +13,8 @@
 	makefs.c \
 	msdos.c \
 	mtree.c \
-	walk.c
+	walk.c \
+	zfs.c
 MAN=	makefs.8
 
 NO_WCAST_ALIGN=
@@ -22,6 +23,7 @@
 .include "${SRCDIR}/cd9660/Makefile.inc"
 .include "${SRCDIR}/ffs/Makefile.inc"
 .include "${SRCDIR}/msdos/Makefile.inc"
+.include "${SRCDIR}/zfs/Makefile.inc"
 
 CFLAGS+=-DHAVE_STRUCT_STAT_ST_FLAGS=1
 
@@ -36,6 +38,9 @@
 CFLAGS+=	-I${SRCTOP}/lib/libnetbsd
 LIBADD=		netbsd util sbuf
 
+CFLAGS.zfs.c+=	-I${SRCDIR}/zfs \
+	-I${SRCTOP}/sys/cddl/boot/zfs
+
 HAS_TESTS=
 SUBDIR.${MK_TESTS}+=	tests
Index: usr.sbin/makefs/makefs.h
===================================================================
--- usr.sbin/makefs/makefs.h
+++ usr.sbin/makefs/makefs.h
@@ -78,12 +78,14 @@
 	FI_SIZED =	1<<0,		/* inode sized */
 	FI_ALLOCATED =	1<<1,		/* fsinode->ino allocated */
 	FI_WRITTEN =	1<<2,		/* inode written */
+	FI_ROOT =	1<<3,		/* root of a ZFS dataset */
 };
 
 typedef struct {
 	uint32_t	 ino;		/* inode number used on target fs */
 	uint32_t	 nlink;		/* number of links to this entry */
 	enum fi_flags	 flags;		/* flags used by fs specific code */
+	void		*param;		/* for use by individual fs impls */
 	struct stat	 st;		/* stat entry */
 } fsinode;
 
@@ -186,6 +188,7 @@
 DECLARE_FUN(cd9660);
 DECLARE_FUN(ffs);
 DECLARE_FUN(msdos);
+DECLARE_FUN(zfs);
 
 extern	u_int		debug;
 extern	int		dupsok;
Index: usr.sbin/makefs/makefs.8
===================================================================
--- usr.sbin/makefs/makefs.8
+++ usr.sbin/makefs/makefs.8
@@ -35,7 +35,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd September 17, 2020
+.Dd May 18, 2022
 .Dt MAKEFS 8
 .Os
 .Sh NAME
@@ -266,6 +266,8 @@
 ISO 9660 file system.
 .It Sy msdos
 FAT12, FAT16, or FAT32 file system.
+.It Sy zfs
+ZFS pool containing one or more file systems.
 .El
 .It Fl x
 Exclude file system nodes not explicitly listed in the specfile.
@@ -494,10 +496,87 @@
 .It Cm volume_label
 Volume Label.
 .El
+.Ss zfs-specific options
+The image created by
+.Nm
+contains a ZFS pool with a single vdev of type
+.Ql disk .
+The root dataset is always created implicitly and contains the entire input
+directory tree unless additional datasets are specified using the options
+described below.
+.Pp
+The arguments consist of a keyword, an equal sign
+.Pq Ql = ,
+and a value.
+The following keywords are supported:
+.Pp
+.Bl -tag -width omit-trailing-period -offset indent -compact
+.It ashift
+The base-2 logarithm of the minimum block size.
+Typical values are 9 (512B blocks) and 12 (4KB blocks).
+The default value is 12.
+.It bootfs
+The name of the bootable dataset for the pool.
+Specifying this option causes the
+.Ql bootfs
+property to be set in the created pool.
+.It poolname
+The name of the ZFS pool.
+This option must be specified.
+.It rootpath
+An implicit path prefix added to dataset mountpoints.
+By default it is
+.Pa / Ns Ar poolname .
+For creating bootable pools, the
+.Va rootpath
+should be set to
+.Pa / .
+At least one dataset must have a mountpoint equal to
+.Va rootpath .
+.It fs
+Create an additional dataset.
+This option may be specified multiple times.
+The argument value must be of the form
+.Ar dataset Ns Oo : Ns Ar prop1 Ns = Ns Ar val1 Ns Oo : Ns Ar prop2 Ns = Ns Ar val2 Ns Oo : Ns ... Oc Oc Oc ,
+where
+.Ar dataset
+is the name of the dataset and must belong to the pool's namespace.
+For example, with a pool name of
+.Ql test
+all dataset names must be prefixed by
+.Ql test/ .
+A dataset must exist at each level of the pool's namespace.
+For example, to create
+.Ql test/foo/bar ,
+.Ql test/foo
+must be created as well.
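+.Pp
+For example, a command like the following creates a pool named
+.Ql test
+with two additional datasets, the second overriding an inherited property
+(image and directory names here are illustrative):
+.Bd -literal -offset indent
+makefs -t zfs -s 1g -o poolname=test -o rootpath=/ \e
+    -o fs=test/dir1 -o fs=test/dir2:atime=off \e
+    zfs.img ./input
+.Ed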
+.Pp
+The dataset mountpoints determine how the datasets are populated with
+files from the staged directory tree.
+Conceptually, all datasets are mounted before any are populated with files.
+The root of the staged directory tree is mapped to
+.Va rootpath .
+.Pp
+Dataset properties, as described in
+.Xr zfsprops 8 ,
+may be specified following the dataset name.
+The following properties may be set for a dataset:
+.Pp
+.Bl -tag -compact -offset indent
+.It atime
+.It canmount
+.It exec
+.It mountpoint
+.It setuid
+.El
+.El
 .Sh SEE ALSO
 .Xr mtree 5 ,
 .Xr mtree 8 ,
-.Xr newfs 8
+.Xr newfs 8 ,
+.Xr zfsconcepts 8 ,
+.Xr zfsprops 8 ,
+.Xr zpoolprops 8
 .Sh HISTORY
 The
 .Nm
Index: usr.sbin/makefs/makefs.c
===================================================================
--- usr.sbin/makefs/makefs.c
+++ usr.sbin/makefs/makefs.c
@@ -77,6 +77,7 @@
 	ENTRY(cd9660),
 	ENTRY(ffs),
 	ENTRY(msdos),
+	ENTRY(zfs),
 	{ .type = NULL	},
 };
 
@@ -266,7 +267,7 @@
 		break;
 
 	case 'Z':
-		/* Superscedes 'p' for compatibility with NetBSD makefs(8) */
+		/* Supersedes 'p' for compatibility with NetBSD makefs(8) */
 		fsoptions.sparse = 1;
 		break;
 
Index: usr.sbin/makefs/tests/Makefile
===================================================================
--- usr.sbin/makefs/tests/Makefile
+++ usr.sbin/makefs/tests/Makefile
@@ -2,6 +2,7 @@
 
 ATF_TESTS_SH+=	makefs_cd9660_tests
 ATF_TESTS_SH+=	makefs_ffs_tests
+ATF_TESTS_SH+=	makefs_zfs_tests
 
 BINDIR=		${TESTSDIR}
 
@@ -12,7 +13,7 @@
 TEST_METADATA.makefs_cd9660_tests+=	required_files="/sbin/mount_cd9660"
 
 .for t in ${ATF_TESTS_SH}
-TEST_METADATA.$t+=	required_user="root"
+#TEST_METADATA.$t+=	required_user="root"
 .endfor
 
 .include <bsd.test.mk>
Index: usr.sbin/makefs/tests/makefs_zfs_tests.sh
===================================================================
--- /dev/null
+++ usr.sbin/makefs/tests/makefs_zfs_tests.sh
@@ -0,0 +1,521 @@
+#-
+# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+#
+# Copyright (c) 2022 The FreeBSD Foundation
+#
+# This software was developed by Mark Johnston under sponsorship from
+# the FreeBSD Foundation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+MAKEFS="makefs -t zfs"
+ZFS_POOL_NAME="makefstest$(jot -r 1 100000)"
+TEST_ZFS_POOL_NAME="$TMPDIR/poolname"
+
+. 
"$(dirname "$0")/makefs_tests_common.sh" + +common_cleanup() +{ + local pool md + + # Try to force a TXG, this can help catch bugs by triggering a panic. + sync + + pool=$(cat $TEST_ZFS_POOL_NAME) + if zpool list "$pool" >/dev/null; then + zpool destroy "$pool" + fi + + md=$(cat $TEST_MD_DEVICE_FILE) + if [ -c /dev/"$md" ]; then + mdconfig -d -u "$md" + fi +} + +import_image() +{ + atf_check -e empty -o save:$TEST_MD_DEVICE_FILE -s exit:0 \ + mdconfig -a -f $TEST_IMAGE + atf_check -e empty -o empty -s exit:0 \ + zpool import -R $TEST_MOUNT_DIR $ZFS_POOL_NAME + echo "$ZFS_POOL_NAME" > $TEST_ZFS_POOL_NAME +} + +# +# Test with some default layout defined by the common code. +# +atf_test_case basic cleanup +basic_body() +{ + create_test_inputs + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +basic_cleanup() +{ + common_cleanup +} + +atf_test_case dataset_removal cleanup +dataset_removal_body() +{ + create_test_dirs + + cd $TEST_INPUTS_DIR + mkdir dir + cd - + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + atf_check -o empty -e empty -s exit:0 zfs destroy ${ZFS_POOL_NAME}/dir +} +dataset_removal_cleanup() +{ + common_cleanup +} + +# +# Make sure that we can create and remove an empty directory. +# +atf_test_case empty_dir cleanup +empty_dir_body() +{ + create_test_dirs + + cd $TEST_INPUTS_DIR + mkdir dir + cd - + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + atf_check -s exit:0 rmdir ${TEST_MOUNT_DIR}/dir +} +empty_dir_cleanup() +{ + common_cleanup +} + +atf_test_case empty_fs cleanup +empty_fs_body() +{ + create_test_dirs + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +empty_fs_cleanup() +{ + common_cleanup +} + +atf_test_case file_sizes cleanup +file_sizes_body() +{ + local i + + create_test_dirs + cd $TEST_INPUTS_DIR + + i=1 + while [ $i -lt $((1 << 20)) ]; do + truncate -s $i ${i}.1 + truncate -s $(($i - 1)) ${i}.2 + truncate -s $(($i + 1)) ${i}.3 + i=$(($i << 1)) + done + + cd - + + # XXXMJ this creates sparse files, make sure makefs doesn't + # preserve the sparseness. 
+ # XXXMJ need to test with larger files (at least 128MB for L2 indirs) + # XXXMJ try with different ashifts + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +file_sizes_cleanup() +{ + common_cleanup +} + +atf_test_case hard_links cleanup +hard_links_body() +{ + local f + + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir + echo "hello" > 1 + ln 1 2 + ln 1 dir/1 + + echo "goodbye" > dir/a + ln dir/a dir/b + ln dir/a a + + cd - + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + stat -f '%i' ${TEST_MOUNT_DIR}/1 > ./ino + stat -f '%l' ${TEST_MOUNT_DIR}/1 > ./nlink + for f in 1 2 dir/1; do + atf_check -o file:./nlink -e empty -s exit:0 \ + stat -f '%l' ${TEST_MOUNT_DIR}/${f} + atf_check -o file:./ino -e empty -s exit:0 \ + stat -f '%i' ${TEST_MOUNT_DIR}/${f} + atf_check -o empty -e empty -s exit:0 \ + cmp -s ${TEST_INPUTS_DIR}/1 ${TEST_MOUNT_DIR}/${f} + done + + stat -f '%i' ${TEST_MOUNT_DIR}/dir/a > ./ino + stat -f '%l' ${TEST_MOUNT_DIR}/dir/a > ./nlink + for f in dir/a dir/b a; do + atf_check -o file:./nlink -e empty -s exit:0 \ + stat -f '%l' ${TEST_MOUNT_DIR}/${f} + atf_check -o file:./ino -e empty -s exit:0 \ + stat -f '%i' ${TEST_MOUNT_DIR}/${f} + atf_check -o empty -e empty -s exit:0 \ + cmp -s ${TEST_INPUTS_DIR}/dir/a ${TEST_MOUNT_DIR}/${f} + done +} +hard_links_cleanup() +{ + common_cleanup +} + +# Allocate enough dnodes from an object set that the meta dnode needs to use +# indirect blocks. +atf_test_case indirect_dnode_array cleanup +indirect_dnode_array_body() +{ + local i + + create_test_dirs + cd $TEST_INPUTS_DIR + # 512 bytes per dnode, 3*128KB of direct blocks => limit of 768 files. + # XXXMJ actual threshold is much lower + for i in $(seq 1 1000); do + touch $i + done + cd - + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +indirect_dnode_array_cleanup() +{ + common_cleanup +} + +# +# Create some files with long names, so as to test fat ZAP handling. +# +atf_test_case long_file_name cleanup +long_file_name_body() +{ + local dir i + + create_test_dirs + cd $TEST_INPUTS_DIR + + # micro ZAP keys can be at most 50 bytes. + for i in $(seq 1 60); do + touch $(jot -s '' $i 1 1) + done + dir=$(jot -s '' 61 1 1) + mkdir $dir + for i in $(seq 1 60); do + touch ${dir}/$(jot -s '' $i 1 1) + done + + cd - + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + # Add a directory entry in the hope that OpenZFS might catch a bug + # in makefs' fat ZAP encoding. + touch ${TEST_MOUNT_DIR}/foo +} +long_file_name_cleanup() +{ + common_cleanup +} + +# +# Exercise handling of multiple datasets. +# +atf_test_case multi_dataset_1 cleanup +multi_dataset_1_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir1 + echo a > dir1/a + mkdir dir2 + echo b > dir2/b + + cd - + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir1 -o fs=${ZFS_POOL_NAME}/dir2 \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + # Make sure that we have three datasets with the expected mount points. 
+ atf_check -o inline:${ZFS_POOL_NAME}\\n -e empty -s exit:0 \ + zfs list -H -o name ${ZFS_POOL_NAME} + atf_check -o inline:${TEST_MOUNT_DIR}\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME} + + atf_check -o inline:${ZFS_POOL_NAME}/dir1\\n -e empty -s exit:0 \ + zfs list -H -o name ${ZFS_POOL_NAME}/dir1 + atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1 + + atf_check -o inline:${ZFS_POOL_NAME}/dir2\\n -e empty -s exit:0 \ + zfs list -H -o name ${ZFS_POOL_NAME}/dir2 + atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2 +} +multi_dataset_1_cleanup() +{ + common_cleanup +} + +atf_test_case multi_dataset_2 cleanup +multi_dataset_2_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir1 + echo a > dir1/a + mkdir dir2 + echo b > dir2/b + + cd - + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir1:mountpoint=/ \ + -o fs=${ZFS_POOL_NAME}:mountpoint=/dir1 \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +multi_dataset_2_cleanup() +{ + common_cleanup +} + +# +# Rudimentary test to verify that two ZFS images created using the same +# parameters and input hierarchy are byte-identical. In particular, makefs(1) +# does not preserve file access times. +# +atf_test_case reproducible cleanup +reproducible_body() +{ + create_test_inputs + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + ${TEST_IMAGE}.1 $TEST_INPUTS_DIR + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + ${TEST_IMAGE}.2 $TEST_INPUTS_DIR + + # XXX-MJ cmp(1) is really slow + atf_check -o empty -e empty -s exit:0 \ + cmp ${TEST_IMAGE}.1 ${TEST_IMAGE}.2 +} +reproducible_cleanup() +{ +} + +atf_test_case snapshot cleanup +snapshot_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir + echo "hello" > dir/hello + echo "goodbye" > goodbye + + cd - + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + atf_check -o empty -e empty -s exit:0 zfs snapshot ${ZFS_POOL_NAME}@1 +} +snapshot_cleanup() +{ + common_cleanup +} + +atf_test_case soft_links cleanup +soft_links_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir + ln -s a a + ln -s dir/../a a + ln -s dir/b b + echo 'c' > dir + ln -s dir/c c + # XXX-MJ overflows bonus buffer ln -s $(jot -s '' 320 1 1) 1 + + cd - + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +soft_links_cleanup() +{ + common_cleanup +} + +# +# Verify that we can set properties on the root dataset. 
+# +atf_test_case root_props cleanup +root_props_body() +{ + create_test_inputs + + atf_check -o empty -e empty -s exit:0 \ + $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}:atime=off:setuid=off \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + atf_check -o inline:off\\n -e empty -s exit:0 \ + zfs get -H -o value atime $ZFS_POOL_NAME + atf_check -o inline:local\\n -e empty -s exit:0 \ + zfs get -H -o source atime $ZFS_POOL_NAME + atf_check -o inline:off\\n -e empty -s exit:0 \ + zfs get -H -o value setuid $ZFS_POOL_NAME + atf_check -o inline:local\\n -e empty -s exit:0 \ + zfs get -H -o source setuid $ZFS_POOL_NAME +} +root_props_cleanup() +{ + common_cleanup +} + +atf_init_test_cases() +{ + atf_add_test_case basic + atf_add_test_case dataset_removal + atf_add_test_case empty_dir + atf_add_test_case empty_fs + atf_add_test_case file_sizes + atf_add_test_case hard_links + atf_add_test_case indirect_dnode_array + atf_add_test_case long_file_name + atf_add_test_case multi_dataset_1 + atf_add_test_case multi_dataset_2 + # XXX-MJ one to check handling of non-existent mountpoints + # one to check mountpoint "none" + atf_add_test_case reproducible + atf_add_test_case snapshot + atf_add_test_case soft_links + atf_add_test_case root_props + + # XXXMJ tests: + # - test with different ashifts (at least, 9 and 12), different image sizes + # - create datasets in imported pool + # - bootenvs +} Index: usr.sbin/makefs/zfs.c =================================================================== --- /dev/null +++ usr.sbin/makefs/zfs.c @@ -0,0 +1,3322 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "makefs.h" +#include "zfs/nvlist.h" +#include "zfs/zfsimpl.h" + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#include "fletcher.c" +#include "sha256.c" +#pragma clang diagnostic pop + +/* + * XXX-MJ + * - documentation + * - split into multiple files? + * - review checksum algorithm selection (most should likely be "inherit"?) + * - review vdev_space_alloc() + * - review type usage (off_t vs. size_t vs. uint64_t) + * - inconsistency in variable/field naming (how to name a dnode vs dnode id) + * - bootfs property, bootenvs + * - ZFS_SHARES_DIR + */ + +#define MAXBLOCKSHIFT 17 /* 128KB */ +#define MAXBLOCKSIZE ((off_t)(1 << MAXBLOCKSHIFT)) +_Static_assert(MAXBLOCKSIZE == SPA_OLDMAXBLOCKSIZE, ""); +#define MINBLOCKSHIFT 9 /* 512B */ +#define MINBLOCKSIZE ((off_t)(1 << MINBLOCKSHIFT)) +_Static_assert(MINBLOCKSIZE == SPA_MINBLOCKSIZE, ""); +#define MINDEVSIZE ((off_t)SPA_MINDEVSIZE) + +#define INDIR_LEVELS 6 +#define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t)) + +#define VDEV_LABEL_SPACE \ + ((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) +_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, ""); + +typedef struct { + const char *name; + unsigned int id; + uint16_t size; + sa_bswap_type_t bs; +} zfs_sattr_t; + +typedef struct zfs_objset { + objset_phys_t *phys; + off_t osloc; + off_t osblksz; + blkptr_t osbp; /* set in objset_write() */ + + off_t space; /* bytes allocated to this objset */ + + dnode_phys_t *dnodes; /* dnode array */ + uint64_t dnodenextfree; /* dnode ID bump allocator */ + uint64_t dnodecount; /* total number of dnodes */ + off_t dnodeloc; /* preallocated vdev space */ +} zfs_objset_t; + +typedef struct zfs_zap_entry { + char *name; /* entry key, private copy */ + uint64_t hash; /* key hash */ + union { + uint8_t *valp; + uint16_t *val16p; + uint32_t *val32p; + uint64_t *val64p; + }; /* entry value, an integer array */ + uint64_t val64; /* embedded value for a common case */ + size_t intsz; /* array element size; 1, 2, 4 or 8 */ + size_t intcnt; /* array size */ + STAILQ_ENTRY(zfs_zap_entry) next; +} zfs_zap_entry_t; + +typedef struct zfs_zap { + STAILQ_HEAD(, zfs_zap_entry) kvps; + uint64_t hashsalt; /* key hash input */ + unsigned long kvpcnt; /* number of key-value pairs */ + unsigned long chunks; /* count of chunks needed for fat ZAP */ + bool micro; /* can this be a micro ZAP? */ + + dnode_phys_t *dnode; /* backpointer */ + zfs_objset_t *os; /* backpointer */ +} zfs_zap_t; + +struct zfs_dsl_dir; + +typedef struct zfs_dsl_dataset { + zfs_objset_t *os; /* referenced objset, may be null */ + dsl_dataset_phys_t *phys; /* on-disk representation */ + uint64_t dsid; /* DSL dataset dnode */ + + struct zfs_dsl_dir *dir; /* containing parent */ +} zfs_dsl_dataset_t; + +typedef STAILQ_HEAD(zfs_dsl_dir_list, zfs_dsl_dir) zfs_dsl_dir_list_t; + +typedef struct zfs_dsl_dir { + char *fullname; /* full dataset name */ + char *name; /* basename(fullname) */ + dsl_dir_phys_t *phys; /* on-disk representation */ + nvlist_t *propsnv; /* properties saved in propszap */ + + zfs_dsl_dataset_t *headds; /* principal dataset, may be null */ + + uint64_t dirid; /* DSL directory dnode */ + zfs_zap_t propszap; /* dataset properties */ + zfs_zap_t childzap; /* child directories */ + + /* DSL directory tree linkage. 
*/ + struct zfs_dsl_dir *parent; + zfs_dsl_dir_list_t children; + STAILQ_ENTRY(zfs_dsl_dir) next; +} zfs_dsl_dir_t; + +typedef struct zfs_fs { + zfs_objset_t *os; + + /* Offset table for system attributes, indexed by a zpl_attr_t. */ + uint16_t *saoffs; + size_t sacnt; + const zfs_sattr_t *satab; +} zfs_fs_t; + +struct dataset_desc { + char *params; + STAILQ_ENTRY(dataset_desc) next; +}; + +typedef struct { + /* I/O buffer, just for convenience. */ + char filebuf[MAXBLOCKSIZE]; + + /* Pool parameters. */ + const char *poolname; + char *rootpath; /* implicit mount point prefix */ + char *bootfs; /* bootable dataset, pool property */ + int ashift; /* vdev block size */ + STAILQ_HEAD(, dataset_desc) datasets; /* non-root dataset descrs */ + + /* Pool state. */ + uint64_t guid; /* pool and vdev GUID */ + zfs_zap_t poolprops; + + /* MOS state. */ + zfs_objset_t mos; /* meta object set */ + uint64_t objarrid; /* space map object array */ + + /* DSL state. */ + zfs_dsl_dir_t rootdsldir; /* root DSL directory */ + zfs_dsl_dataset_t rootds; + zfs_dsl_dir_t origindsldir; /* $ORIGIN */ + zfs_dsl_dataset_t originds; + zfs_dsl_dataset_t snapds; + zfs_zap_t cloneszap; + zfs_dsl_dir_t freedsldir; /* $FREE */ + zfs_dsl_dir_t mosdsldir; /* $MOS */ + + /* vdev state. */ + int fd; /* vdev disk fd */ + off_t vdevsize; /* vdev size, including labels */ + off_t asize; /* vdev size, excluding labels */ + bitstr_t *spacemap; /* space allocation tracking */ + int spacemapbits; /* one bit per ashift-sized block */ + uint64_t msshift; /* log2(metaslab size) */ + uint64_t mscount; /* number of metaslabs for this vdev */ +} zfs_opt_t; + +static void zap_init(zfs_zap_t *, zfs_objset_t *, dnode_phys_t *); +static void zap_add_uint64(zfs_zap_t *, const char *, uint64_t); +static void zap_add_string(zfs_zap_t *, const char *, const char *); +static void zap_write(zfs_opt_t *, zfs_zap_t *); + +static dnode_phys_t *objset_dnode_lookup(zfs_objset_t *, uint64_t); +static dnode_phys_t *objset_dnode_alloc(zfs_objset_t *, uint8_t, uint64_t *); +static dnode_phys_t *objset_dnode_bonus_alloc(zfs_objset_t *, uint8_t, uint8_t, + uint16_t, uint64_t *); +static off_t objset_space_alloc(zfs_opt_t *, zfs_objset_t *, off_t *); + +static void dsl_dir_init(zfs_opt_t *, const char *, zfs_dsl_dir_t *); +static void dsl_dataset_init(zfs_opt_t *, zfs_dsl_dir_t *, zfs_dsl_dataset_t *); + +static void spacemap_init(zfs_opt_t *); + +struct dnode_cursor { + char inddir[INDIR_LEVELS][MAXBLOCKSIZE]; + off_t indloc; + off_t indspace; + dnode_phys_t *dnode; + off_t dataoff; + off_t datablksz; +}; + +static struct dnode_cursor *dnode_cursor_init(zfs_opt_t *, zfs_objset_t *, + dnode_phys_t *, off_t, off_t); +static blkptr_t *dnode_cursor_next(zfs_opt_t *, struct dnode_cursor *, + off_t); +static void dnode_cursor_finish(zfs_opt_t *, struct dnode_cursor *); + +static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int); + +/* + * The order of the attributes doesn't matter, this is simply the one hard-coded + * by OpenZFS, based on a zdb dump of the SA_REGISTRY table. + */ +typedef enum zpl_attr { + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_GEN, + ZPL_MODE, + ZPL_SIZE, + ZPL_PARENT, + ZPL_LINKS, + ZPL_XATTR, + ZPL_RDEV, + ZPL_FLAGS, + ZPL_UID, + ZPL_GID, + ZPL_PAD, + ZPL_ZNODE_ACL, + ZPL_DACL_COUNT, + ZPL_SYMLINK, + ZPL_SCANSTAMP, + ZPL_DACL_ACES, + ZPL_DXATTR, + ZPL_PROJID, +} zpl_attr_t; + +/* + * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t. 
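+ * Entries with a size of zero (ZPL_SYMLINK, ZPL_DACL_ACES, ZPL_DXATTR)
+ * are variable-length attributes; their per-object lengths are recorded
+ * in the SA header rather than in this table.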
+ */ +static const zfs_sattr_t zpl_attrs[] = { +#define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b } + _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL), + _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY), +#undef ZPL_ATTR +}; + +/* + * This layout matches that of a filesystem created using OpenZFS on FreeBSD. + * It need not match in general, but FreeBSD's loader doesn't bother parsing the + * layout and just hard-codes attribute offsets. + */ +static const sa_attr_type_t zpl_attr_layout[] = { + ZPL_MODE, + ZPL_SIZE, + ZPL_GEN, + ZPL_UID, + ZPL_GID, + ZPL_PARENT, + ZPL_FLAGS, + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_LINKS, + ZPL_DACL_COUNT, + ZPL_DACL_ACES, + ZPL_SYMLINK, +}; + +/* + * Keys for the ZPL attribute tables in the SA layout ZAP. The first two + * indices are reserved for legacy attribute encoding. + */ +#define SA_LAYOUT_INDEX_DEFAULT 2 +#define SA_LAYOUT_INDEX_SYMLINK 3 + +void +zfs_prep_opts(fsinfo_t *fsopts) +{ + zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs)); + + const option_t zfs_options[] = { + { '\0', "bootfs", &zfs->bootfs, OPT_STRPTR, + 0, 0, "Bootable dataset" }, + { '\0', "poolname", &zfs->poolname, OPT_STRPTR, + 0, 0, "ZFS pool name" }, + { '\0', "rootpath", &zfs->rootpath, OPT_STRPTR, + 0, 0, "Prefix for all dataset mount points" }, + { '\0', "ashift", &zfs->ashift, OPT_INT32, + MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" }, + { .name = NULL } + }; + + /* Set some default values. */ + zfs->ashift = 12; + + STAILQ_INIT(&zfs->datasets); + + fsopts->fs_specific = zfs; + fsopts->fs_options = copy_opts(zfs_options); +} + +int +zfs_parse_opts(const char *option, fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + struct dataset_desc *dsdesc; + char buf[BUFSIZ], *opt, *val; + int rv; + + zfs = fsopts->fs_specific; + + opt = val = estrdup(option); + opt = strsep(&val, "="); + if (strcmp(opt, "fs") == 0) { + if (val == NULL) + errx(1, "invalid filesystem parameters `%s'", option); + + /* + * Dataset descriptions will be parsed later, in dsl_init(). + * Just stash them away for now. + */ + dsdesc = ecalloc(1, sizeof(*dsdesc)); + dsdesc->params = estrdup(val); + free(opt); + STAILQ_INSERT_TAIL(&zfs->datasets, dsdesc, next); + return (1); + } + free(opt); + + rv = set_option(fsopts->fs_options, option, buf, sizeof(buf)); + return (rv == -1 ? 
0 : 1); +} + +static void +zfs_check_opts(fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + + zfs = fsopts->fs_specific; + + if (fsopts->offset != 0) + errx(1, "unhandled offset option"); + if (zfs->poolname == NULL) + errx(1, "a pool name must be specified"); + if (zfs->rootpath == NULL) + easprintf(&zfs->rootpath, "/%s", zfs->poolname); + if (zfs->rootpath[0] != '/') + errx(1, "mountpoint `%s' must be absolute", zfs->rootpath); +} + +void +zfs_cleanup_opts(fsinfo_t *fsopts) +{ + struct dataset_desc *d, *tmp; + zfs_opt_t *zfs; + + zfs = fsopts->fs_specific; + free(zfs->rootpath); + free(zfs->bootfs); + free(__DECONST(void *, zfs->poolname)); + STAILQ_FOREACH_SAFE(d, &zfs->datasets, next, tmp) { + free(d->params); + free(d); + } + free(zfs); + free(fsopts->fs_options); +} + +static int +nvlist_find_string(nvlist_t *nvl, const char *key, char **retp) +{ + char *str; + int error, len; + + error = nvlist_find(nvl, key, DATA_TYPE_STRING, NULL, &str, &len); + if (error == 0) { + *retp = ecalloc(1, len + 1); + memcpy(*retp, str, len); + } + return (error); +} + +static int +nvlist_find_uint64(nvlist_t *nvl, const char *key, uint64_t *retp) +{ + return (nvlist_find(nvl, key, DATA_TYPE_UINT64, NULL, retp, NULL)); +} + +static size_t +nvlist_size(const nvlist_t *nvl) +{ + return (sizeof(nvl->nv_header) + nvl->nv_size); +} + +static void +nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz) +{ + assert(sz >= nvlist_size(nvl)); + + memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header)); + memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size); +} + +static void +blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level, + uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum) +{ + dva_t *dva; + + assert(powerof2(size)); + + BP_ZERO(bp); + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_CHECKSUM(bp, cksumt); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LEVEL(bp, level); + BP_SET_FILL(bp, fill); + BP_SET_TYPE(bp, dntype); + + dva = BP_IDENTITY(bp); + DVA_SET_VDEV(dva, 0); + DVA_SET_OFFSET(dva, off); + DVA_SET_ASIZE(dva, size); + memcpy(&bp->blk_cksum, cksum, sizeof(*cksum)); +} + +static void +vdev_init(zfs_opt_t *zfs, size_t size, const char *image) +{ + assert(zfs->ashift >= MINBLOCKSHIFT); + + zfs->vdevsize = rounddown2(size, 1 << zfs->ashift); + if (zfs->vdevsize < MINDEVSIZE) { + errx(1, "Maximum image size %ju is too small", + (uintmax_t)zfs->vdevsize); + } + zfs->asize = zfs->vdevsize - VDEV_LABEL_SPACE; + + zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (zfs->fd == -1) + err(1, "Can't open `%s' for writing", image); + if (ftruncate(zfs->fd, zfs->vdevsize) != 0) + err(1, "Failed to extend image file `%s'", image); + + spacemap_init(zfs); +} + +static void +vdev_fini(zfs_opt_t *zfs) +{ + assert(zfs->spacemap == NULL); + + if (zfs->fd != -1) { + if (close(zfs->fd) != 0) + err(1, "close"); + zfs->fd = -1; + } +} + +/* + * Write a block of data to the vdev. The offset is always relative to the end + * of the second leading vdev label. + * + * Consumers should generally use the helpers below, which provide block + * pointers and update dnode accounting, rather than calling this function + * directly. 
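+ *
+ * For example, with the standard label layout a write at logical offset 0
+ * lands at physical offset VDEV_LABEL_START_SIZE (4MB: two 256KB leading
+ * labels plus the 3.5MB boot block region); the offset adjustment in the
+ * body implements exactly this mapping.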
+ */ +static void +vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off) +{ + ssize_t n; + + assert(off >= 0 && off < zfs->asize); + assert(powerof2(len)); + assert((off_t)len > 0 && off + (off_t)len > off && + off + (off_t)len < zfs->asize); + if (zfs->spacemap != NULL) { + /* + * Verify that the blocks being written were in fact allocated. + * + * The space map isn't available once the on-disk space map is + * finalized, so this check doesn't quite catch everything. + */ + assert(bit_ntest(zfs->spacemap, off >> zfs->ashift, + (off + len - 1) >> zfs->ashift, 1)); + } + + off += VDEV_LABEL_START_SIZE; + for (size_t sofar = 0; sofar < len; sofar += n) { + n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar, + off + sofar); + if (n < 0) + err(1, "pwrite"); + assert(n > 0); + } +} + +static void +vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype, + uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc, + blkptr_t *bp) +{ + zio_cksum_t cksum; + + assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4); + + fletcher_4_native(data, sz, NULL, &cksum); + blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum); + vdev_pwrite(zfs, data, sz, loc); +} + +static void +vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level, + uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp) +{ + vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill, + data, sz, loc, bp); + + assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0); + dnode->dn_used += sz; +} + +static void +vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data, + off_t sz, off_t loc) +{ + vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc, + &dnode->dn_blkptr[0]); +} + +static void +vdev_label_set_checksum(void *buf, off_t off, off_t size) +{ + zio_cksum_t cksum; + zio_eck_t *eck; + + assert(size > 0 && (size_t)size >= sizeof(zio_eck_t)); + + eck = (zio_eck_t *)((char *)buf + size) - 1; + eck->zec_magic = ZEC_MAGIC; + ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0); + zio_checksum_SHA256(buf, size, NULL, &cksum); + eck->zec_cksum = cksum; +} + +/* + * Set embedded checksums and write the label at the specified index. + */ +static void +vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp) +{ + vdev_label_t *label; + ssize_t n; + off_t blksz, loff; + + assert(ind >= 0 && ind < VDEV_LABELS); + + /* + * Make a copy since we have to modify the label to set checksums. + */ + label = ecalloc(1, sizeof(*label)); + memcpy(label, labelp, sizeof(*label)); + + if (ind < 2) + loff = ind * sizeof(*label); + else + loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label); + + /* + * Set the verifier checksum for the boot block. We don't use it, but + * the FreeBSD loader reads it and will complain if the checksum isn't + * valid. + */ + vdev_label_set_checksum(&label->vl_be, + loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be)); + + /* + * Set the verifier checksum for the label. + */ + vdev_label_set_checksum(&label->vl_vdev_phys, + loff + __offsetof(vdev_label_t, vl_vdev_phys), + sizeof(label->vl_vdev_phys)); + + /* + * Set the verifier checksum for the uberblocks. There is one uberblock + * per sector; for example, with an ashift of 12 we end up with + * 128KB/4KB=32 copies of the uberblock in the ring. 
+ */ + blksz = 1 << zfs->ashift; + assert(sizeof(label->vl_uberblock) % blksz == 0); + for (size_t roff = 0; roff < sizeof(label->vl_uberblock); + roff += blksz) { + vdev_label_set_checksum(&label->vl_uberblock[0] + roff, + loff + __offsetof(vdev_label_t, vl_uberblock) + roff, + blksz); + } + + n = pwrite(zfs->fd, label, sizeof(*label), loff); + if (n < 0) + err(1, "writing vdev label"); + assert(n == sizeof(*label)); + + free(label); +} + +/* + * Find a chunk of contiguous free space of length *lenp, according to the + * following rules: + * 1. If the length is less than or equal to 128KB, the returned run's length + * will be the smallest power of 2 equal to or larger than the length. + * 2. If the length is larger than 128KB, the returned run's length will be + * the smallest multiple of 128KB that is larger than the length. + * 3. The returned run's length will be size-aligned up to 128KB. + * + * XXX-MJ the third rule isn't actually required, so this can just be a dumb + * bump allocator. Maybe there's some benefit to keeping large blocks aligned, + * so let's keep it for now and hope we don't get too much fragmentation. + * Alternately we could try to allocate all blocks of a certain size from the + * same metaslab. + */ +static off_t +vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp) +{ + off_t len; + int align, loc, minblksz, nbits; + + minblksz = 1 << zfs->ashift; + len = roundup2(*lenp, minblksz); + + assert(len != 0); + assert(len / minblksz <= INT_MAX); + + if (len < MAXBLOCKSIZE) { + if ((len & (len - 1)) != 0) + len = (off_t)1 << flsll(len); + align = len / minblksz; + } else { + len = roundup2(len, MAXBLOCKSIZE); + align = MAXBLOCKSIZE / minblksz; + } + + for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) { + bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits, + &loc); + if (loc == -1) { + errx(1, "failed to find %ju bytes of space", + (uintmax_t)len); + } + if ((loc & (align - 1)) == 0) + break; + } + assert(loc + nbits > loc); + bit_nset(zfs->spacemap, loc, loc + nbits - 1); + *lenp = len; + + return ((off_t)loc << zfs->ashift); +} + +static void +spacemap_init(zfs_opt_t *zfs) +{ + uint64_t msshift, nbits, slabs; + + nbits = zfs->asize >> zfs->ashift; + if (nbits > INT_MAX) { + /* + * With the smallest block size of 512B, the limit on the image + * size is 2TB. That should be enough for anyone. + */ + errx(1, "image size is too large"); + } + zfs->spacemapbits = (int)nbits; + zfs->spacemap = bit_alloc(zfs->spacemapbits); + if (zfs->spacemap == NULL) + err(1, "bitstring allocation failed"); + + /* + * Try to choose a metaslab size that gives us a "reasonable" number of + * metaslabs. OpenZFS seems to expect at least 2. + * + * This is simplistic since we expect the pool to be autoexpanded upon + * first use, so OpenZFS will have to reorganize things anyway. 
+ */ + for (msshift = 24 /* 16MB */; msshift < 34 /* 16GB */; msshift++) { + slabs = zfs->asize / ((uint64_t)1 << msshift); + if (slabs >= 4 && slabs <= 200) + break; + } + if (msshift == 34) { + errx(1, + "failed to find a metaslab size, image size is too large"); + } + + zfs->msshift = msshift; + zfs->mscount = slabs; +} + +static void +spacemap_write(zfs_opt_t *zfs) +{ + dnode_phys_t *objarr; + zfs_objset_t *mos; + bitstr_t *spacemap; + uint64_t *objarrblk; + off_t smblksz, objarrblksz, objarrloc; + + struct { + dnode_phys_t *dnode; + uint64_t dnid; + off_t loc; + } *sma; + + mos = &zfs->mos; + + objarrblksz = sizeof(uint64_t) * zfs->mscount; + assert(objarrblksz <= MAXBLOCKSIZE); + objarrloc = objset_space_alloc(zfs, mos, &objarrblksz); + objarrblk = ecalloc(1, objarrblksz); + + objarr = objset_dnode_lookup(mos, zfs->objarrid); + objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT; + + /* + * Use the smallest block size for space maps. The space allocation + * algorithm should aim to minimize the number of holes. + */ + smblksz = 1 << zfs->ashift; + + /* + * First allocate dnodes and space for all of our space maps. No more + * space can be allocated from the vdev after this point. + */ + sma = ecalloc(zfs->mscount, sizeof(*sma)); + for (uint64_t i = 0; i < zfs->mscount; i++) { + sma[i].dnode = objset_dnode_bonus_alloc(mos, DMU_OT_SPACE_MAP, + DMU_OT_SPACE_MAP_HEADER, SPACE_MAP_SIZE_V0, &sma[i].dnid); + sma[i].loc = objset_space_alloc(zfs, mos, &smblksz); + } + spacemap = zfs->spacemap; + zfs->spacemap = NULL; + + /* + * Now that the set of allocated space is finalized, populate each space + * map and write it to the vdev. + */ + for (uint64_t i = 0; i < zfs->mscount; i++) { + space_map_phys_t *sm; + uint64_t alloc, length, *smblk; + int shift, startb, endb, srunb, erunb; + + /* + * We only allocate a single block for this space map, but OpenZFS + * assumes that a space map object with sufficient bonus space supports + * histograms. + */ + sma[i].dnode->dn_nblkptr = 3; + sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT; + + smblk = ecalloc(1, smblksz); + + alloc = length = 0; + shift = zfs->msshift - zfs->ashift; + for (srunb = startb = i * (1 << shift), + endb = (i + 1) * (1 << shift); + srunb < endb; srunb = erunb) { + uint64_t runlen, runoff; + + /* Find a run of allocated space. */ + bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb); + if (srunb == -1 || srunb >= endb) + break; + + bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb); + if (erunb == -1 || erunb > endb) + erunb = endb; + + /* + * The space represented by [srunb, erunb) has been + * allocated. Add a record to the space map to indicate + * this. Run offsets are relative to the beginning of + * the metaslab. + */ + runlen = erunb - srunb; + runoff = srunb - startb; + + assert(length * sizeof(uint64_t) < (uint64_t)smblksz); + smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) | + SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0); + smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) | + SM2_OFFSET_ENCODE(runoff); + + alloc += runlen << zfs->ashift; + length += 2; + } + + sm = DN_BONUS(sma[i].dnode); + sm->smp_length = length * sizeof(uint64_t); + sm->smp_alloc = alloc; + + vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz, + sma[i].loc); + free(smblk); + + /* Record this space map in the space map object array. */ + objarrblk[i] = sma[i].dnid; + } + + /* + * All of the space maps are written, now write the object array. 
+ */ + vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc); + free(objarrblk); + + assert(zfs->spacemap == NULL); + free(spacemap); + free(sma); +} + +static void +objset_init(zfs_opt_t *zfs, zfs_objset_t *os, uint64_t type, + uint64_t dnodecount) +{ + dnode_phys_t *mdnode; + off_t blksz; + + /* + * Allocate space on the vdev for the objset and dnode array. For other + * objects we do that only when going to actually write them to the + * vdev, but in this case it simplifies space map accounting to do it + * now. + */ + os->osblksz = sizeof(objset_phys_t); + os->osloc = objset_space_alloc(zfs, os, &os->osblksz); + + /* + * Object ID zero is always reserved for the meta dnode, which is + * embedded in the objset itself. + */ + dnodecount++; + + os->dnodenextfree = 1; + os->dnodecount = dnodecount; + blksz = roundup2(dnodecount * sizeof(dnode_phys_t), DNODE_BLOCK_SIZE); + os->dnodeloc = objset_space_alloc(zfs, os, &blksz); + assert(blksz % DNODE_BLOCK_SIZE == 0); + os->dnodes = ecalloc(1, blksz); + + os->phys = ecalloc(1, os->osblksz); + os->phys->os_type = type; + + mdnode = &os->phys->os_meta_dnode; + mdnode->dn_indblkshift = MAXBLOCKSHIFT; + mdnode->dn_type = DMU_OT_DNODE; + mdnode->dn_bonustype = DMU_OT_NONE; + mdnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4; + mdnode->dn_datablkszsec = DNODE_BLOCK_SIZE >> MINBLOCKSHIFT; + mdnode->dn_nlevels = 1; + for (uint64_t count = dnodecount / DNODES_PER_BLOCK; count > 1; + count /= BLKPTR_PER_INDIR) + mdnode->dn_nlevels++; + mdnode->dn_nblkptr = 1; + mdnode->dn_maxblkid = howmany(dnodecount, DNODES_PER_BLOCK) - 1; + mdnode->dn_flags = DNODE_FLAG_USED_BYTES; +} + +/* + * Write the dnode array and physical object set to disk. + */ +static void +_objset_write(zfs_opt_t *zfs, zfs_objset_t *os, struct dnode_cursor *c) +{ + assert(os->dnodenextfree == os->dnodecount); + + /* + * Write out the dnode array, i.e., the meta-dnode. For some reason its + * data blocks must be 16KB in size no matter how large the array is. + */ + for (uint64_t i = 0; i < os->dnodecount; i += DNODES_PER_BLOCK) { + dnode_phys_t *blk; + uint64_t fill; + off_t loc; + + blk = os->dnodes + i; + loc = os->dnodeloc + i * sizeof(dnode_phys_t); + fill = os->dnodecount - i < DNODES_PER_BLOCK ? + os->dnodecount - i : 0; + + vdev_pwrite_dnode_indir(zfs, &os->phys->os_meta_dnode, + 0, fill, blk, DNODE_BLOCK_SIZE, loc, + dnode_cursor_next(zfs, c, i * sizeof(dnode_phys_t))); + } + dnode_cursor_finish(zfs, c); + free(os->dnodes); + os->dnodes = NULL; + + /* + * Write the object set itself. The saved block pointer will be copied + * into the referencing DSL dataset or the uberblocks. + */ + vdev_pwrite_data(zfs, DMU_OT_OBJSET, ZIO_CHECKSUM_FLETCHER_4, 0, 1, + os->phys, os->osblksz, os->osloc, &os->osbp); +} + +static void +objset_write(zfs_opt_t *zfs, zfs_objset_t *os) +{ + struct dnode_cursor *c; + + c = dnode_cursor_init(zfs, os, &os->phys->os_meta_dnode, + os->dnodecount * sizeof(dnode_phys_t), DNODE_BLOCK_SIZE); + _objset_write(zfs, os, c); +} + +static void +objset_mos_write(zfs_opt_t *zfs) +{ + struct dnode_cursor *c; + zfs_objset_t *mos; + + mos = &zfs->mos; + + /* + * There is a chicken-and-egg problem here: we cannot write space maps + * before we're finished allocating space from the vdev, and we can't + * write the MOS without having allocated space for indirect dnode + * blocks. Thus, rather than lazily allocating indirect blocks for the + * meta-dnode (which would be simpler), they are allocated up-front and + * before writing space maps. 
+ */ + c = dnode_cursor_init(zfs, mos, &mos->phys->os_meta_dnode, + mos->dnodecount * sizeof(dnode_phys_t), DNODE_BLOCK_SIZE); + spacemap_write(zfs); + + /* + * We've finished allocating space, account for it in $MOS. + */ + zfs->mosdsldir.phys->dd_used_bytes = mos->space; + zfs->mosdsldir.phys->dd_compressed_bytes = mos->space; + zfs->mosdsldir.phys->dd_uncompressed_bytes = mos->space; + + _objset_write(zfs, mos, c); +} + +static dnode_phys_t * +objset_dnode_bonus_alloc(zfs_objset_t *os, uint8_t type, uint8_t bonustype, + uint16_t bonuslen, uint64_t *idp) +{ + dnode_phys_t *dnode; + + assert(os->dnodenextfree < os->dnodecount); + assert(bonuslen <= DN_OLD_MAX_BONUSLEN); + + *idp = os->dnodenextfree; + dnode = &os->dnodes[os->dnodenextfree++]; + dnode->dn_type = type; + dnode->dn_indblkshift = MAXBLOCKSHIFT; + dnode->dn_datablkszsec = os->osblksz >> MINBLOCKSHIFT; + dnode->dn_nlevels = 1; + dnode->dn_nblkptr = 1; + dnode->dn_bonustype = bonustype; + dnode->dn_bonuslen = bonuslen; + dnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4; + dnode->dn_compress = ZIO_COMPRESS_OFF; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; + return (dnode); +} + +static dnode_phys_t * +objset_dnode_alloc(zfs_objset_t *os, uint8_t type, uint64_t *idp) +{ + return (objset_dnode_bonus_alloc(os, type, DMU_OT_NONE, 0, idp)); +} + +static dnode_phys_t * +objset_dnode_lookup(zfs_objset_t *os, uint64_t id) +{ + assert(id > 0 && id <= os->dnodecount); + + return (&os->dnodes[id]); +} + +static off_t +objset_space_alloc(zfs_opt_t *zfs, zfs_objset_t *os, off_t *lenp) +{ + off_t loc; + + loc = vdev_space_alloc(zfs, lenp); + os->space += *lenp; + return (loc); +} + +/* + * Return an allocated string containing the head dataset's mountpoint, + * including the root path prefix. + * + * If the dataset has a mountpoint property, it is returned. Otherwise we have + * to follow ZFS' inheritance rules. + */ +static char * +dsl_dir_get_mountpoint(zfs_opt_t *zfs, zfs_dsl_dir_t *dir) +{ + zfs_dsl_dir_t *pdir; + char *mountpoint, *origmountpoint; + + if (nvlist_find_string(dir->propsnv, "mountpoint", &mountpoint) == 0) { + if (strcmp(mountpoint, "none") == 0) + return (NULL); + + /* + * nvlist_find_string() does not make a copy. + */ + mountpoint = estrdup(mountpoint); + } else { + /* + * If we don't have a mountpoint, it's inherited from one of our + * ancestors. Walk up the hierarchy until we find it, building + * up our mountpoint along the way. The mountpoint property is + * always set for the root dataset. + */ + for (pdir = dir->parent, mountpoint = estrdup(dir->name);;) { + origmountpoint = mountpoint; + + if (nvlist_find_string(pdir->propsnv, "mountpoint", + &mountpoint) == 0) { + easprintf(&mountpoint, "%s%s%s", mountpoint, + mountpoint[strlen(mountpoint) - 1] == '/' ? + "" : "/", origmountpoint); + free(origmountpoint); + break; + } + + easprintf(&mountpoint, "%s/%s", pdir->name, + origmountpoint); + free(origmountpoint); + pdir = pdir->parent; + } + } + assert(mountpoint[0] == '/'); + assert(strstr(mountpoint, zfs->rootpath) == mountpoint); + + return (mountpoint); +} + +/* + * Handle dataset properties that we know about; stash them into an nvlist to be + * written later to the properties ZAP object. + * + * If the set of properties we handle grows too much, we should probably explore + * using libzfs to manage them. 
+ */ +static void +dsl_dir_set_prop(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, const char *key, + const char *val) +{ + nvlist_t *nvl; + + nvl = dir->propsnv; + if (val == NULL || val[0] == '\0') + errx(1, "missing value for property `%s'", key); + if (nvpair_find(nvl, key) != NULL) + errx(1, "property `%s' already set", key); + + if (strcmp(key, "mountpoint") == 0) { + if (strcmp(val, "none") != 0) { + if (val[0] != '/') + errx(1, "mountpoint `%s' is not absolute", val); + if (strcmp(val, zfs->rootpath) != 0 && + strcmp(zfs->rootpath, "/") != 0 && + (strstr(val, zfs->rootpath) != val || + val[strlen(zfs->rootpath)] != '/')) { + errx(1, "mountpoint `%s' is not prefixed by " + "the root path `%s'", val, zfs->rootpath); + } + } + nvlist_add_string(nvl, key, val); + } else if (strcmp(key, "atime") == 0 || strcmp(key, "exec") == 0 || + strcmp(key, "setuid") == 0) { + if (strcmp(val, "on") == 0) + nvlist_add_uint64(nvl, key, 1); + else if (strcmp(val, "off") == 0) + nvlist_add_uint64(nvl, key, 0); + else + errx(1, "invalid value `%s' for %s", val, key); + } else if (strcmp(key, "canmount") == 0) { + if (strcmp(val, "noauto") == 0) + nvlist_add_uint64(nvl, key, 2); + else if (strcmp(val, "on") == 0) + nvlist_add_uint64(nvl, key, 1); + else if (strcmp(val, "off") == 0) + nvlist_add_uint64(nvl, key, 0); + else + errx(1, "invalid value `%s' for %s", val, key); + } else { + errx(1, "unknown property `%s'", key); + } +} + +static void +dsl_init_metadir(zfs_opt_t *zfs, const char *name, zfs_dsl_dir_t *dir) +{ + char *path; + + easprintf(&path, "%s/%s", zfs->poolname, name); + dsl_dir_init(zfs, path, dir); + free(path); +} + +static void +dsl_init_origindir(zfs_opt_t *zfs) +{ + dnode_phys_t *clones; + uint64_t clonesid; + + dsl_init_metadir(zfs, "$ORIGIN", &zfs->origindsldir); + dsl_dataset_init(zfs, &zfs->origindsldir, &zfs->originds); + dsl_dataset_init(zfs, &zfs->origindsldir, &zfs->snapds); + + clones = objset_dnode_alloc(&zfs->mos, DMU_OT_DSL_CLONES, &clonesid); + zap_init(&zfs->cloneszap, &zfs->mos, clones); + zfs->origindsldir.phys->dd_clones = clonesid; +} + +static void +dsl_init(zfs_opt_t *zfs) +{ + zfs_dsl_dir_t *dir; + struct dataset_desc *d; + + dsl_dir_init(zfs, NULL, &zfs->rootdsldir); + + nvlist_add_uint64(zfs->rootdsldir.propsnv, "compression", + ZIO_COMPRESS_OFF); + + dsl_dataset_init(zfs, &zfs->rootdsldir, &zfs->rootds); + zfs->rootdsldir.headds = &zfs->rootds; + + dsl_init_metadir(zfs, "$MOS", &zfs->mosdsldir); + dsl_init_metadir(zfs, "$FREE", &zfs->freedsldir); + dsl_init_origindir(zfs); + + /* + * Go through the list of user-specified datasets and create DSL objects + * for them. + */ + STAILQ_FOREACH(d, &zfs->datasets, next) { + char *dsname, *params, *param, *nextparam; + + params = d->params; + dsname = strsep(¶ms, ":"); + + if (strcmp(dsname, zfs->poolname) == 0) { + /* + * This is the root dataset; it's already created, so + * we're just setting options. + */ + dir = &zfs->rootdsldir; + } else { + dir = ecalloc(1, sizeof(*dir)); + dsl_dir_init(zfs, dsname, dir); + dir->headds = ecalloc(1, sizeof(*dir->headds)); + dsl_dataset_init(zfs, dir, dir->headds); + } + + for (nextparam = param = params; nextparam != NULL;) { + char *key, *val; + + param = strsep(&nextparam, ":"); + + key = val = param; + key = strsep(&val, "="); + dsl_dir_set_prop(zfs, dir, key, val); + } + } + + /* + * Set the root dataset's mount point if the user didn't override the + * default. 
+ */ + if (nvpair_find(zfs->rootdsldir.propsnv, "mountpoint") == NULL) { + nvlist_add_string(zfs->rootdsldir.propsnv, "mountpoint", + zfs->rootpath); + } +} + +static void +dsl_dir_foreach_post(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, + void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg) +{ + zfs_dsl_dir_t *cdsldir; + + STAILQ_FOREACH(cdsldir, &dsldir->children, next) { + dsl_dir_foreach_post(zfs, cdsldir, cb, arg); + } + cb(zfs, dsldir, arg); +} + +/* + * Used when the caller doesn't care about the order one way or another. + */ +static void +dsl_dir_foreach(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, + void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg) +{ + dsl_dir_foreach_post(zfs, dsldir, cb, arg); +} + +/* + * Create a DSL directory, which is effectively an entry in the ZFS namespace. + * We always create a root DSL directory, whose name is the pool's name, and + * several metadata directories. + * + * Each directory has two ZAP objects, one pointing to child directories, and + * one for properties (which are inherited by children unless overridden). + * Directories typically reference a DSL dataset, the "head dataset", which + * points to an object set. + */ +static void +dsl_dir_init(zfs_opt_t *zfs, const char *name, zfs_dsl_dir_t *dsldir) +{ + zfs_dsl_dir_list_t l, *lp; + zfs_dsl_dir_t *parent; + zfs_objset_t *mos; + dnode_phys_t *dnode; + char *dirname, *nextdir, *origname; + uint64_t childid, propsid; + + mos = &zfs->mos; + + dnode = objset_dnode_bonus_alloc(mos, DMU_OT_DSL_DIR, DMU_OT_DSL_DIR, + sizeof(dsl_dir_phys_t), &dsldir->dirid); + dsldir->phys = (dsl_dir_phys_t *)DN_BONUS(dnode); + + dnode = objset_dnode_alloc(mos, DMU_OT_DSL_PROPS, &propsid); + zap_init(&dsldir->propszap, mos, dnode); + + dnode = objset_dnode_alloc(mos, DMU_OT_DSL_DIR_CHILD_MAP, &childid); + zap_init(&dsldir->childzap, mos, dnode); + + dsldir->propsnv = nvlist_create(NV_UNIQUE_NAME); + STAILQ_INIT(&dsldir->children); + + dsldir->phys->dd_child_dir_zapobj = childid; + dsldir->phys->dd_props_zapobj = propsid; + + if (name == NULL) { + /* + * This is the root DSL directory. + */ + assert(dsldir == &zfs->rootdsldir); + dsldir->name = estrdup(zfs->poolname); + dsldir->fullname = estrdup(zfs->poolname); + dsldir->parent = NULL; + dsldir->phys->dd_parent_obj = 0; + return; + } + + /* + * Insert the new directory into the hierarchy. Currently this must be + * done in order, e.g., when creating pool/a/b, pool/a must already + * exist. + */ + STAILQ_INIT(&l); + STAILQ_INSERT_HEAD(&l, &zfs->rootdsldir, next); + origname = dirname = nextdir = estrdup(name); + for (lp = &l;; lp = &parent->children) { + dirname = strsep(&nextdir, "/"); + if (nextdir == NULL) + break; + + STAILQ_FOREACH(parent, lp, next) { + if (strcmp(parent->name, dirname) == 0) + break; + } + if (parent == NULL) { + errx(1, "no parent at `%s' for filesystem `%s'", + dirname, name); + } + } + + dsldir->fullname = estrdup(name); + dsldir->name = estrdup(dirname); + free(origname); + STAILQ_INSERT_TAIL(lp, dsldir, next); + zap_add_uint64(&parent->childzap, dsldir->name, dsldir->dirid); + + dsldir->parent = parent; + dsldir->phys->dd_parent_obj = parent->dirid; +} + +/* + * Convert dataset properties into entries in the DSL directory's properties + * ZAP. 
+ */ +static void +dsl_dir_finalize_props(zfs_dsl_dir_t *dir) +{ + for (nvp_header_t *nvh = NULL; + (nvh = nvlist_next_nvpair(dir->propsnv, nvh)) != NULL;) { + nv_string_t *nvname; + nv_pair_data_t *nvdata; + const char *name; + + nvname = (nv_string_t *)(nvh + 1); + nvdata = (nv_pair_data_t *)(&nvname->nv_data[0] + + NV_ALIGN4(nvname->nv_size)); + + name = nvstring_get(nvname); + switch (nvdata->nv_type) { + case DATA_TYPE_UINT64: { + uint64_t val; + + memcpy(&val, &nvdata->nv_data[0], sizeof(uint64_t)); + zap_add_uint64(&dir->propszap, name, val); + break; + } + case DATA_TYPE_STRING: { + nv_string_t *nvstr; + + nvstr = (nv_string_t *)&nvdata->nv_data[0]; + zap_add_string(&dir->propszap, name, + nvstring_get(nvstr)); + break; + } + default: + assert(0); + } + } +} + +static void +dsl_dir_finalize(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, void *arg __unused) +{ + zfs_dsl_dir_t *cdir; + uint64_t bytes; + + dsl_dir_finalize_props(dir); + zap_write(zfs, &dir->propszap); + zap_write(zfs, &dir->childzap); + + if (dir->headds != NULL && dir->headds->os != NULL) { + char key[32]; + zfs_zap_t snapnameszap; + dnode_phys_t *snapnames; + zfs_dsl_dataset_t *headds; + zfs_objset_t *os; + uint64_t snapnamesid; + + headds = dir->headds; + os = headds->os; + + snapnames = objset_dnode_alloc(&zfs->mos, + DMU_OT_DSL_DS_SNAP_MAP, &snapnamesid); + zap_init(&snapnameszap, &zfs->mos, snapnames); + zap_write(zfs, &snapnameszap); + + dir->phys->dd_head_dataset_obj = headds->dsid; + dir->phys->dd_clone_parent_obj = zfs->snapds.dsid; + headds->phys->ds_prev_snap_obj = zfs->snapds.dsid; + headds->phys->ds_snapnames_zapobj = snapnamesid; + memcpy(&headds->phys->ds_bp, &os->osbp, sizeof(blkptr_t)); + + zfs->snapds.phys->ds_num_children++; + snprintf(key, sizeof(key), "%jx", (uintmax_t)headds->dsid); + zap_add_uint64(&zfs->cloneszap, key, headds->dsid); + + bytes = os->space; + headds->phys->ds_used_bytes = bytes; + /* XXX-MJ not sure what the difference is here... */ + headds->phys->ds_uncompressed_bytes = bytes; + headds->phys->ds_compressed_bytes = bytes; + + STAILQ_FOREACH(cdir, &dir->children, next) { + bytes += cdir->phys->dd_used_bytes; + } + dir->phys->dd_used_bytes = bytes; + dir->phys->dd_compressed_bytes = bytes; + dir->phys->dd_uncompressed_bytes = bytes; + } +} + +static void +dsl_write(zfs_opt_t *zfs) +{ + zfs_zap_t snapnameszap; + zfs_objset_t *mos; + dnode_phys_t *snapnames; + uint64_t snapmapid; + + mos = &zfs->mos; + + /* + * Perform accounting, starting from the leaves of the DSL directory + * tree. Accounting for $MOS is done later, once we've finished + * allocating space. 
+ */ + dsl_dir_foreach_post(zfs, &zfs->rootdsldir, dsl_dir_finalize, NULL); + + snapnames = objset_dnode_alloc(mos, DMU_OT_DSL_DS_SNAP_MAP, &snapmapid); + + zfs->origindsldir.phys->dd_head_dataset_obj = zfs->originds.dsid; + zfs->originds.phys->ds_prev_snap_obj = zfs->snapds.dsid; + zfs->originds.phys->ds_snapnames_zapobj = snapmapid; + zfs->snapds.phys->ds_next_snap_obj = zfs->originds.dsid; + assert(zfs->snapds.phys->ds_num_children > 0); + zfs->snapds.phys->ds_num_children++; + + zap_init(&snapnameszap, mos, snapnames); + zap_add_uint64(&snapnameszap, "$ORIGIN", zfs->snapds.dsid); + zap_write(zfs, &snapnameszap); + + zap_write(zfs, &zfs->cloneszap); +} + +static void +dsl_dataset_init(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, zfs_dsl_dataset_t *ds) +{ + zfs_zap_t deadlistzap; + dnode_phys_t *dnode; + uint64_t deadlistid; + + dnode = objset_dnode_bonus_alloc(&zfs->mos, DMU_OT_DSL_DATASET, + DMU_OT_DSL_DATASET, sizeof(dsl_dataset_phys_t), &ds->dsid); + ds->phys = (dsl_dataset_phys_t *)DN_BONUS(dnode); + + dnode = objset_dnode_bonus_alloc(&zfs->mos, DMU_OT_DEADLIST, + DMU_OT_DEADLIST_HDR, sizeof(dsl_deadlist_phys_t), &deadlistid); + zap_init(&deadlistzap, &zfs->mos, dnode); + zap_write(zfs, &deadlistzap); + + ds->phys->ds_dir_obj = dir->dirid; + ds->phys->ds_deadlist_obj = deadlistid; + ds->phys->ds_creation_txg = TXG_INITIAL - 1; + if (ds != &zfs->snapds) + ds->phys->ds_prev_snap_txg = TXG_INITIAL - 1; + + ds->dir = dir; +} + +static uint16_t +zap_entry_chunks(zfs_zap_entry_t *ent) +{ + return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) + + howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES)); +} + +static uint64_t +zap_hash(uint64_t salt, const char *name) +{ + static uint64_t crc64_table[256]; + const uint64_t crc64_poly = 0xC96C5795D7870F42UL; + const uint8_t *cp; + uint64_t crc; + uint8_t c; + + assert(salt != 0); + if (crc64_table[128] == 0) { + for (int i = 0; i < 256; i++) { + uint64_t *t; + + t = crc64_table + i; + *t = i; + for (int j = 8; j > 0; j--) + *t = (*t >> 1) ^ (-(*t & 1) & crc64_poly); + } + } + assert(crc64_table[128] == crc64_poly); + + for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++) + crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF]; + + /* + * Only use 28 bits, since we need 4 bits in the cookie for the + * collision differentiator. We MUST use the high bits, since + * those are the ones that we first pay attention to when + * choosing the bucket. + */ + crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + + return (crc); +} + +static void +zap_init(zfs_zap_t *zap, zfs_objset_t *os, dnode_phys_t *dnode) +{ + STAILQ_INIT(&zap->kvps); + zap->hashsalt = ((uint64_t)random() << 32) | random(); + zap->micro = true; + zap->kvpcnt = 0; + zap->chunks = 0; + zap->dnode = dnode; + zap->os = os; +} + +static void +zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt, + const uint8_t *val) +{ + zfs_zap_entry_t *ent; + + assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8); + assert(strlen(name) + 1 <= ZAP_MAXNAMELEN); + assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN); + + ent = ecalloc(1, sizeof(*ent)); + ent->name = estrdup(name); + ent->hash = zap_hash(zap->hashsalt, ent->name); + ent->intsz = intsz; + ent->intcnt = intcnt; + if (intsz == sizeof(uint64_t) && intcnt == 1) { + /* + * Micro-optimization to elide a memory allocation in that most + * common case where this is a directory entry. 
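+ * Here val64p presumably aliases the entry's embedded val64 field (the two
+ * share storage in zfs_zap_entry_t), so the memcpy() below lands in the
+ * entry itself and zap_write() can recognize this case and skip the free().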
+ */ + ent->val64p = &ent->val64; + } else { + ent->valp = ecalloc(intcnt, intsz); + } + memcpy(ent->valp, val, intcnt * intsz); + zap->kvpcnt++; + zap->chunks += zap_entry_chunks(ent); + STAILQ_INSERT_TAIL(&zap->kvps, ent, next); + + if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) || + strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX)) + zap->micro = false; +} + +static void +zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val) +{ + zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val); +} + +static void +zap_add_string(zfs_zap_t *zap, const char *name, const char *val) +{ + zap_add(zap, name, 1, strlen(val) + 1, val); +} + +static bool +zap_entry_exists(zfs_zap_t *zap, const char *name) +{ + zfs_zap_entry_t *ent; + + STAILQ_FOREACH(ent, &zap->kvps, next) { + if (strcmp(ent->name, name) == 0) + return (true); + } + return (false); +} + +static void +zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap) +{ + dnode_phys_t *dnode; + zfs_zap_entry_t *ent; + mzap_phys_t *mzap; + mzap_ent_phys_t *ment; + off_t bytes, loc; + + memset(zfs->filebuf, 0, sizeof(zfs->filebuf)); + mzap = (mzap_phys_t *)&zfs->filebuf[0]; + mzap->mz_block_type = ZBT_MICRO; + mzap->mz_salt = zap->hashsalt; + mzap->mz_normflags = 0; + + bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment); + assert(bytes <= (off_t)MZAP_MAX_BLKSZ); + + ment = &mzap->mz_chunk[0]; + STAILQ_FOREACH(ent, &zap->kvps, next) { + memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt); + ment->mze_cd = 0; /* XXX-MJ */ + strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name)); + ment++; + } + + loc = objset_space_alloc(zfs, zap->os, &bytes); + + dnode = zap->dnode; + dnode->dn_maxblkid = 0; + dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; + + vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc); +} + +/* + * Write some data to the fat ZAP leaf chunk starting at index "li". + * + * Note that individual integers in the value may be split among consecutive + * leaves. + */ +static void +zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz, + const uint8_t *val) +{ + struct zap_leaf_array *la; + + assert(sz <= ZAP_MAXVALUELEN); + + for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) { + n = MIN(resid, ZAP_LEAF_ARRAY_BYTES); + + la = &ZAP_LEAF_CHUNK(l, li).l_array; + assert(la->la_type == ZAP_CHUNK_FREE); + la->la_type = ZAP_CHUNK_ARRAY; + memcpy(la->la_array, val, n); + la->la_next = li + 1; + } + la->la_next = 0xffff; +} + +/* + * Find the shortest hash prefix length which lets us distribute keys without + * overflowing a leaf block. This is not (space) optimal, but is simple, and + * directories large enough to overflow a single 128KB leaf block are uncommon. + */ +static unsigned int +zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l) +{ + zfs_zap_entry_t *ent; + unsigned int prefixlen; + + if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) { + /* + * All chunks will fit in a single leaf block. + */ + return (0); + } + + for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) { + uint32_t *leafchunks; + + leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks)); + STAILQ_FOREACH(ent, &zap->kvps, next) { + uint64_t li; + uint16_t chunks; + + li = ZAP_HASH_IDX(ent->hash, prefixlen); + + chunks = zap_entry_chunks(ent); + if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) { + /* + * Not enough space, grow the prefix and retry. 
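+ * Each increment of prefixlen doubles the number of
+ * candidate leaves (1u << prefixlen), roughly halving the
+ * expected chunk load per leaf.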
+ */ + break; + } + leafchunks[li] += chunks; + } + free(leafchunks); + + if (ent == NULL) { + /* + * Everything fits, we're done. + */ + break; + } + } + + /* + * If this fails, then we need to expand the pointer table. For now + * this situation is unhandled since it is hard to trigger. + */ + assert(prefixlen < (unsigned int)l->l_bs); + + return (prefixlen); +} + +/* + * Initialize a fat ZAP leaf block. + */ +static void +zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen) +{ + zap_leaf_phys_t *leaf; + + leaf = l->l_phys; + + leaf->l_hdr.lh_block_type = ZBT_LEAF; + leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC; + leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); + leaf->l_hdr.lh_prefix = prefix; + leaf->l_hdr.lh_prefix_len = prefixlen; + + /* Initialize the leaf hash table. */ + assert(leaf->l_hdr.lh_nfree < 0xffff); + memset(leaf->l_hash, 0xff, + ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash)); + + /* Initialize the leaf chunks. */ + for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + struct zap_leaf_free *lf; + + lf = &ZAP_LEAF_CHUNK(l, i).l_free; + lf->lf_type = ZAP_CHUNK_FREE; + if (i + 1 == ZAP_LEAF_NUMCHUNKS(l)) + lf->lf_next = 0xffff; + else + lf->lf_next = i + 1; + } +} + +static void +zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap) +{ + struct dnode_cursor *c; + zap_leaf_t l; + zap_phys_t *zaphdr; + struct zap_table_phys *zt; + zfs_zap_entry_t *ent; + dnode_phys_t *dnode; + uint8_t *leafblks; + uint64_t lblkcnt, *ptrhasht; + off_t loc, blksz; + size_t blkshift; + unsigned int prefixlen; + int ptrcnt; + + /* + * For simplicity, always use the largest block size. This should be ok + * since most directories will be micro ZAPs, but it's space inefficient + * for small ZAPs and might need to be revisited. + */ + blkshift = MAXBLOCKSHIFT; + blksz = (off_t)1 << blkshift; + + /* + * Embedded pointer tables give up to 8192 entries. This ought to be + * enough for anything except massive directories. + */ + ptrcnt = (blksz / 2) / sizeof(uint64_t); + + memset(zfs->filebuf, 0, sizeof(zfs->filebuf)); + zaphdr = (zap_phys_t *)&zfs->filebuf[0]; + zaphdr->zap_block_type = ZBT_HEADER; + zaphdr->zap_magic = ZAP_MAGIC; + zaphdr->zap_num_entries = zap->kvpcnt; + zaphdr->zap_salt = zap->hashsalt; + + l.l_bs = blkshift; + l.l_phys = NULL; + + zt = &zaphdr->zap_ptrtbl; + zt->zt_blk = 0; + zt->zt_numblks = 0; + zt->zt_shift = flsl(ptrcnt) - 1; + zt->zt_nextblk = 0; + zt->zt_blks_copied = 0; + + /* + * How many leaf blocks do we need? Initialize them and update the + * header. + */ + prefixlen = zap_fat_write_prefixlen(zap, &l); + lblkcnt = 1 << prefixlen; + leafblks = ecalloc(lblkcnt, blksz); + for (unsigned int li = 0; li < lblkcnt; li++) { + l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz); + zap_fat_write_leaf_init(&l, li, prefixlen); + } + zaphdr->zap_num_leafs = lblkcnt; + zaphdr->zap_freeblk = lblkcnt + 1; + + /* + * For each entry, figure out which leaf block it belongs to based on + * the upper bits of its hash, allocate chunks from that leaf, and fill + * them out. 
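+ *
+ * As a sketch: with prefixlen == 2 there are four leaves, and an entry
+ * whose 64-bit hash has top bits 10 is assigned to leaf 2 by
+ * ZAP_HASH_IDX(ent->hash, prefixlen).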
+ */ + ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2); + STAILQ_FOREACH(ent, &zap->kvps, next) { + struct zap_leaf_entry *le; + uint16_t *lptr; + uint64_t hi, li; + uint16_t namelen, nchunks, nnamechunks, nvalchunks; + + hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift); + li = ZAP_HASH_IDX(ent->hash, prefixlen); + assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1); + ptrhasht[hi] = li + 1; + l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz); + + namelen = strlen(ent->name) + 1; + + /* + * How many leaf chunks do we need for this entry? + */ + nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES); + nvalchunks = howmany(ent->intcnt, + ZAP_LEAF_ARRAY_BYTES / ent->intsz); + nchunks = 1 + nnamechunks + nvalchunks; + + /* + * Allocate a run of free leaf chunks for this entry, + * potentially extending a hash chain. + */ + assert(l.l_phys->l_hdr.lh_nfree >= nchunks); + l.l_phys->l_hdr.lh_nfree -= nchunks; + l.l_phys->l_hdr.lh_nentries++; + lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash); + while (*lptr != 0xffff) { + assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l)); + le = ZAP_LEAF_ENTRY(&l, *lptr); + assert(le->le_type == ZAP_CHUNK_ENTRY); + le->le_cd++; + lptr = &le->le_next; + } + *lptr = l.l_phys->l_hdr.lh_freelist; + l.l_phys->l_hdr.lh_freelist += nchunks; + assert(l.l_phys->l_hdr.lh_freelist <= + ZAP_LEAF_NUMCHUNKS(&l)); + if (l.l_phys->l_hdr.lh_freelist == + ZAP_LEAF_NUMCHUNKS(&l)) + l.l_phys->l_hdr.lh_freelist = 0xffff; + + /* + * Integer values must be stored in big-endian format. + */ + switch (ent->intsz) { + case 1: + break; + case 2: + for (uint16_t *v = ent->val16p; + v - ent->val16p < (ptrdiff_t)ent->intcnt; + v++) + *v = htobe16(*v); + break; + case 4: + for (uint32_t *v = ent->val32p; + v - ent->val32p < (ptrdiff_t)ent->intcnt; + v++) + *v = htobe32(*v); + break; + case 8: + for (uint64_t *v = ent->val64p; + v - ent->val64p < (ptrdiff_t)ent->intcnt; + v++) + *v = htobe64(*v); + break; + default: + assert(0); + } + + /* + * Finally, write out the leaf chunks for this entry. + */ + le = ZAP_LEAF_ENTRY(&l, *lptr); + assert(le->le_type == ZAP_CHUNK_FREE); + le->le_type = ZAP_CHUNK_ENTRY; + le->le_next = 0xffff; + le->le_name_chunk = *lptr + 1; + le->le_name_numints = namelen; + le->le_value_chunk = *lptr + 1 + nnamechunks; + le->le_value_intlen = ent->intsz; + le->le_value_numints = ent->intcnt; + le->le_hash = ent->hash; + zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name); + zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks, + ent->intcnt * ent->intsz, ent->valp); + } + + /* + * Initialize unused slots of the pointer table. + */ + for (int i = 0; i < ptrcnt; i++) + if (ptrhasht[i] == 0) + ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1; + + /* + * Write the whole thing to disk. 
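+ *
+ * On disk the object looks like this: block 0 holds the ZAP header with
+ * the embedded pointer table in its upper half, and blocks 1 through
+ * lblkcnt hold the leaves, which is why the cursor below is sized for
+ * (lblkcnt + 1) * blksz bytes.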
+ */ + dnode = zap->dnode; + dnode->dn_nblkptr = 1; + dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; + dnode->dn_maxblkid = lblkcnt + 1; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; + + c = dnode_cursor_init(zfs, zap->os, zap->dnode, + (lblkcnt + 1) * blksz, blksz); + + loc = objset_space_alloc(zfs, zap->os, &blksz); + vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc, + dnode_cursor_next(zfs, c, 0)); + + for (uint64_t i = 0; i < lblkcnt; i++) { + loc = objset_space_alloc(zfs, zap->os, &blksz); + vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz, + blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz)); + } + + dnode_cursor_finish(zfs, c); + + free(leafblks); +} + +static void +zap_write(zfs_opt_t *zfs, zfs_zap_t *zap) +{ + zfs_zap_entry_t *ent; + + if (zap->micro) { + zap_micro_write(zfs, zap); + } else { + assert(!STAILQ_EMPTY(&zap->kvps)); + assert(zap->kvpcnt > 0); + zap_fat_write(zfs, zap); + } + + while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) { + STAILQ_REMOVE_HEAD(&zap->kvps, next); + if (ent->val64p != &ent->val64) + free(ent->valp); + free(ent->name); + free(ent); + } +} + +static nvlist_t * +pool_config_nvcreate(zfs_opt_t *zfs) +{ + nvlist_t *featuresnv, *poolnv; + + poolnv = nvlist_create(NV_UNIQUE_NAME); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG_INITIAL); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED); + nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->guid); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->guid); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->guid); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1); + + featuresnv = nvlist_create(NV_UNIQUE_NAME); + nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv); + nvlist_destroy(featuresnv); + + return (poolnv); +} + +static nvlist_t * +pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs) +{ + nvlist_t *diskvdevnv; + + assert(zfs->objarrid != 0); + + diskvdevnv = nvlist_create(NV_UNIQUE_NAME); + nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->guid); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0); + nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null"); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG_INITIAL); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY, + zfs->objarrid); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT, + zfs->msshift); + + return (diskvdevnv); +} + +static nvlist_t * +pool_root_vdev_config_nvcreate(zfs_opt_t *zfs) +{ + nvlist_t *diskvdevnv, *rootvdevnv; + + diskvdevnv = pool_disk_vdev_config_nvcreate(zfs); + rootvdevnv = nvlist_create(NV_UNIQUE_NAME); + + nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0); + nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->guid); + nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); + nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG_INITIAL); + nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv, + 1); + nvlist_destroy(diskvdevnv); + + return (rootvdevnv); +} + +/* + * Create the pool's "config" object, which contains an nvlist describing pool + * parameters and the vdev 
topology. It is similar but not identical to the + * nvlist stored in vdev labels. The main difference is that vdev labels do not + * describe the full vdev tree and in particular do not contain the "root" + * meta-vdev. + */ +static void +pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + dnode_phys_t *dnode; + nvlist_t *poolconfig, *vdevconfig; + zfs_objset_t *mos; + void *configbuf; + uint64_t dnid; + off_t configloc, configblksz; + int error; + + mos = &zfs->mos; + + dnode = objset_dnode_bonus_alloc(mos, DMU_OT_PACKED_NVLIST, + DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid); + + poolconfig = pool_config_nvcreate(zfs); + + vdevconfig = pool_root_vdev_config_nvcreate(zfs); + nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); + nvlist_destroy(vdevconfig); + + error = nvlist_export(poolconfig); + if (error != 0) + errc(1, error, "nvlist_export"); + + configblksz = nvlist_size(poolconfig); + configloc = objset_space_alloc(zfs, mos, &configblksz); + configbuf = ecalloc(1, configblksz); + nvlist_copy(poolconfig, configbuf, configblksz); + + vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc); + + dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; + *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig); + + zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid); + + nvlist_destroy(poolconfig); + free(configbuf); +} + +/* + * Add objects block pointer list objects, used for deferred frees. We don't do + * anything with them, but they need to be present or OpenZFS will refuse to + * import the pool. + */ +static void +pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir) +{ + zfs_objset_t *mos; + uint64_t dnid; + + mos = &zfs->mos; + + (void)objset_dnode_bonus_alloc(mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, + BPOBJ_SIZE_V2, &dnid); + zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid); + + (void)objset_dnode_bonus_alloc(mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, + BPOBJ_SIZE_V2, &dnid); + zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid); +} + +/* + * Add required feature metadata objects. We don't know anything about ZFS + * features, so the objects are just empty ZAPs. + */ +static void +pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + zfs_zap_t zap; + zfs_objset_t *mos; + dnode_phys_t *dnode; + uint64_t dnid; + + mos = &zfs->mos; + + dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid); + zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid); + zap_init(&zap, mos, dnode); + zap_write(zfs, &zap); + + dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid); + zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid); + zap_init(&zap, mos, dnode); + zap_write(zfs, &zap); + + dnode = objset_dnode_alloc(mos, DMU_OTN_ZAP_METADATA, &dnid); + zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid); + zap_init(&zap, mos, dnode); + zap_write(zfs, &zap); +} + +static void +pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + uint64_t id; + + id = zfs->rootdsldir.dirid; + assert(id > 0); + zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET, id); +} + +static void +pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + dnode_phys_t *dnode; + uint64_t id; + + dnode = objset_dnode_alloc(&zfs->mos, DMU_OT_POOL_PROPS, &id); + zap_init(&zfs->poolprops, &zfs->mos, dnode); + zap_add_uint64(objdir, DMU_POOL_PROPS, id); +} + +/* + * Initialize the MOS object directory, the root of virtually all of the pool's + * data and metadata. 
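+ *
+ * Consumers locate this ZAP at a fixed object number
+ * (DMU_POOL_DIRECTORY_OBJECT) and follow well-known keys such as
+ * DMU_POOL_CONFIG and DMU_POOL_ROOT_DATASET from there.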
+ */ +static void +pool_init_objdir(zfs_opt_t *zfs) +{ + zfs_zap_t zap; + dnode_phys_t *objdir; + + objdir = objset_dnode_lookup(&zfs->mos, DMU_POOL_DIRECTORY_OBJECT); + + zap_init(&zap, &zfs->mos, objdir); + pool_init_objdir_config(zfs, &zap); + pool_init_objdir_bplists(zfs, &zap); + pool_init_objdir_feature_maps(zfs, &zap); + pool_init_objdir_dsl(zfs, &zap); + pool_init_objdir_poolprops(zfs, &zap); + zap_write(zfs, &zap); +} + +/* + * Initialize the meta-object set and immediately write out several special + * objects whose contents are already finalized, including the object directory. + */ +static void +pool_init(zfs_opt_t *zfs) +{ + struct dataset_desc *d; + zfs_objset_t *mos; + uint64_t dnid, dnodecount; + + zfs->guid = 0xdeadfacec0debeef; + + mos = &zfs->mos; + + /* + * Figure out how many dnodes will be allocated from the MOS. + */ + dnodecount = 0; + dnodecount++; /* object directory (ZAP) */ + dnodecount++; /* |-> vdev config object (nvlist) */ + dnodecount++; /* |-> features for read */ + dnodecount++; /* |-> features for write */ + dnodecount++; /* |-> feature descriptions */ + dnodecount++; /* |-> sync bplist */ + dnodecount++; /* |-> free bplist */ + dnodecount++; /* |-> pool properties */ + dnodecount++; /* L-> root DSL directory */ + dnodecount++; /* |-> DSL child directory (ZAP) */ + dnodecount++; /* | |-> $MOS (DSL dir) */ + dnodecount++; /* | | |-> child map */ + dnodecount++; /* | | L-> props (ZAP) */ + dnodecount++; /* | |-> $FREE (DSL dir) */ + dnodecount++; /* | | |-> child map */ + dnodecount++; /* | | L-> props (ZAP) */ + dnodecount++; /* | L-> $ORIGIN (DSL dir) */ + dnodecount++; /* | |-> child map */ + dnodecount++; /* | |-> dataset */ + dnodecount++; /* | | L-> deadlist */ + dnodecount++; /* | |-> snapshot */ + dnodecount++; /* | | |-> deadlist */ + dnodecount++; /* | | L-> snapshot names */ + dnodecount++; /* | |-> props (ZAP) */ + dnodecount++; /* | L-> clones (ZAP) */ + dnodecount++; /* |-> DSL root dataset */ + dnodecount++; /* | |-> snapshot names */ + dnodecount++; /* | L-> deadlist */ + dnodecount++; /* L-> props (ZAP) */ + /* + * Space map stuff. + */ + dnodecount++; /* space map object array */ + dnodecount += zfs->mscount; /* space maps */ + /* + * Child datasets. + */ + STAILQ_FOREACH(d, &zfs->datasets, next) { + char buf[BUFSIZ]; + + /* Ugly hack to skip over root dataset parameters. */ + snprintf(buf, sizeof(buf), "%s:", zfs->poolname); + if (strncmp(buf, d->params, strlen(buf)) == 0) + continue; + + dnodecount++; /* DSL directory */ + dnodecount++; /* |-> DSL dataset */ + dnodecount++; /* | |-> snapshot names */ + dnodecount++; /* | L-> deadlist */ + dnodecount++; /* |-> child map */ + dnodecount++; /* |-> props */ + } + + objset_init(zfs, mos, DMU_OST_META, dnodecount); + + (void)objset_dnode_alloc(mos, DMU_OT_OBJECT_DIRECTORY, &dnid); + assert(dnid == DMU_POOL_DIRECTORY_OBJECT); + + (void)objset_dnode_alloc(mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid); + + dsl_init(zfs); + + pool_init_objdir(zfs); +} + +static void +pool_labels_write(zfs_opt_t *zfs) +{ + uberblock_t *ub; + vdev_label_t *label; + nvlist_t *poolconfig, *vdevconfig; + int error; + + label = ecalloc(1, sizeof(*label)); + + /* + * Assemble the vdev configuration and store it in the label. 
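+ * As noted in the comment above pool_init_objdir_config(), the label
+ * variant describes only the disk vdev; the "root" meta-vdev appears only
+ * in the MOS config object.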
+ */
+ poolconfig = pool_config_nvcreate(zfs);
+ vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
+ nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
+ nvlist_destroy(vdevconfig);
+
+ error = nvlist_export(poolconfig);
+ if (error != 0)
+ errc(1, error, "nvlist_export");
+ nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
+ sizeof(label->vl_vdev_phys.vp_nvlist));
+ nvlist_destroy(poolconfig);
+
+ /*
+ * Fill out the uberblock. Just make each one the same. The embedded
+ * checksum is calculated in vdev_label_write().
+ */
+ for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
+ uoff += (1 << zfs->ashift)) {
+ ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
+ ub->ub_magic = UBERBLOCK_MAGIC;
+ ub->ub_version = SPA_VERSION;
+ ub->ub_txg = TXG_INITIAL;
+ ub->ub_guid_sum = zfs->guid + zfs->guid; /* root + disk */
+ ub->ub_timestamp = 0; /* XXX-MJ */
+
+ ub->ub_software_version = SPA_VERSION;
+ ub->ub_mmp_magic = MMP_MAGIC;
+ ub->ub_mmp_delay = 0;
+ ub->ub_mmp_config = 0;
+ ub->ub_checkpoint_txg = 0;
+ memcpy(&ub->ub_rootbp, &zfs->mos.osbp, sizeof(blkptr_t));
+ }
+
+ /*
+ * Write out four copies of the label: two at the beginning of the vdev
+ * and two at the end.
+ */
+ for (int i = 0; i < VDEV_LABELS; i++)
+ vdev_label_write(zfs, i, label);
+
+ free(label);
+}
+
+static void
+pool_fini(zfs_opt_t *zfs)
+{
+ zap_write(zfs, &zfs->poolprops);
+ dsl_write(zfs);
+ objset_mos_write(zfs);
+ pool_labels_write(zfs);
+}
+
+/*
+ * Visit each node in a directory hierarchy, in pre-order depth-first order.
+ */
+static void
+fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
+{
+ assert(root->type == S_IFDIR);
+
+ for (fsnode *cur = root; cur != NULL; cur = cur->next) {
+ assert(cur->type == S_IFREG || cur->type == S_IFDIR ||
+ cur->type == S_IFLNK);
+
+ if (cb(cur, arg) == 0)
+ continue;
+ if (cur->type == S_IFDIR && cur->child != NULL)
+ fsnode_foreach(cur->child, cb, arg);
+ }
+}
+
+static bool
+fsnode_isroot(const fsnode *cur)
+{
+ return (strcmp(cur->name, ".") == 0);
+}
+
+static struct dnode_cursor *
+dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
+ off_t size, off_t blksz)
+{
+ struct dnode_cursor *c;
+ uint64_t nbppindir, indlevel, ndatablks, nindblks;
+
+ assert(dnode->dn_nblkptr == 1);
+ assert(blksz <= MAXBLOCKSIZE);
+
+ if (blksz == 0) {
+ /* Must be between 1<<ashift and 128KB. */
+ blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
+ powerof2(size) ? size : (1ul << flsl(size))));
+ }
+ assert(powerof2(blksz));
+
+ /*
+ * Do we need indirect blocks? Figure out how many levels are needed
+ * (indlevel == 1 means no indirect blocks) and how much space is needed
+ * (it has to be allocated up-front to break the dependency cycle
+ * described in objset_mos_write()).
+ */
+ ndatablks = size == 0 ? 0 : howmany(size, blksz);
+ nindblks = 0;
+ for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
+ nbppindir *= BLKPTR_PER_INDIR;
+ nindblks += howmany(ndatablks, indlevel * nbppindir);
+ }
+ assert(indlevel < INDIR_LEVELS);
+
+ dnode->dn_nlevels = (uint8_t)indlevel;
+ dnode->dn_maxblkid = ndatablks > 0 ?
ndatablks - 1 : 0; + dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; + + c = ecalloc(1, sizeof(*c)); + if (nindblks > 0) { + c->indspace = nindblks * MAXBLOCKSIZE; + c->indloc = objset_space_alloc(zfs, os, &c->indspace); + } + c->dnode = dnode; + c->dataoff = 0; + c->datablksz = blksz; + + return (c); +} + +static void +_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels) +{ + blkptr_t *bp, *pbp; + void *buf; + uint64_t fill; + off_t blkid, blksz, loc; + + assert(levels > 0); + assert(levels <= c->dnode->dn_nlevels - 1); + + blksz = MAXBLOCKSIZE; + blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR; + for (int level = 1; level <= levels; level++) { + buf = c->inddir[level - 1]; + + if (level == c->dnode->dn_nlevels - 1) { + pbp = &c->dnode->dn_blkptr[0]; + } else { + uint64_t iblkid; + + iblkid = blkid & (BLKPTR_PER_INDIR - 1); + pbp = (blkptr_t *) + &c->inddir[level][iblkid * sizeof(blkptr_t)]; + } + + /* + * Space for indirect blocks is allocated up-front; see the + * comment in objset_mos_write(). + */ + loc = c->indloc; + c->indloc += blksz; + assert(c->indspace >= blksz); + c->indspace -= blksz; + + bp = buf; + fill = 0; + for (size_t i = 0; i < BLKPTR_PER_INDIR; i++) + fill += BP_GET_FILL(&bp[i]); + + vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz, + loc, pbp); + memset(buf, 0, MAXBLOCKSIZE); + + blkid /= BLKPTR_PER_INDIR; + } +} + +static blkptr_t * +dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off) +{ + off_t blkid, l1id; + int levels; + + if (c->dnode->dn_nlevels == 1) { + assert(off < MAXBLOCKSIZE); + return (&c->dnode->dn_blkptr[0]); + } + + assert(off % c->datablksz == 0); + + /* Do we need to flush any full indirect blocks? */ + if (off > 0) { + blkid = off / c->datablksz; + for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) { + if (blkid % BLKPTR_PER_INDIR != 0) + break; + blkid /= BLKPTR_PER_INDIR; + } + if (levels > 0) + _dnode_cursor_flush(zfs, c, levels); + } + + c->dataoff = off; + l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1); + return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]); +} + +static void +dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c) +{ + int levels; + + levels = c->dnode->dn_nlevels - 1; + if (levels > 0) + _dnode_cursor_flush(zfs, c, levels); + assert(c->indspace == 0); + free(c); +} + +struct fs_populate_dir { + SLIST_ENTRY(fs_populate_dir) next; + int dirfd; + uint64_t objid; + zfs_zap_t zap; +}; + +struct fs_populate_arg { + zfs_opt_t *zfs; + zfs_fs_t *fs; /* owning filesystem */ + int dirfd; /* current directory fd */ + uint64_t rootdirid; /* root directory dnode ID */ + SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */ +}; + +static void +fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid) +{ + struct fs_populate_dir *dir; + uint64_t type; + + switch (cur->type) { + case S_IFREG: + type = DT_REG; + break; + case S_IFDIR: + type = DT_DIR; + break; + case S_IFLNK: + type = DT_LNK; + break; + default: + assert(0); + } + + dir = SLIST_FIRST(&arg->dirs); + zap_add_uint64(&dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid)); +} + +static void +fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind, + size_t *szp) +{ + assert(ind < fs->sacnt); + assert(fs->saoffs[ind] != 0xffff); + + memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size); + *szp += fs->satab[ind].size; +} + +static void +fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val, + size_t valsz, size_t varoff, uint16_t ind, 
size_t *szp) +{ + assert(ind < fs->sacnt); + assert(fs->saoffs[ind] != 0xffff); + assert(fs->satab[ind].size == 0); + + memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz); + *szp += valsz; +} + +static void +fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur, + dnode_phys_t *dnode) +{ + char target[PATH_MAX]; + zfs_fs_t *fs; + zfs_ace_hdr_t aces[3]; + struct stat *sb; + sa_hdr_phys_t *sahdr; + uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid; + char *attrbuf; + size_t bonussz, hdrsz; + int layout; + + assert(dnode->dn_bonustype == DMU_OT_SA); + assert(dnode->dn_nblkptr == 1); + + fs = arg->fs; + sb = &cur->inode->st; + + switch (cur->type) { + case S_IFREG: + layout = SA_LAYOUT_INDEX_DEFAULT; + links = cur->inode->nlink; + objsize = sb->st_size; + parent = SLIST_FIRST(&arg->dirs)->objid; + break; + case S_IFDIR: + layout = SA_LAYOUT_INDEX_DEFAULT; + links = 1; /* .. */ + objsize = 1; /* .. */ + + /* + * The size of a ZPL directory is the number of entries + * (including "." and ".."), and the link count is the number of + * entries which are directories (including "." and ".."). + */ + for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child; + c != NULL; c = c->next) { + if (c->type == S_IFDIR) + links++; + objsize++; + } + + /* The root directory is its own parent. */ + parent = SLIST_EMPTY(&arg->dirs) ? + arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid; + break; + case S_IFLNK: { + ssize_t n; + + if ((n = readlinkat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, + target, sizeof(target) - 1)) == -1) + err(1, "readlinkat(%s)", cur->name); + target[n] = '\0'; + + layout = SA_LAYOUT_INDEX_SYMLINK; + links = 1; + objsize = strlen(target); + parent = SLIST_FIRST(&arg->dirs)->objid; + break; + } + default: + assert(0); + } + + daclcount = nitems(aces); + flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED | + ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */ + gen = 1; + gid = sb->st_gid; + mode = sb->st_mode; + uid = sb->st_uid; + + /* XXX-MJ need to review these */ + memset(aces, 0, sizeof(aces)); + aces[0].z_flags = ACE_OWNER; + aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[0].z_access_mask = ACE_READ_DATA | ACE_WRITE_ATTRIBUTES | + ACE_WRITE_OWNER | ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | + ACE_READ_ACL | ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | + ACE_SYNCHRONIZE; + aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP; + aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[1].z_access_mask = ACE_READ_DATA | ACE_READ_ACL | + ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + aces[2].z_flags = ACE_EVERYONE; + aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[2].z_access_mask = ACE_READ_DATA | ACE_READ_ACL | + ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + + switch (layout) { + case SA_LAYOUT_INDEX_DEFAULT: + /* At most one variable-length attribute. */ + hdrsz = sizeof(uint64_t); + break; + case SA_LAYOUT_INDEX_SYMLINK: + /* At most five variable-length attributes. 
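+ * The arithmetic (assuming the standard sa_hdr_phys_t layout): the
+ * header holds sa_magic (4 bytes), sa_layout_info (2 bytes) and one
+ * uint16_t length per variable-size attribute, rounded up to a
+ * multiple of 8 bytes. So hdrsz == 8 leaves room for one length slot
+ * and hdrsz == 16 for five; this file only ever uses two of them
+ * (the ACL ACEs and the symlink target).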
*/ + hdrsz = sizeof(uint64_t) * 2; + break; + default: + assert(0); + } + + sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode); + sahdr->sa_magic = SA_MAGIC; + SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz); + + bonussz = SA_HDR_SIZE(sahdr); + attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr); + + fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz); + fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz); + fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz); + fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz); + fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz); + fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz); + fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz); + fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz); + fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz); + + /* + * We deliberately set atime = mtime here to ensure that images are + * reproducible. + */ + assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz); + assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz); + assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz); + assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz); + + fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0, + ZPL_DACL_ACES, &bonussz); + sahdr->sa_lengths[0] = sizeof(aces); + + if (cur->type == S_IFLNK) { + assert(layout == SA_LAYOUT_INDEX_SYMLINK); + /* Need to use a spill block pointer if the target is long. */ + assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN); + fs_populate_varszattr(fs, attrbuf, target, objsize, + sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz); + sahdr->sa_lengths[1] = (uint16_t)objsize; + } + + dnode->dn_bonuslen = bonussz; +} + +static void +fs_populate_file(fsnode *cur, struct fs_populate_arg *arg) +{ + struct dnode_cursor *c; + dnode_phys_t *dnode; + zfs_opt_t *zfs; + char *buf; + uint64_t dnid; + ssize_t n; + size_t bufsz; + off_t size, target; + int fd; + + assert(cur->type == S_IFREG); + assert((cur->inode->flags & FI_ROOT) == 0); + + zfs = arg->zfs; + + assert(cur->inode->ino != 0); + if ((cur->inode->flags & FI_ALLOCATED) != 0) { + /* + * This is a hard link of an existing file. + * + * XXX-MJ need to check whether it crosses datasets, add a test + * case for that + */ + fs_populate_dirent(arg, cur, cur->inode->ino); + return; + } + + dnode = objset_dnode_bonus_alloc(arg->fs->os, + DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); + cur->inode->ino = dnid; + cur->inode->flags |= FI_ALLOCATED; + + fd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, O_RDONLY); + if (fd == -1) + err(1, "openat(%s)", cur->name); + + buf = zfs->filebuf; + bufsz = sizeof(zfs->filebuf); + size = cur->inode->st.st_size; + c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0); + for (off_t foff = 0; foff < size; foff += target) { + off_t loc, sofar; + + /* Fill up our buffer, handling partial reads. 
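+ * read(2) may return fewer bytes than requested even on a
+ * regular file, so keep reading until "target" bytes have been
+ * buffered; the tail of the buffer is then zeroed so that any
+ * alignment padding written out with a short final block
+ * contains zeros rather than stale data.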
*/ + sofar = 0; + target = MIN(size - foff, (off_t)bufsz); + do { + n = read(fd, buf + sofar, target); + if (n < 0) + err(1, "reading from '%s'", cur->name); + if (n == 0) + errx(1, "unexpected EOF reading '%s'", + cur->name); + sofar += n; + } while (sofar < target); + + if (target < (off_t)bufsz) + memset(buf + target, 0, bufsz - target); + + loc = objset_space_alloc(zfs, arg->fs->os, &target); + vdev_pwrite_dnode_indir(zfs, c->dnode, 0, 1, buf, target, loc, + dnode_cursor_next(zfs, c, foff)); + } + if (close(fd) != 0) + err(1, "close"); + dnode_cursor_finish(zfs, c); + + fs_populate_sattrs(arg, cur, dnode); + fs_populate_dirent(arg, cur, dnid); +} + +static void +fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg) +{ + dnode_phys_t *dnode; + zfs_objset_t *os; + uint64_t dnid; + int dirfd; + + assert(cur->type == S_IFDIR); + assert((cur->inode->flags & FI_ALLOCATED) == 0); + + os = arg->fs->os; + + dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_SA, 0, &dnid); + + /* + * Add an entry to the parent directory and open this directory. + */ + if (!SLIST_EMPTY(&arg->dirs)) { + fs_populate_dirent(arg, cur, dnid); + dirfd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, + O_DIRECTORY); + if (dirfd < 0) + err(1, "open(%s)", cur->name); + } else { + arg->rootdirid = dnid; + dirfd = arg->dirfd; + } + + fs_populate_sattrs(arg, cur, dnode); + + /* + * If this is a root directory, then its children belong to a different + * dataset and this directory remains empty in the current objset. + */ + if ((cur->inode->flags & FI_ROOT) == 0) { + struct fs_populate_dir *dir; + + dir = ecalloc(1, sizeof(*dir)); + dir->dirfd = dirfd; + dir->objid = dnid; + zap_init(&dir->zap, os, dnode); + SLIST_INSERT_HEAD(&arg->dirs, dir, next); + } else { + zfs_zap_t dirzap; + + zap_init(&dirzap, os, dnode); + zap_write(arg->zfs, &dirzap); + + fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd); + } +} + +static void +fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg) +{ + dnode_phys_t *dnode; + uint64_t dnid; + + assert(cur->type == S_IFLNK); + assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0); + + dnode = objset_dnode_bonus_alloc(arg->fs->os, + DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); + + fs_populate_dirent(arg, cur, dnid); + + fs_populate_sattrs(arg, cur, dnode); +} + +static int +fs_foreach_populate(fsnode *cur, void *_arg) +{ + struct fs_populate_arg *arg; + struct fs_populate_dir *dir; + int ret; + + arg = _arg; + switch (cur->type) { + case S_IFREG: + fs_populate_file(cur, arg); + break; + case S_IFDIR: + if (fsnode_isroot(cur)) + break; + fs_populate_dir(cur, arg); + break; + case S_IFLNK: + fs_populate_symlink(cur, arg); + break; + default: + assert(0); + } + + ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1; + + if (cur->next == NULL && + (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) { + /* + * We reached a terminal node in a subtree. Walk back up and + * write out directories. We're done once we hit the root of a + * dataset or find a level where we're not on the edge of the + * tree. 
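+ *
+ * For example, after visiting the last entry of staged directory
+ * a/b/c (a hypothetical path), the ZAPs for c, then b, then a are
+ * written and their descriptors closed, stopping early if one of them
+ * is the root of another dataset.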
+ */ + do { + dir = SLIST_FIRST(&arg->dirs); + SLIST_REMOVE_HEAD(&arg->dirs, next); + zap_write(arg->zfs, &dir->zap); + if (dir->dirfd != -1 && close(dir->dirfd) != 0) + err(1, "close"); + free(dir); + cur = cur->parent; + } while (cur != NULL && cur->next == NULL && + (cur->inode->flags & FI_ROOT) == 0); + } + + return (ret); +} + +static void +fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index, + const sa_attr_type_t layout[], size_t sacnt) +{ + char ti[16]; + + assert(sizeof(layout[0]) == 2); + + snprintf(ti, sizeof(ti), "%u", index); + zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt, + (const uint8_t *)layout); +} + +/* + * Initialize system attribute tables. + * + * There are two elements to this. First, we write the zpl_attrs[] and + * zpl_attr_layout[] tables to disk. Then we create a lookup table which + * allows us to set file attributes quickly. + */ +static uint64_t +fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs) +{ + zfs_zap_t sazap, salzap, sarzap; + zfs_objset_t *os; + dnode_phys_t *saobj, *salobj, *sarobj; + uint64_t saobjid, salobjid, sarobjid; + uint16_t offset; + + os = fs->os; + + /* + * The on-disk tables are stored in two ZAP objects, the registry object + * and the layout object. Individual attributes are described by + * entries in the registry object; for example, the value for the + * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute. + * The attributes of a file are ordered according to one of the layouts + * defined in the layout object. The master node object is simply used + * to locate the registry and layout objects. + */ + saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid); + salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid); + sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid); + + zap_init(&sarzap, os, sarobj); + for (size_t i = 0; i < nitems(zpl_attrs); i++) { + const zfs_sattr_t *sa; + uint64_t attr; + + attr = 0; + sa = &zpl_attrs[i]; + SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs); + zap_add_uint64(&sarzap, sa->name, attr); + } + zap_write(zfs, &sarzap); + + /* + * Layouts are arrays of indices into the registry. We define two + * layouts for use by the ZPL, one for non-symlinks and one for + * symlinks. They are identical except that the symlink layout includes + * ZPL_SYMLINK as its final attribute. + */ + zap_init(&salzap, os, salobj); + assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK); + fs_add_zpl_attr_layout(&salzap, SA_LAYOUT_INDEX_DEFAULT, + zpl_attr_layout, nitems(zpl_attr_layout) - 1); + fs_add_zpl_attr_layout(&salzap, SA_LAYOUT_INDEX_SYMLINK, + zpl_attr_layout, nitems(zpl_attr_layout)); + zap_write(zfs, &salzap); + + zap_init(&sazap, os, saobj); + zap_add_uint64(&sazap, SA_LAYOUTS, salobjid); + zap_add_uint64(&sazap, SA_REGISTRY, sarobjid); + zap_write(zfs, &sazap); + + /* Sanity check. */ + for (size_t i = 0; i < nitems(zpl_attrs); i++) + assert(i == zpl_attrs[i].id); + + /* + * Build the offset table used when setting file attributes. File + * attributes are stored in the object's bonus buffer; this table + * provides the buffer offset of attributes referenced by the layout + * table. 
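+ *
+ * For instance, fs_populate_attr() copies a file's size directly to
+ * attrbuf + fs->saoffs[ZPL_SIZE], with no per-file table walk required.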
+ */ + fs->sacnt = nitems(zpl_attrs); + fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs)); + for (size_t i = 0; i < fs->sacnt; i++) + fs->saoffs[i] = 0xffff; + offset = 0; + for (size_t i = 0; i < nitems(zpl_attr_layout); i++) { + uint16_t size; + + assert(zpl_attr_layout[i] < fs->sacnt); + + fs->saoffs[zpl_attr_layout[i]] = offset; + size = zpl_attrs[zpl_attr_layout[i]].size; + offset += size; + } + fs->satab = zpl_attrs; + + return (saobjid); +} + +static void +fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg) +{ + char *mountpoint, *origmountpoint, *name, *next; + fsnode *cur, *root; + uint64_t canmount; + + if (dsldir->headds == NULL) + return; + + mountpoint = dsl_dir_get_mountpoint(zfs, dsldir); + if (mountpoint == NULL) + return; + if (nvlist_find_uint64(dsldir->propsnv, "canmount", &canmount) == 0 && + canmount == 0) + return; + + /* + * If we were asked to specify a bootfs, set it here. + */ + if (zfs->bootfs != NULL && strcmp(zfs->bootfs, dsldir->fullname) == 0) + zap_add_uint64(&zfs->poolprops, "bootfs", dsldir->headds->dsid); + + origmountpoint = mountpoint; + + /* + * Figure out which fsnode corresponds to our mountpoint. + */ + root = arg; + cur = root; + if (strcmp(mountpoint, zfs->rootpath) != 0) { + mountpoint += strlen(zfs->rootpath); + + /* + * Look up the directory in the staged tree. For example, if + * the dataset's mount point is /foo/bar/baz, we'll search the + * root directory for "foo", search "foo" for "baz", and so on. + * Each intermediate name must refer to a directory; the final + * component need not exist. + */ + cur = root; + for (next = name = mountpoint; next != NULL;) { + for (; *next == '/'; next++) + ; + name = strsep(&next, "/"); + + for (; cur != NULL && strcmp(cur->name, name) != 0; + cur = cur->next) + ; + if (cur == NULL) { + if (next == NULL) + break; + errx(1, "missing mountpoint directory for `%s'", + dsldir->fullname); + } + if (cur->type != S_IFDIR) { + errx(1, + "mountpoint for `%s' is not a directory", + dsldir->fullname); + } + if (next != NULL) + cur = cur->child; + } + } + + if (cur != NULL) { + assert(cur->type == S_IFDIR); + + /* + * Multiple datasets shouldn't share a mountpoint. It's + * technically allowed, but it's not clear what makefs should do + * in that case. + */ + assert((cur->inode->flags & FI_ROOT) == 0); + if (cur != root) + cur->inode->flags |= FI_ROOT; + assert(cur->inode->param == NULL); + cur->inode->param = dsldir; + } + + free(origmountpoint); +} + +static int +fs_foreach_count(fsnode *cur, void *arg) +{ + uint64_t *countp; + + countp = arg; + if (cur->type == S_IFDIR && fsnode_isroot(cur)) + return (1); + + if (cur->inode->ino == 0) { + cur->inode->ino = ++(*countp); + cur->inode->nlink = 1; + } else { + cur->inode->nlink++; + } + + return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1); +} + +/* + * Create a filesystem dataset. More specifically: + * - create an object set for the dataset, + * - add required metadata (SA tables, property definitions, etc.) to that + * object set, + * - optionally populate the object set with file objects, using "root" as the + * root directory. + * + * "dirfd" is a directory descriptor for the directory referenced by "root". It + * is closed before returning. 
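+ *
+ * When a dataset has no staged mountpoint, this is called with root == NULL
+ * and dirfd == -1, and a synthetic empty root directory is created instead
+ * (see the fakedroot handling below).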
+ */ +static void +fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd) +{ + struct fs_populate_arg arg; + zfs_fs_t fs; + zfs_zap_t deleteqzap, masterzap; + zfs_objset_t *os; + dnode_phys_t *deleteq, *masterobj; + uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid; + bool fakedroot; + + if (root != NULL) { + assert(root->type == S_IFDIR); + assert(fsnode_isroot(root)); + } + + os = ecalloc(1, sizeof(*os)); + + memset(&fs, 0, sizeof(fs)); + fs.os = os; + + /* + * This dataset's mountpoint doesn't exist in the staging tree. Fake up + * a root fsnode to handle this case. + */ + fakedroot = root == NULL; + if (fakedroot) { + struct stat *stp; + + assert(dirfd == -1); + + root = ecalloc(1, sizeof(*root)); + root->inode = ecalloc(1, sizeof(*root->inode)); + root->name = estrdup("."); + root->type = S_IFDIR; + + stp = &root->inode->st; + stp->st_uid = 0; + stp->st_gid = 0; + stp->st_mode = S_IFDIR | 0755; + } + + /* + * How many dnodes do we need? One for each file/directory/symlink plus + * several metadata objects. + */ + dnodecount = 1; /* root directory */ + fsnode_foreach(root, fs_foreach_count, &dnodecount); + dnodecount++; /* master object */ + dnodecount++; /* delete queue */ + dnodecount++; /* system attributes master node */ + dnodecount++; /* system attributes registry */ + dnodecount++; /* system attributes layout */ + + objset_init(zfs, os, DMU_OST_ZFS, dnodecount); + masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid); + assert(moid == MASTER_NODE_OBJ); + + /* + * Create the ZAP SA layout now since filesystem object dnodes will + * refer to those attributes. + */ + saobjid = fs_set_zpl_attrs(zfs, &fs); + + /* + * Populate the dataset with files from the staging directory. Most of + * our runtime is spent here. + */ + arg.dirfd = dirfd; + arg.zfs = zfs; + arg.fs = &fs; + SLIST_INIT(&arg.dirs); + fs_populate_dir(root, &arg); + assert(!SLIST_EMPTY(&arg.dirs)); + fsnode_foreach(root, fs_foreach_populate, &arg); + assert(SLIST_EMPTY(&arg.dirs)); + rootdirid = arg.rootdirid; + + /* + * Create an empty delete queue. We don't do anything with it, but + * OpenZFS will refuse to mount filesystems that don't have one. + */ + deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid); + zap_init(&deleteqzap, os, deleteq); + zap_write(zfs, &deleteqzap); + + /* + * Populate and write the master node object. This is a ZAP object + * containing various dataset properties and the object IDs of the root + * directory and delete queue. + */ + zap_init(&masterzap, os, masterobj); + zap_add_uint64(&masterzap, ZFS_ROOT_OBJ, rootdirid); + zap_add_uint64(&masterzap, ZFS_UNLINKED_SET, deleteqid); + zap_add_uint64(&masterzap, ZFS_SA_ATTRS, saobjid); + zap_add_uint64(&masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */); + zap_add_uint64(&masterzap, "normalization", 0 /* off */); + zap_add_uint64(&masterzap, "utf8only", 0 /* off */); + zap_add_uint64(&masterzap, "casesensitivity", 0 /* case sensitive */); + zap_add_uint64(&masterzap, "acltype", 2 /* NFSv4 */); + zap_write(zfs, &masterzap); + + /* + * All finished with this object set, we may as well write it now. + * The DSL layer will sum up the bytes consumed by each dataset using + * information stored in the object set, so it can't be freed just yet. 
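+ * (In particular, dsl_dir_finalize() later copies os->osbp into the head
+ * dataset's ds_bp and reads os->space for the usage accounting.)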
+ */ + assert(dsldir != NULL); + dsldir->headds->os = os; + objset_write(zfs, os); + + if (fakedroot) { + free(root->inode); + free(root->name); + free(root); + } + free(fs.saoffs); +} + +static void +fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused) +{ + if (dsldir->headds == NULL) + return; + if (dsldir->headds->os != NULL) + return; + fs_build_one(zfs, dsldir, NULL, -1); +} + +/* + * Create our datasets and populate them with files. + */ +static void +fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root) +{ + /* + * Run through our datasets and find the root fsnode for each one. Each + * root fsnode is flagged so that we can figure out which dataset it + * belongs to. + */ + dsl_dir_foreach(zfs, &zfs->rootdsldir, fs_layout_one, root); + + /* + * Did we find our boot filesystem? + */ + if (zfs->bootfs != NULL && !zap_entry_exists(&zfs->poolprops, "bootfs")) + errx(1, "no mounted dataset matches bootfs property `%s'", + zfs->bootfs); + + /* + * Traverse the file hierarchy starting from the root fsnode. One + * dataset, not necessarily the root dataset, must "own" the root + * directory by having its mountpoint be equal to the root path. + * + * As roots of other datasets are encountered during the traversal, + * fs_build_one() recursively creates the corresponding object sets and + * populates them. Once this function has returned, all datasets will + * have been fully populated. + */ + fs_build_one(zfs, root->inode->param, root, dirfd); + + /* + * Now create object sets for datasets whose mountpoints weren't found + * in the staging directory, either because there is no mountpoint, or + * because the mountpoint doesn't correspond to an existing directory. + */ + dsl_dir_foreach(zfs, &zfs->rootdsldir, fs_build_unmounted, NULL); +} + +/* + * The entry point to all other code in this file. + */ +void +zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + int dirfd; + + zfs = fsopts->fs_specific; + + /* + * Use a fixed seed to provide reproducible pseudo-random numbers for + * on-disk structures when needed (e.g., ZAP hash salts). + */ + srandom(1729); + + zfs_check_opts(fsopts); + + dirfd = open(dir, O_DIRECTORY | O_RDONLY); + if (dirfd < 0) + err(1, "open(%s)", dir); + + vdev_init(zfs, fsopts->maxsize, image); + pool_init(zfs); + fs_build(zfs, dirfd, root); + pool_fini(zfs); + vdev_fini(zfs); +} Index: usr.sbin/makefs/zfs/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/makefs/zfs/Makefile.inc @@ -0,0 +1,5 @@ +.PATH: ${SRCDIR}/zfs + +SRCS+= nvlist.c + +CFLAGS.nvlist.c+= -Wno-cast-qual Index: usr.sbin/makefs/zfs/nvlist.h =================================================================== --- /dev/null +++ usr.sbin/makefs/zfs/nvlist.h @@ -0,0 +1,167 @@ +/*- + * Copyright (c) 2012 Andriy Gapon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NVLIST_H_ +#define _NVLIST_H_ + +/* nvp implementation version */ +#define NV_VERSION 0 + +/* nvlist persistent unique name flags, stored in nvl_nvflags */ +#define NV_UNIQUE_NAME 0x1 +#define NV_UNIQUE_NAME_TYPE 0x2 + +#define NV_ALIGN4(x) (((x) + 3) & ~3) +#define NV_ALIGN(x) (((x) + 7) & ~7) + +/* nvlist pack encoding */ +#define NV_ENCODE_NATIVE 0 +#define NV_ENCODE_XDR 1 + +typedef enum { + DATA_TYPE_UNKNOWN = 0, + DATA_TYPE_BOOLEAN, + DATA_TYPE_BYTE, + DATA_TYPE_INT16, + DATA_TYPE_UINT16, + DATA_TYPE_INT32, + DATA_TYPE_UINT32, + DATA_TYPE_INT64, + DATA_TYPE_UINT64, + DATA_TYPE_STRING, + DATA_TYPE_BYTE_ARRAY, + DATA_TYPE_INT16_ARRAY, + DATA_TYPE_UINT16_ARRAY, + DATA_TYPE_INT32_ARRAY, + DATA_TYPE_UINT32_ARRAY, + DATA_TYPE_INT64_ARRAY, + DATA_TYPE_UINT64_ARRAY, + DATA_TYPE_STRING_ARRAY, + DATA_TYPE_HRTIME, + DATA_TYPE_NVLIST, + DATA_TYPE_NVLIST_ARRAY, + DATA_TYPE_BOOLEAN_VALUE, + DATA_TYPE_INT8, + DATA_TYPE_UINT8, + DATA_TYPE_BOOLEAN_ARRAY, + DATA_TYPE_INT8_ARRAY, + DATA_TYPE_UINT8_ARRAY +} data_type_t; + +/* + * nvlist header. + * nvlist has 4 bytes header followed by version and flags, then nvpairs + * and the list is terminated by double zero. + */ +typedef struct { + char nvh_encoding; + char nvh_endian; + char nvh_reserved1; + char nvh_reserved2; +} nvs_header_t; + +typedef struct { + nvs_header_t nv_header; + size_t nv_asize; + size_t nv_size; + uint8_t *nv_data; + uint8_t *nv_idx; +} nvlist_t; + +/* + * nvpair header. + * nvpair has encoded and decoded size + * name string (size and data) + * data type and number of elements + * data + */ +typedef struct { + unsigned encoded_size; + unsigned decoded_size; +} nvp_header_t; + +/* + * nvlist stream head. 
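+ * The encoded stream thus starts with the nvl_version and nvl_nvflag
+ * words, immediately followed by the first pair's encoded/decoded sizes;
+ * per the note above, a pair of zero size words terminates the list.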
+ */ +typedef struct { + unsigned nvl_version; + unsigned nvl_nvflag; + nvp_header_t nvl_pair; +} nvs_data_t; + +typedef struct { + unsigned nv_size; + uint8_t nv_data[]; /* NV_ALIGN4(string) */ +} nv_string_t; + +typedef struct { + unsigned nv_type; /* data_type_t */ + unsigned nv_nelem; /* number of elements */ + uint8_t nv_data[]; /* data stream */ +} nv_pair_data_t; + +nvlist_t *nvlist_create(int); +void nvlist_destroy(nvlist_t *); +nvlist_t *nvlist_import(const char *, size_t); +int nvlist_export(nvlist_t *); +int nvlist_remove(nvlist_t *, const char *, data_type_t); +int nvpair_type_from_name(const char *); +nvp_header_t *nvpair_find(nvlist_t *, const char *); +void nvpair_print(nvp_header_t *, unsigned int); +void nvlist_print(const nvlist_t *, unsigned int); +char *nvstring_get(nv_string_t *); +int nvlist_find(const nvlist_t *, const char *, data_type_t, + int *, void *, int *); +nvp_header_t *nvlist_next_nvpair(nvlist_t *, nvp_header_t *); + +int nvlist_add_boolean_value(nvlist_t *, const char *, bool); +int nvlist_add_byte(nvlist_t *, const char *, uint8_t); +int nvlist_add_int8(nvlist_t *, const char *, int8_t); +int nvlist_add_uint8(nvlist_t *, const char *, uint8_t); +int nvlist_add_int16(nvlist_t *, const char *, int16_t); +int nvlist_add_uint16(nvlist_t *, const char *, uint16_t); +int nvlist_add_int32(nvlist_t *, const char *, int32_t); +int nvlist_add_uint32(nvlist_t *, const char *, uint32_t); +int nvlist_add_int64(nvlist_t *, const char *, int64_t); +int nvlist_add_uint64(nvlist_t *, const char *, uint64_t); +int nvlist_add_string(nvlist_t *, const char *, const char *); +int nvlist_add_boolean_array(nvlist_t *, const char *, bool *, uint32_t); +int nvlist_add_byte_array(nvlist_t *, const char *, uint8_t *, uint32_t); +int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint32_t); +int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint32_t); +int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint32_t); +int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint32_t); +int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint32_t); +int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint32_t); +int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint32_t); +int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint32_t); +int nvlist_add_string_array(nvlist_t *, const char *, char * const *, uint32_t); +int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint32_t); + +#endif /* !_NVLIST_H_ */ Index: usr.sbin/makefs/zfs/nvlist.c =================================================================== --- /dev/null +++ usr.sbin/makefs/zfs/nvlist.c @@ -0,0 +1,1699 @@ +/*- + * Copyright 2020 Toomas Soome + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "zfs/nvlist.h"
+
+enum xdr_op {
+	XDR_OP_ENCODE = 1,
+	XDR_OP_DECODE = 2
+};
+
+typedef struct xdr {
+	enum xdr_op xdr_op;
+	int (*xdr_getint)(struct xdr *, int *);
+	int (*xdr_putint)(struct xdr *, int);
+	int (*xdr_getuint)(struct xdr *, unsigned *);
+	int (*xdr_putuint)(struct xdr *, unsigned);
+	const uint8_t *xdr_buf;
+	uint8_t *xdr_idx;
+	size_t xdr_buf_size;
+} xdr_t;
+
+static int nvlist_xdr_nvlist(xdr_t *, nvlist_t *);
+static bool nvlist_size_xdr(xdr_t *, size_t *);
+static bool nvlist_size_native(xdr_t *, size_t *);
+static bool xdr_int(xdr_t *, int *);
+static bool xdr_u_int(xdr_t *, unsigned *);
+
+typedef bool (*xdrproc_t)(xdr_t *, void *);
+
+/* Basic primitives for XDR translation operations, getint and putint. */
+static int
+_getint(struct xdr *xdr, int *ip)
+{
+	*ip = be32dec(xdr->xdr_idx);
+	return (sizeof(int));
+}
+
+static int
+_putint(struct xdr *xdr, int i)
+{
+	int *ip = (int *)xdr->xdr_idx;
+
+	*ip = htobe32(i);
+	return (sizeof(int));
+}
+
+static int
+_getuint(struct xdr *xdr, unsigned *ip)
+{
+	*ip = be32dec(xdr->xdr_idx);
+	return (sizeof(unsigned));
+}
+
+static int
+_putuint(struct xdr *xdr, unsigned i)
+{
+	unsigned *up = (unsigned *)xdr->xdr_idx;
+
+	*up = htobe32(i);
+	return (sizeof(int));
+}
+
+static int
+_getint_mem(struct xdr *xdr, int *ip)
+{
+	*ip = *(int *)xdr->xdr_idx;
+	return (sizeof(int));
+}
+
+static int
+_putint_mem(struct xdr *xdr, int i)
+{
+	int *ip = (int *)xdr->xdr_idx;
+
+	*ip = i;
+	return (sizeof(int));
+}
+
+static int
+_getuint_mem(struct xdr *xdr, unsigned *ip)
+{
+	*ip = *(unsigned *)xdr->xdr_idx;
+	return (sizeof(unsigned));
+}
+
+static int
+_putuint_mem(struct xdr *xdr, unsigned i)
+{
+	unsigned *up = (unsigned *)xdr->xdr_idx;
+
+	*up = i;
+	return (sizeof(int));
+}
+
+/*
+ * XDR data translations.
+ */
+static bool
+xdr_short(xdr_t *xdr, short *ip)
+{
+	int i;
+	bool rv;
+
+	i = *ip;
+	if ((rv = xdr_int(xdr, &i))) {
+		if (xdr->xdr_op == XDR_OP_DECODE)
+			*ip = i;
+	}
+	return (rv);
+}
+
+static bool
+xdr_u_short(xdr_t *xdr, unsigned short *ip)
+{
+	unsigned u;
+	bool rv;
+
+	u = *ip;
+	if ((rv = xdr_u_int(xdr, &u))) {
+		if (xdr->xdr_op == XDR_OP_DECODE)
+			*ip = u;
+	}
+	return (rv);
+}
+
+/*
+ * Translate the value at xdr->xdr_idx and advance the index by the size
+ * of an int.
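+ * On encode the value is written out via xdr_putint; on decode it is read
+ * back via xdr_getint. Returns false if the access would run past the
+ * end of the buffer.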
+ */ +static bool +xdr_int(xdr_t *xdr, int *ip) +{ + bool rv = false; + int *i = (int *)xdr->xdr_idx; + + if (xdr->xdr_idx + sizeof(int) > xdr->xdr_buf + xdr->xdr_buf_size) + return (rv); + + switch (xdr->xdr_op) { + case XDR_OP_ENCODE: + /* Encode value *ip, store to buf */ + xdr->xdr_idx += xdr->xdr_putint(xdr, *ip); + rv = true; + break; + + case XDR_OP_DECODE: + /* Decode buf, return value to *ip */ + xdr->xdr_idx += xdr->xdr_getint(xdr, i); + *ip = *i; + rv = true; + break; + } + return (rv); +} + +/* + * translate xdr->xdr_idx, increment it by size of unsigned int. + */ +static bool +xdr_u_int(xdr_t *xdr, unsigned *ip) +{ + bool rv = false; + unsigned *u = (unsigned *)xdr->xdr_idx; + + if (xdr->xdr_idx + sizeof(unsigned) > xdr->xdr_buf + xdr->xdr_buf_size) + return (rv); + + switch (xdr->xdr_op) { + case XDR_OP_ENCODE: + /* Encode value *ip, store to buf */ + xdr->xdr_idx += xdr->xdr_putuint(xdr, *ip); + rv = true; + break; + + case XDR_OP_DECODE: + /* Decode buf, return value to *ip */ + xdr->xdr_idx += xdr->xdr_getuint(xdr, u); + *ip = *u; + rv = true; + break; + } + return (rv); +} + +static bool +xdr_int64(xdr_t *xdr, int64_t *lp) +{ + bool rv = false; + + if (xdr->xdr_idx + sizeof(int64_t) > xdr->xdr_buf + xdr->xdr_buf_size) + return (rv); + + switch (xdr->xdr_op) { + case XDR_OP_ENCODE: + /* Encode value *lp, store to buf */ + if (xdr->xdr_putint == _putint) + *(int64_t *)xdr->xdr_idx = htobe64(*lp); + else + *(int64_t *)xdr->xdr_idx = *lp; + xdr->xdr_idx += sizeof(int64_t); + rv = true; + break; + + case XDR_OP_DECODE: + /* Decode buf, return value to *ip */ + if (xdr->xdr_getint == _getint) + *lp = be64toh(*(int64_t *)xdr->xdr_idx); + else + *lp = *(int64_t *)xdr->xdr_idx; + xdr->xdr_idx += sizeof(int64_t); + rv = true; + } + return (rv); +} + +static bool +xdr_uint64(xdr_t *xdr, uint64_t *lp) +{ + bool rv = false; + + if (xdr->xdr_idx + sizeof(uint64_t) > xdr->xdr_buf + xdr->xdr_buf_size) + return (rv); + + switch (xdr->xdr_op) { + case XDR_OP_ENCODE: + /* Encode value *ip, store to buf */ + if (xdr->xdr_putint == _putint) + *(uint64_t *)xdr->xdr_idx = htobe64(*lp); + else + *(uint64_t *)xdr->xdr_idx = *lp; + xdr->xdr_idx += sizeof(uint64_t); + rv = true; + break; + + case XDR_OP_DECODE: + /* Decode buf, return value to *ip */ + if (xdr->xdr_getuint == _getuint) + *lp = be64toh(*(uint64_t *)xdr->xdr_idx); + else + *lp = *(uint64_t *)xdr->xdr_idx; + xdr->xdr_idx += sizeof(uint64_t); + rv = true; + } + return (rv); +} + +static bool +xdr_char(xdr_t *xdr, char *cp) +{ + int i; + bool rv = false; + + i = *cp; + if ((rv = xdr_int(xdr, &i))) { + if (xdr->xdr_op == XDR_OP_DECODE) + *cp = i; + } + return (rv); +} + +static bool +xdr_string(xdr_t *xdr, nv_string_t *s) +{ + int size = 0; + bool rv = false; + + switch (xdr->xdr_op) { + case XDR_OP_ENCODE: + size = s->nv_size; + if (xdr->xdr_idx + sizeof(unsigned) + NV_ALIGN4(size) > + xdr->xdr_buf + xdr->xdr_buf_size) + break; + xdr->xdr_idx += xdr->xdr_putuint(xdr, s->nv_size); + xdr->xdr_idx += NV_ALIGN4(size); + rv = true; + break; + + case XDR_OP_DECODE: + if (xdr->xdr_idx + sizeof(unsigned) > + xdr->xdr_buf + xdr->xdr_buf_size) + break; + size = xdr->xdr_getuint(xdr, &s->nv_size); + size = NV_ALIGN4(size + s->nv_size); + if (xdr->xdr_idx + size > xdr->xdr_buf + xdr->xdr_buf_size) + break; + xdr->xdr_idx += size; + rv = true; + break; + } + return (rv); +} + +static bool +xdr_array(xdr_t *xdr, const unsigned nelem, const xdrproc_t elproc) +{ + bool rv = true; + unsigned c = nelem; + + if (!xdr_u_int(xdr, &c)) + return (false); + + 
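/*
+	 * The count word has been translated above; now translate each
+	 * element in place, advancing xdr_idx.
+	 */
+	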
for (unsigned i = 0; i < nelem; i++) { + if (!elproc(xdr, xdr->xdr_idx)) + return (false); + } + return (rv); +} + +/* + * nvlist management functions. + */ +void +nvlist_destroy(nvlist_t *nvl) +{ + if (nvl != NULL) { + /* Free data if it was allocated by us. */ + if (nvl->nv_asize > 0) + free(nvl->nv_data); + } + free(nvl); +} + +char * +nvstring_get(nv_string_t *nvs) +{ + char *s; + + s = malloc(nvs->nv_size + 1); + if (s != NULL) { + bcopy(nvs->nv_data, s, nvs->nv_size); + s[nvs->nv_size] = '\0'; + } + return (s); +} + +/* + * Create empty nvlist. + * The nvlist is terminated by 2x zeros (8 bytes). + */ +nvlist_t * +nvlist_create(int flag) +{ + nvlist_t *nvl; + nvs_data_t *nvs; + + nvl = calloc(1, sizeof(*nvl)); + if (nvl == NULL) + return (nvl); + + nvl->nv_header.nvh_encoding = NV_ENCODE_XDR; + nvl->nv_header.nvh_endian = _BYTE_ORDER == _LITTLE_ENDIAN; + + nvl->nv_asize = nvl->nv_size = sizeof(*nvs); + nvs = calloc(1, nvl->nv_asize); + if (nvs == NULL) { + free(nvl); + return (NULL); + } + /* data in nvlist is byte stream */ + nvl->nv_data = (uint8_t *)nvs; + + nvs->nvl_version = NV_VERSION; + nvs->nvl_nvflag = flag; + return (nvl); +} + +static bool +nvlist_xdr_nvp(xdr_t *xdr, nvlist_t *nvl) +{ + nv_string_t *nv_string; + nv_pair_data_t *nvp_data; + nvlist_t nvlist; + unsigned type, nelem; + xdr_t nv_xdr; + + nv_string = (nv_string_t *)xdr->xdr_idx; + if (!xdr_string(xdr, nv_string)) { + return (false); + } + nvp_data = (nv_pair_data_t *)xdr->xdr_idx; + + type = nvp_data->nv_type; + nelem = nvp_data->nv_nelem; + if (!xdr_u_int(xdr, &type) || !xdr_u_int(xdr, &nelem)) + return (false); + + switch (type) { + case DATA_TYPE_NVLIST: + case DATA_TYPE_NVLIST_ARRAY: + bzero(&nvlist, sizeof(nvlist)); + nvlist.nv_data = xdr->xdr_idx; + nvlist.nv_idx = nvlist.nv_data; + + /* Set up xdr for this nvlist. 
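The embedded
+		 * list shares the parent buffer; its size is measured first so
+		 * that nested translation knows where each sublist ends.
+		 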
*/ + nv_xdr = *xdr; + nv_xdr.xdr_buf = nvlist.nv_data; + nv_xdr.xdr_idx = nvlist.nv_data; + nv_xdr.xdr_buf_size = + nvl->nv_data + nvl->nv_size - nvlist.nv_data; + + for (unsigned i = 0; i < nelem; i++) { + if (xdr->xdr_op == XDR_OP_ENCODE) { + if (!nvlist_size_native(&nv_xdr, + &nvlist.nv_size)) + return (false); + } else { + if (!nvlist_size_xdr(&nv_xdr, + &nvlist.nv_size)) + return (false); + } + if (nvlist_xdr_nvlist(xdr, &nvlist) != 0) + return (false); + + nvlist.nv_data = nv_xdr.xdr_idx; + nvlist.nv_idx = nv_xdr.xdr_idx; + + nv_xdr.xdr_buf = nv_xdr.xdr_idx; + nv_xdr.xdr_buf_size = + nvl->nv_data + nvl->nv_size - nvlist.nv_data; + } + break; + + case DATA_TYPE_BOOLEAN: + /* BOOLEAN does not take value space */ + break; + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + if (!xdr_char(xdr, (char *)&nvp_data->nv_data[0])) + return (false); + break; + + case DATA_TYPE_INT16: + if (!xdr_short(xdr, (short *)&nvp_data->nv_data[0])) + return (false); + break; + + case DATA_TYPE_UINT16: + if (!xdr_u_short(xdr, (unsigned short *)&nvp_data->nv_data[0])) + return (false); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_INT32: + if (!xdr_int(xdr, (int *)&nvp_data->nv_data[0])) + return (false); + break; + + case DATA_TYPE_UINT32: + if (!xdr_u_int(xdr, (unsigned *)&nvp_data->nv_data[0])) + return (false); + break; + + case DATA_TYPE_HRTIME: + case DATA_TYPE_INT64: + if (!xdr_int64(xdr, (int64_t *)&nvp_data->nv_data[0])) + return (false); + break; + + case DATA_TYPE_UINT64: + if (!xdr_uint64(xdr, (uint64_t *)&nvp_data->nv_data[0])) + return (false); + break; + + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_STRING: + nv_string = (nv_string_t *)&nvp_data->nv_data[0]; + if (!xdr_string(xdr, nv_string)) + return (false); + break; + + case DATA_TYPE_STRING_ARRAY: + nv_string = (nv_string_t *)&nvp_data->nv_data[0]; + for (unsigned i = 0; i < nelem; i++) { + if (!xdr_string(xdr, nv_string)) + return (false); + nv_string = (nv_string_t *)xdr->xdr_idx; + } + break; + + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + if (!xdr_array(xdr, nelem, (xdrproc_t)xdr_u_int)) + return (false); + break; + + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: + if (!xdr_array(xdr, nelem, (xdrproc_t)xdr_uint64)) + return (false); + break; + } + return (true); +} + +static int +nvlist_xdr_nvlist(xdr_t *xdr, nvlist_t *nvl) +{ + nvp_header_t *nvph; + nvs_data_t *nvs; + unsigned encoded_size, decoded_size; + int rv; + + nvs = (nvs_data_t *)xdr->xdr_idx; + nvph = &nvs->nvl_pair; + + if (!xdr_u_int(xdr, &nvs->nvl_version)) + return (EINVAL); + if (!xdr_u_int(xdr, &nvs->nvl_nvflag)) + return (EINVAL); + + encoded_size = nvph->encoded_size; + decoded_size = nvph->decoded_size; + + if (xdr->xdr_op == XDR_OP_ENCODE) { + if (!xdr_u_int(xdr, &nvph->encoded_size)) + return (EINVAL); + if (!xdr_u_int(xdr, &nvph->decoded_size)) + return (EINVAL); + } else { + xdr->xdr_idx += 2 * sizeof(unsigned); + } + + rv = 0; + while (encoded_size && decoded_size) { + if (!nvlist_xdr_nvp(xdr, nvl)) + return (EINVAL); + + nvph = (nvp_header_t *)(xdr->xdr_idx); + encoded_size = nvph->encoded_size; + decoded_size = nvph->decoded_size; + if (xdr->xdr_op == XDR_OP_ENCODE) { + if (!xdr_u_int(xdr, &nvph->encoded_size)) + return (EINVAL); + if (!xdr_u_int(xdr, &nvph->decoded_size)) + return (EINVAL); + } else { + xdr->xdr_idx += 2 * sizeof(unsigned); + } + 
}
+	return (rv);
+}
+
+/*
+ * Calculate nvlist size, translating encoded_size and decoded_size.
+ */
+static bool
+nvlist_size_xdr(xdr_t *xdr, size_t *size)
+{
+	uint8_t *pair;
+	unsigned encoded_size, decoded_size;
+
+	xdr->xdr_idx += 2 * sizeof(unsigned);
+
+	pair = xdr->xdr_idx;
+	if (!xdr_u_int(xdr, &encoded_size) || !xdr_u_int(xdr, &decoded_size))
+		return (false);
+
+	while (encoded_size && decoded_size) {
+		xdr->xdr_idx = pair + encoded_size;
+		pair = xdr->xdr_idx;
+		if (!xdr_u_int(xdr, &encoded_size) ||
+		    !xdr_u_int(xdr, &decoded_size))
+			return (false);
+	}
+	*size = xdr->xdr_idx - xdr->xdr_buf;
+
+	return (true);
+}
+
+nvp_header_t *
+nvlist_next_nvpair(nvlist_t *nvl, nvp_header_t *nvh)
+{
+	uint8_t *pair;
+	unsigned encoded_size, decoded_size;
+	xdr_t xdr;
+
+	if (nvl == NULL)
+		return (NULL);
+
+	xdr.xdr_buf = nvl->nv_data;
+	xdr.xdr_idx = nvl->nv_data;
+	xdr.xdr_buf_size = nvl->nv_size;
+
+	xdr.xdr_idx += 2 * sizeof(unsigned);
+
+	/* Skip to the current pair */
+	if (nvh != NULL) {
+		xdr.xdr_idx = (uint8_t *)nvh;
+	}
+
+	pair = xdr.xdr_idx;
+	if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+		return (NULL);
+
+	encoded_size = *(unsigned *)xdr.xdr_idx;
+	xdr.xdr_idx += sizeof(unsigned);
+	if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+		return (NULL);
+
+	decoded_size = *(unsigned *)xdr.xdr_idx;
+	xdr.xdr_idx += sizeof(unsigned);
+	if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+		return (NULL);
+
+	while (encoded_size && decoded_size) {
+		if (nvh == NULL)
+			return ((nvp_header_t *)pair);
+
+		xdr.xdr_idx = pair + encoded_size;
+		nvh = (nvp_header_t *)xdr.xdr_idx;
+
+		if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+			return (NULL);
+
+		encoded_size = *(unsigned *)xdr.xdr_idx;
+		xdr.xdr_idx += sizeof(unsigned);
+		if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+			return (NULL);
+		decoded_size = *(unsigned *)xdr.xdr_idx;
+		xdr.xdr_idx += sizeof(unsigned);
+		if (xdr.xdr_idx > xdr.xdr_buf + xdr.xdr_buf_size)
+			return (NULL);
+
+		if (encoded_size != 0 && decoded_size != 0) {
+			return (nvh);
+		}
+	}
+	return (NULL);
+}
+
+/*
+ * Calculate nvlist size by walking the in-memory data.
+ */
+static bool
+nvlist_size_native(xdr_t *xdr, size_t *size)
+{
+	uint8_t *pair;
+	unsigned encoded_size, decoded_size;
+
+	xdr->xdr_idx += 2 * sizeof(unsigned);
+
+	pair = xdr->xdr_idx;
+	if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+		return (false);
+
+	encoded_size = *(unsigned *)xdr->xdr_idx;
+	xdr->xdr_idx += sizeof(unsigned);
+	if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+		return (false);
+	decoded_size = *(unsigned *)xdr->xdr_idx;
+	xdr->xdr_idx += sizeof(unsigned);
+	while (encoded_size && decoded_size) {
+		xdr->xdr_idx = pair + encoded_size;
+		pair = xdr->xdr_idx;
+		if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+			return (false);
+		encoded_size = *(unsigned *)xdr->xdr_idx;
+		xdr->xdr_idx += sizeof(unsigned);
+		if (xdr->xdr_idx > xdr->xdr_buf + xdr->xdr_buf_size)
+			return (false);
+		decoded_size = *(unsigned *)xdr->xdr_idx;
+		xdr->xdr_idx += sizeof(unsigned);
+	}
+	*size = xdr->xdr_idx - xdr->xdr_buf;
+
+	return (true);
+}
+
+/*
+ * Export nvlist to byte stream format.
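+ * The translation is done in place: pair values held in nv_data in native
+ * byte order are rewritten in big-endian XDR encoding.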
+ */ +int +nvlist_export(nvlist_t *nvl) +{ + int rv; + xdr_t xdr = { + .xdr_op = XDR_OP_ENCODE, + .xdr_putint = _putint, + .xdr_putuint = _putuint, + .xdr_buf = nvl->nv_data, + .xdr_idx = nvl->nv_data, + .xdr_buf_size = nvl->nv_size + }; + + if (nvl->nv_header.nvh_encoding != NV_ENCODE_XDR) + return (ENOTSUP); + + nvl->nv_idx = nvl->nv_data; + rv = nvlist_xdr_nvlist(&xdr, nvl); + + return (rv); +} + +/* + * Import nvlist from byte stream. + * Determine the stream size and allocate private copy. + * Then translate the data. + */ +nvlist_t * +nvlist_import(const char *stream, size_t size) +{ + nvlist_t *nvl; + xdr_t xdr = { + .xdr_op = XDR_OP_DECODE, + .xdr_getint = _getint, + .xdr_getuint = _getuint + }; + + /* Check the nvlist head. */ + if (stream[0] != NV_ENCODE_XDR || + (stream[1] != '\0' && stream[1] != '\1') || + stream[2] != '\0' || stream[3] != '\0' || + be32toh(*(uint32_t *)(stream + 4)) != NV_VERSION || + be32toh(*(uint32_t *)(stream + 8)) != NV_UNIQUE_NAME) + return (NULL); + + nvl = malloc(sizeof(*nvl)); + if (nvl == NULL) + return (nvl); + + nvl->nv_header.nvh_encoding = stream[0]; + nvl->nv_header.nvh_endian = stream[1]; + nvl->nv_header.nvh_reserved1 = stream[2]; + nvl->nv_header.nvh_reserved2 = stream[3]; + + xdr.xdr_buf = xdr.xdr_idx = (uint8_t *)stream + 4; + xdr.xdr_buf_size = size - 4; + + if (!nvlist_size_xdr(&xdr, &nvl->nv_asize)) { + free(nvl); + return (NULL); + } + nvl->nv_size = nvl->nv_asize; + nvl->nv_data = malloc(nvl->nv_asize); + if (nvl->nv_data == NULL) { + free(nvl); + return (NULL); + } + nvl->nv_idx = nvl->nv_data; + bcopy(stream + 4, nvl->nv_data, nvl->nv_asize); + + xdr.xdr_buf = xdr.xdr_idx = nvl->nv_data; + xdr.xdr_buf_size = nvl->nv_asize; + + if (nvlist_xdr_nvlist(&xdr, nvl) != 0) { + free(nvl->nv_data); + free(nvl); + nvl = NULL; + } + + return (nvl); +} + +/* + * remove pair from this nvlist. + */ +int +nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) +{ + uint8_t *head, *tail; + nvs_data_t *data; + nvp_header_t *nvp; + nv_string_t *nvp_name; + nv_pair_data_t *nvp_data; + size_t size; + xdr_t xdr; + + if (nvl == NULL || nvl->nv_data == NULL || name == NULL) + return (EINVAL); + + /* Make sure the nvlist size is set correct */ + xdr.xdr_idx = nvl->nv_data; + xdr.xdr_buf = xdr.xdr_idx; + xdr.xdr_buf_size = nvl->nv_size; + if (!nvlist_size_native(&xdr, &nvl->nv_size)) + return (EINVAL); + + data = (nvs_data_t *)nvl->nv_data; + nvp = &data->nvl_pair; /* first pair in nvlist */ + head = (uint8_t *)nvp; + + while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { + nvp_name = (nv_string_t *)(nvp + 1); + + nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] + + NV_ALIGN4(nvp_name->nv_size)); + + if (strlen(name) == nvp_name->nv_size && + memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0 && + (nvp_data->nv_type == type || type == DATA_TYPE_UNKNOWN)) { + /* + * set tail to point to next nvpair and size + * is the length of the tail. + */ + tail = head + nvp->encoded_size; + size = nvl->nv_size - (tail - nvl->nv_data); + + /* adjust the size of the nvlist. */ + nvl->nv_size -= nvp->encoded_size; + bcopy(tail, head, size); + return (0); + } + /* Not our pair, skip to next. 
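Pairs are stored
+		 * back to back, so the encoded size is also the offset of the
+		 * next pair header.
+		 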
*/ + head = head + nvp->encoded_size; + nvp = (nvp_header_t *)head; + } + return (ENOENT); +} + +static int +clone_nvlist(const nvlist_t *nvl, const uint8_t *ptr, unsigned size, + nvlist_t **nvlist) +{ + nvlist_t *nv; + + nv = calloc(1, sizeof(*nv)); + if (nv == NULL) + return (ENOMEM); + + nv->nv_header = nvl->nv_header; + nv->nv_asize = size; + nv->nv_size = size; + nv->nv_data = malloc(nv->nv_asize); + if (nv->nv_data == NULL) { + free(nv); + return (ENOMEM); + } + + bcopy(ptr, nv->nv_data, nv->nv_asize); + *nvlist = nv; + return (0); +} + +/* + * Return the next nvlist in an nvlist array. + */ +static uint8_t * +nvlist_next(const uint8_t *ptr) +{ + nvs_data_t *data; + nvp_header_t *nvp; + + data = (nvs_data_t *)ptr; + nvp = &data->nvl_pair; /* first pair in nvlist */ + + while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { + nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size); + } + return ((uint8_t *)nvp + sizeof(*nvp)); +} + +/* + * Note: nvlist and nvlist array must be freed by caller. + */ +int +nvlist_find(const nvlist_t *nvl, const char *name, data_type_t type, + int *elementsp, void *valuep, int *sizep) +{ + nvs_data_t *data; + nvp_header_t *nvp; + nv_string_t *nvp_name; + nv_pair_data_t *nvp_data; + nvlist_t **nvlist, *nv; + uint8_t *ptr; + int rv; + + if (nvl == NULL || nvl->nv_data == NULL || name == NULL) + return (EINVAL); + + data = (nvs_data_t *)nvl->nv_data; + nvp = &data->nvl_pair; /* first pair in nvlist */ + + while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { + nvp_name = (nv_string_t *)((uint8_t *)nvp + sizeof(*nvp)); + if (nvl->nv_data + nvl->nv_size < + nvp_name->nv_data + nvp_name->nv_size) + return (EIO); + + nvp_data = (nv_pair_data_t *) + NV_ALIGN4((uintptr_t)&nvp_name->nv_data[0] + + nvp_name->nv_size); + + if (strlen(name) == nvp_name->nv_size && + memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0 && + (nvp_data->nv_type == type || type == DATA_TYPE_UNKNOWN)) { + if (elementsp != NULL) + *elementsp = nvp_data->nv_nelem; + switch (nvp_data->nv_type) { + case DATA_TYPE_UINT64: + bcopy(nvp_data->nv_data, valuep, + sizeof(uint64_t)); + return (0); + case DATA_TYPE_STRING: + nvp_name = (nv_string_t *)nvp_data->nv_data; + if (sizep != NULL) { + *sizep = nvp_name->nv_size; + } + *(const uint8_t **)valuep = + &nvp_name->nv_data[0]; + return (0); + case DATA_TYPE_NVLIST: + ptr = &nvp_data->nv_data[0]; + rv = clone_nvlist(nvl, ptr, + nvlist_next(ptr) - ptr, &nv); + if (rv == 0) { + *(nvlist_t **)valuep = nv; + } + return (rv); + + case DATA_TYPE_NVLIST_ARRAY: + nvlist = calloc(nvp_data->nv_nelem, + sizeof(nvlist_t *)); + if (nvlist == NULL) + return (ENOMEM); + ptr = &nvp_data->nv_data[0]; + rv = 0; + for (unsigned i = 0; i < nvp_data->nv_nelem; + i++) { + rv = clone_nvlist(nvl, ptr, + nvlist_next(ptr) - ptr, &nvlist[i]); + if (rv != 0) + goto error; + ptr = nvlist_next(ptr); + } + *(nvlist_t ***)valuep = nvlist; + return (rv); + } + return (EIO); + } + /* Not our pair, skip to next. 
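The advanced
+		 * pointer is bounds-checked against the end of nv_data before
+		 * the next iteration dereferences it.
+		 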
*/ + nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size); + if (nvl->nv_data + nvl->nv_size < (uint8_t *)nvp) + return (EIO); + } + return (ENOENT); +error: + for (unsigned i = 0; i < nvp_data->nv_nelem; i++) { + free(nvlist[i]->nv_data); + free(nvlist[i]); + } + free(nvlist); + return (rv); +} + +static int +get_value_size(data_type_t type, const void *data, uint32_t nelem) +{ + uint64_t value_sz = 0; + + switch (type) { + case DATA_TYPE_BOOLEAN: + value_sz = 0; + break; + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + case DATA_TYPE_INT16: + case DATA_TYPE_UINT16: + case DATA_TYPE_INT32: + case DATA_TYPE_UINT32: + /* Our smallest data unit is 32-bit */ + value_sz = sizeof(uint32_t); + break; + case DATA_TYPE_HRTIME: + case DATA_TYPE_INT64: + value_sz = sizeof(int64_t); + break; + case DATA_TYPE_UINT64: + value_sz = sizeof(uint64_t); + break; + case DATA_TYPE_STRING: + if (data == NULL) + value_sz = 0; + else + value_sz = strlen(data) + 1; + break; + case DATA_TYPE_BYTE_ARRAY: + value_sz = nelem * sizeof(uint8_t); + break; + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + value_sz = (uint64_t)nelem * sizeof(uint32_t); + break; + case DATA_TYPE_INT64_ARRAY: + value_sz = (uint64_t)nelem * sizeof(int64_t); + break; + case DATA_TYPE_UINT64_ARRAY: + value_sz = (uint64_t)nelem * sizeof(uint64_t); + break; + case DATA_TYPE_STRING_ARRAY: + value_sz = (uint64_t)nelem * sizeof(uint64_t); + + if (data != NULL) { + char *const *strs = data; + uint32_t i; + + for (i = 0; i < nelem; i++) { + if (strs[i] == NULL) + return (-1); + value_sz += strlen(strs[i]) + 1; + } + } + break; + case DATA_TYPE_NVLIST: + /* + * The decoded size of nvlist is constant. + */ + value_sz = NV_ALIGN(6 * 4); /* sizeof nvlist_t */ + break; + case DATA_TYPE_NVLIST_ARRAY: + value_sz = (uint64_t)nelem * sizeof(uint64_t) + + (uint64_t)nelem * NV_ALIGN(6 * 4); /* sizeof nvlist_t */ + break; + default: + return (-1); + } + + return (value_sz > INT32_MAX ? 
-1 : (int)value_sz); +} + +static int +get_nvp_data_size(data_type_t type, const void *data, uint32_t nelem) +{ + uint64_t value_sz = 0; + xdr_t xdr; + size_t size; + + switch (type) { + case DATA_TYPE_BOOLEAN: + value_sz = 0; + break; + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + case DATA_TYPE_INT16: + case DATA_TYPE_UINT16: + case DATA_TYPE_INT32: + case DATA_TYPE_UINT32: + /* Our smallest data unit is 32-bit */ + value_sz = sizeof(uint32_t); + break; + case DATA_TYPE_HRTIME: + case DATA_TYPE_INT64: + case DATA_TYPE_UINT64: + value_sz = sizeof(uint64_t); + break; + case DATA_TYPE_STRING: + value_sz = 4 + NV_ALIGN4(strlen(data)); + break; + case DATA_TYPE_BYTE_ARRAY: + value_sz = NV_ALIGN4(nelem); + break; + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + value_sz = 4 + (uint64_t)nelem * sizeof(uint32_t); + break; + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: + value_sz = 4 + (uint64_t)nelem * sizeof(uint64_t); + break; + case DATA_TYPE_STRING_ARRAY: + if (data != NULL) { + char *const *strs = data; + uint32_t i; + + for (i = 0; i < nelem; i++) { + value_sz += 4 + NV_ALIGN4(strlen(strs[i])); + } + } + break; + case DATA_TYPE_NVLIST: + xdr.xdr_idx = ((nvlist_t *)data)->nv_data; + xdr.xdr_buf = xdr.xdr_idx; + xdr.xdr_buf_size = ((nvlist_t *)data)->nv_size; + + if (!nvlist_size_native(&xdr, &size)) + return (-1); + + value_sz = size; + break; + case DATA_TYPE_NVLIST_ARRAY: + value_sz = 0; + for (uint32_t i = 0; i < nelem; i++) { + xdr.xdr_idx = ((nvlist_t **)data)[i]->nv_data; + xdr.xdr_buf = xdr.xdr_idx; + xdr.xdr_buf_size = ((nvlist_t **)data)[i]->nv_size; + + if (!nvlist_size_native(&xdr, &size)) + return (-1); + value_sz += size; + } + break; + default: + return (-1); + } + + return (value_sz > INT32_MAX ? -1 : (int)value_sz); +} + +#define NVPE_SIZE(name_len, data_len) \ + (4 + 4 + 4 + NV_ALIGN4(name_len) + 4 + 4 + data_len) +#define NVP_SIZE(name_len, data_len) \ + (NV_ALIGN((4 * 4) + (name_len)) + NV_ALIGN(data_len)) + +static int +nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, + uint32_t nelem, const void *data) +{ + nvs_data_t *nvs; + nvp_header_t head, *hp; + uint8_t *ptr; + size_t namelen; + int decoded_size, encoded_size; + xdr_t xdr = { + .xdr_op = XDR_OP_ENCODE, + .xdr_putint = _putint_mem, + .xdr_putuint = _putuint_mem, + .xdr_buf = nvl->nv_data, + .xdr_idx = nvl->nv_data, + .xdr_buf_size = nvl->nv_size + }; + + nvs = (nvs_data_t *)nvl->nv_data; + if (nvs->nvl_nvflag & NV_UNIQUE_NAME) + (void) nvlist_remove(nvl, name, type); + + xdr.xdr_buf = nvl->nv_data; + xdr.xdr_idx = nvl->nv_data; + xdr.xdr_buf_size = nvl->nv_size; + if (!nvlist_size_native(&xdr, &nvl->nv_size)) + return (EINVAL); + + namelen = strlen(name); + if ((decoded_size = get_value_size(type, data, nelem)) < 0) + return (EINVAL); + if ((encoded_size = get_nvp_data_size(type, data, nelem)) < 0) + return (EINVAL); + + /* + * The encoded size is calculated as: + * encode_size (4) + decode_size (4) + + * name string size (4 + NV_ALIGN4(namelen) + + * data type (4) + nelem size (4) + datalen + * + * The decoded size is calculated as: + * Note: namelen is with terminating 0. 
+ * NV_ALIGN(sizeof(nvpair_t) (4 * 4) + namelen + 1) + + * NV_ALIGN(data_len) + */ + + head.encoded_size = NVPE_SIZE(namelen, encoded_size); + head.decoded_size = NVP_SIZE(namelen + 1, decoded_size); + + if (nvl->nv_asize - nvl->nv_size < head.encoded_size + 8) { + ptr = realloc(nvl->nv_data, nvl->nv_asize + head.encoded_size); + if (ptr == NULL) + return (ENOMEM); + nvl->nv_data = ptr; + nvl->nv_asize += head.encoded_size; + } + nvl->nv_idx = nvl->nv_data + nvl->nv_size - sizeof(*hp); + bzero(nvl->nv_idx, head.encoded_size + 8); + hp = (nvp_header_t *)nvl->nv_idx; + *hp = head; + nvl->nv_idx += sizeof(*hp); + + xdr.xdr_buf = nvl->nv_data; + xdr.xdr_buf_size = nvl->nv_asize; + xdr.xdr_idx = nvl->nv_idx; + + xdr.xdr_idx += xdr.xdr_putuint(&xdr, namelen); + strlcpy((char *)xdr.xdr_idx, name, namelen + 1); + xdr.xdr_idx += NV_ALIGN4(namelen); + xdr.xdr_idx += xdr.xdr_putuint(&xdr, type); + xdr.xdr_idx += xdr.xdr_putuint(&xdr, nelem); + + switch (type) { + case DATA_TYPE_BOOLEAN: + break; + + case DATA_TYPE_BYTE_ARRAY: + xdr.xdr_idx += xdr.xdr_putuint(&xdr, encoded_size); + bcopy(data, xdr.xdr_idx, nelem); + xdr.xdr_idx += NV_ALIGN4(encoded_size); + break; + + case DATA_TYPE_STRING: + encoded_size = strlen(data); + xdr.xdr_idx += xdr.xdr_putuint(&xdr, encoded_size); + strlcpy((char *)xdr.xdr_idx, data, encoded_size + 1); + xdr.xdr_idx += NV_ALIGN4(encoded_size); + break; + + case DATA_TYPE_STRING_ARRAY: + for (uint32_t i = 0; i < nelem; i++) { + encoded_size = strlen(((char **)data)[i]); + xdr.xdr_idx += xdr.xdr_putuint(&xdr, encoded_size); + strlcpy((char *)xdr.xdr_idx, ((char **)data)[i], + encoded_size + 1); + xdr.xdr_idx += NV_ALIGN4(encoded_size); + } + break; + + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + xdr_char(&xdr, (char *)data); + break; + + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + xdr_array(&xdr, nelem, (xdrproc_t)xdr_char); + break; + + case DATA_TYPE_INT16: + xdr_short(&xdr, (short *)data); + break; + + case DATA_TYPE_UINT16: + xdr_u_short(&xdr, (unsigned short *)data); + break; + + case DATA_TYPE_INT16_ARRAY: + xdr_array(&xdr, nelem, (xdrproc_t)xdr_short); + break; + + case DATA_TYPE_UINT16_ARRAY: + xdr_array(&xdr, nelem, (xdrproc_t)xdr_u_short); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_INT32: + xdr_int(&xdr, (int *)data); + break; + + case DATA_TYPE_UINT32: + xdr_u_int(&xdr, (unsigned int *)data); + break; + + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_INT32_ARRAY: + xdr_array(&xdr, nelem, (xdrproc_t)xdr_int); + break; + + case DATA_TYPE_UINT32_ARRAY: + xdr_array(&xdr, nelem, (xdrproc_t)xdr_u_int); + break; + + case DATA_TYPE_INT64: + xdr_int64(&xdr, (int64_t *)data); + break; + + case DATA_TYPE_UINT64: + xdr_uint64(&xdr, (uint64_t *)data); + break; + + case DATA_TYPE_INT64_ARRAY: + xdr_array(&xdr, nelem, (xdrproc_t)xdr_int64); + break; + + case DATA_TYPE_UINT64_ARRAY: + xdr_array(&xdr, nelem, (xdrproc_t)xdr_uint64); + break; + + case DATA_TYPE_NVLIST: + bcopy(((nvlist_t *)data)->nv_data, xdr.xdr_idx, encoded_size); + break; + + case DATA_TYPE_NVLIST_ARRAY: { + size_t size; + xdr_t xdr_nv; + + for (uint32_t i = 0; i < nelem; i++) { + xdr_nv.xdr_idx = ((nvlist_t **)data)[i]->nv_data; + xdr_nv.xdr_buf = xdr_nv.xdr_idx; + xdr_nv.xdr_buf_size = ((nvlist_t **)data)[i]->nv_size; + + if (!nvlist_size_native(&xdr_nv, &size)) + return (EINVAL); + + bcopy(((nvlist_t **)data)[i]->nv_data, xdr.xdr_idx, + size); + xdr.xdr_idx += size; + } + break; + } + default: + bcopy(data, xdr.xdr_idx, encoded_size); + } + + 
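/*
+	 * The 8 bytes zeroed past the new pair above become the new list
+	 * terminator; only the pair itself is counted in nv_size.
+	 */
+	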
nvl->nv_size += head.encoded_size; + + return (0); +} + +int +nvlist_add_boolean_value(nvlist_t *nvl, const char *name, bool value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, + &value)); +} + +int +nvlist_add_byte(nvlist_t *nvl, const char *name, uint8_t value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &value)); +} + +int +nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &value)); +} + +int +nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &value)); +} + +int +nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &value)); +} + +int +nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &value)); +} + +int +nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &value)); +} + +int +nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &value)); +} + +int +nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &value)); +} + +int +nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &value)); +} + +int +nvlist_add_string(nvlist_t *nvl, const char *name, const char *value) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, value)); +} + +int +nvlist_add_boolean_array(nvlist_t *nvl, const char *name, + bool *a, uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); +} + +int +nvlist_add_byte_array(nvlist_t *nvl, const char *name, uint8_t *a, uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); +} + +int +nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); +} + +int +nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); +} + +int +nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); +} + +int +nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, + uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); +} + +int +nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); +} + +int +nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, + uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); +} + +int +nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); +} + +int +nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, + uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); +} + +int +nvlist_add_string_array(nvlist_t *nvl, const char *name, + char * const *a, uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); +} + +int 
+nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); +} + +int +nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, + uint32_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); +} + +static const char *typenames[] = { + "DATA_TYPE_UNKNOWN", + "DATA_TYPE_BOOLEAN", + "DATA_TYPE_BYTE", + "DATA_TYPE_INT16", + "DATA_TYPE_UINT16", + "DATA_TYPE_INT32", + "DATA_TYPE_UINT32", + "DATA_TYPE_INT64", + "DATA_TYPE_UINT64", + "DATA_TYPE_STRING", + "DATA_TYPE_BYTE_ARRAY", + "DATA_TYPE_INT16_ARRAY", + "DATA_TYPE_UINT16_ARRAY", + "DATA_TYPE_INT32_ARRAY", + "DATA_TYPE_UINT32_ARRAY", + "DATA_TYPE_INT64_ARRAY", + "DATA_TYPE_UINT64_ARRAY", + "DATA_TYPE_STRING_ARRAY", + "DATA_TYPE_HRTIME", + "DATA_TYPE_NVLIST", + "DATA_TYPE_NVLIST_ARRAY", + "DATA_TYPE_BOOLEAN_VALUE", + "DATA_TYPE_INT8", + "DATA_TYPE_UINT8", + "DATA_TYPE_BOOLEAN_ARRAY", + "DATA_TYPE_INT8_ARRAY", + "DATA_TYPE_UINT8_ARRAY" +}; + +int +nvpair_type_from_name(const char *name) +{ + unsigned i; + + for (i = 0; i < nitems(typenames); i++) { + if (strcmp(name, typenames[i]) == 0) + return (i); + } + return (0); +} + +nvp_header_t * +nvpair_find(nvlist_t *nv, const char *name) +{ + nvp_header_t *nvh; + + nvh = NULL; + while ((nvh = nvlist_next_nvpair(nv, nvh)) != NULL) { + nv_string_t *nvp_name; + + nvp_name = (nv_string_t *)(nvh + 1); + if (nvp_name->nv_size == strlen(name) && + memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0) + break; + } + return (nvh); +} + +void +nvpair_print(nvp_header_t *nvp, unsigned int indent) +{ + nv_string_t *nvp_name; + nv_pair_data_t *nvp_data; + nvlist_t nvlist; + unsigned i, j; + xdr_t xdr = { + .xdr_op = XDR_OP_DECODE, + .xdr_getint = _getint_mem, + .xdr_getuint = _getuint_mem, + .xdr_buf = (const uint8_t *)nvp, + .xdr_idx = NULL, + .xdr_buf_size = nvp->encoded_size + }; + + nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp)); + nvp_data = (nv_pair_data_t *) + NV_ALIGN4((uintptr_t)&nvp_name->nv_data[0] + nvp_name->nv_size); + + for (i = 0; i < indent; i++) + printf(" "); + + printf("%s [%d] %.*s", typenames[nvp_data->nv_type], + nvp_data->nv_nelem, nvp_name->nv_size, nvp_name->nv_data); + + xdr.xdr_idx = nvp_data->nv_data; + switch (nvp_data->nv_type) { + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: { + char c; + + if (xdr_char(&xdr, &c)) + printf(" = 0x%x\n", c); + break; + } + + case DATA_TYPE_INT16: + case DATA_TYPE_UINT16: { + unsigned short u; + + if (xdr_u_short(&xdr, &u)) + printf(" = 0x%hx\n", u); + break; + } + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_INT32: + case DATA_TYPE_UINT32: { + unsigned u; + + if (xdr_u_int(&xdr, &u)) + printf(" = 0x%x\n", u); + break; + } + + case DATA_TYPE_INT64: + case DATA_TYPE_UINT64: { + uint64_t u; + + if (xdr_uint64(&xdr, &u)) + printf(" = 0x%jx\n", (uintmax_t)u); + break; + } + + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *u; + + if (xdr_array(&xdr, nvp_data->nv_nelem, + (xdrproc_t)xdr_uint64)) { + u = (uint64_t *)(nvp_data->nv_data + sizeof(unsigned)); + for (i = 0; i < nvp_data->nv_nelem; i++) + printf(" [%u] = 0x%jx", i, (uintmax_t)u[i]); + printf("\n"); + } + + break; + } + + case DATA_TYPE_STRING: + case DATA_TYPE_STRING_ARRAY: + nvp_name = (nv_string_t *)&nvp_data->nv_data[0]; + for (i = 0; i < nvp_data->nv_nelem; i++) { + printf(" = \"%.*s\"\n", nvp_name->nv_size, + nvp_name->nv_data); + } + break; + + case DATA_TYPE_NVLIST: + printf("\n"); + nvlist.nv_data = 
&nvp_data->nv_data[0]; + nvlist_print(&nvlist, indent + 2); + break; + + case DATA_TYPE_NVLIST_ARRAY: + nvlist.nv_data = &nvp_data->nv_data[0]; + for (j = 0; j < nvp_data->nv_nelem; j++) { + size_t size; + + printf("[%d]\n", j); + nvlist_print(&nvlist, indent + 2); + if (j != nvp_data->nv_nelem - 1) { + for (i = 0; i < indent; i++) + printf(" "); + printf("%s %.*s", + typenames[nvp_data->nv_type], + nvp_name->nv_size, + nvp_name->nv_data); + } + xdr.xdr_idx = nvlist.nv_data; + xdr.xdr_buf = xdr.xdr_idx; + xdr.xdr_buf_size = nvp->encoded_size - + (xdr.xdr_idx - (uint8_t *)nvp); + + if (!nvlist_size_native(&xdr, &size)) + return; + + nvlist.nv_data += size; + } + break; + + default: + printf("\n"); + } +} + +void +nvlist_print(const nvlist_t *nvl, unsigned int indent) +{ + nvs_data_t *data; + nvp_header_t *nvp; + + data = (nvs_data_t *)nvl->nv_data; + nvp = &data->nvl_pair; /* first pair in nvlist */ + while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { + nvpair_print(nvp, indent); + nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size); + } + printf("%*s\n", indent + 13, "End of nvlist"); +} Index: usr.sbin/makefs/zfs/zfsimpl.h =================================================================== --- /dev/null +++ usr.sbin/makefs/zfs/zfsimpl.h @@ -0,0 +1,2119 @@ +/*- + * Copyright (c) 2002 McAfee, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and McAfee Research,, the Security Research Division of + * McAfee, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as + * part of the DARPA CHATS research program + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 by Saso Kiselkov. All rights reserved.
+ */
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/queue.h>
+
+#ifndef _ZFSIMPL_H_
+#define	_ZFSIMPL_H_
+
+#define	MAXNAMELEN	256
+
+#define	_NOTE(s)
+
+/*
+ * AVL comparator helpers
+ */
+#define	AVL_ISIGN(a)	(((a) > 0) - ((a) < 0))
+#define	AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
+#define	AVL_PCMP(a, b)	\
+	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
+typedef enum { B_FALSE, B_TRUE } boolean_t;
+
+/* CRC64 table */
+#define	ZFS_CRC64_POLY	0xC96C5795D7870F42UL	/* ECMA-182, reflected form */
+
+/*
+ * Macros for various sorts of alignment and rounding when the alignment
+ * is known to be a power of 2.
+ */
+#define	P2ALIGN(x, align)		((x) & -(align))
+#define	P2PHASE(x, align)		((x) & ((align) - 1))
+#define	P2NPHASE(x, align)		(-(x) & ((align) - 1))
+#define	P2ROUNDUP(x, align)		(-(-(x) & -(align)))
+#define	P2END(x, align)			(-(~(x) & -(align)))
+#define	P2PHASEUP(x, align, phase)	((phase) - (((phase) - (x)) & -(align)))
+#define	P2BOUNDARY(off, len, align)	(((off) ^ ((off) + (len) - 1)) > (align) - 1)
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define	BF32_DECODE(x, low, len)	P2PHASE((x) >> (low), 1U << (len))
+#define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
+#define	BF32_ENCODE(x, low, len)	(P2PHASE((x), 1U << (len)) << (low))
+#define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))
+
+#define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
+#define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)
+
+#define	BF32_SET(x, low, len, val)	\
+	((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
+#define	BF64_SET(x, low, len, val)	\
+	((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
+
+#define	BF32_GET_SB(x, low, len, shift, bias)	\
+	((BF32_GET(x, low, len) + (bias)) << (shift))
+#define	BF64_GET_SB(x, low, len, shift, bias)	\
+	((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define	BF32_SET_SB(x, low, len, shift, bias, val)	\
+	BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define	BF64_SET_SB(x, low, len, shift, bias, val)	\
+	BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+
+/*
+ * Macros to reverse byte order
+ */
+#define	BSWAP_8(x)	((x) & 0xff)
+#define	BSWAP_16(x)	((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
+#define	BSWAP_32(x)	((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
+#define	BSWAP_64(x)	((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
+
+#define	SPA_MINBLOCKSHIFT	9
+#define	SPA_OLDMAXBLOCKSHIFT	17
+#define	SPA_MAXBLOCKSHIFT	24
+#define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
+#define	SPA_OLDMAXBLOCKSIZE	(1ULL << SPA_OLDMAXBLOCKSHIFT)
+#define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
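+ *
+ * As a worked example of the sector-shifted, biased encoding
+ * (BF64_GET_SB/BF64_SET_SB above): a 128K logical block is stored in the
+ * 16-bit LSIZE field as (128K >> SPA_MINBLOCKSHIFT) - 1 = 255.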
+ */ +#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ +#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ +#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ + +/* + * All SPA data is represented by 128-bit data virtual addresses (DVAs). + * The members of the dva_t should be considered opaque outside the SPA. + */ +typedef struct dva { + uint64_t dva_word[2]; +} dva_t; + +/* + * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. + */ +typedef struct zio_cksum { + uint64_t zc_word[4]; +} zio_cksum_t; + +/* + * Some checksums/hashes need a 256-bit initialization salt. This salt is kept + * secret and is suitable for use in MAC algorithms as the key. + */ +typedef struct zio_cksum_salt { + uint8_t zcs_bytes[32]; +} zio_cksum_salt_t; + +/* + * Each block is described by its DVAs, time of birth, checksum, etc. + * The word-by-word, bit-by-bit layout of the blkptr is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | vdev3 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 |G| offset3 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | physical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | checksum[2] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | checksum[3] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * B byteorder (endianness) + * D dedup + * X encryption (on version 30, which is not supported) + * E blkptr_t contains embedded data (see below) + * lvl level of indirection + * type DMU object type + * phys birth txg of block allocation; zero if same as logical birth txg + * log. birth transaction group in which the block was logically born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ + +/* + * "Embedded" blkptr_t's don't actually point to a block, instead they + * have a data payload embedded in the blkptr_t itself. 
See the comment + * in blkptr.c for more details. + * + * The blkptr_t is laid out as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | payload | + * 1 | payload | + * 2 | payload | + * 3 | payload | + * 4 | payload | + * 5 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | payload | + * 8 | payload | + * 9 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | payload | + * c | payload | + * d | payload | + * e | payload | + * f | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * payload contains the embedded data + * B (byteorder) byteorder (endianness) + * D (dedup) padding (set to zero) + * X encryption (set to zero; see above) + * E (embedded) set to one + * lvl indirection level + * type DMU object type + * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) + * comp compression function of payload + * PSIZE size of payload after compression, in bytes + * LSIZE logical size of payload, in bytes + * note that 25 bits is enough to store the largest + * "normal" BP's LSIZE (2^16 * 2^9) in bytes + * log. birth transaction group in which the block was logically born + * + * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded + * bp's they are stored in units of SPA_MINBLOCKSHIFT. + * Generally, the generic BP_GET_*() macros can be used on embedded BP's. + * The B, D, X, lvl, type, and comp fields are stored the same as with normal + * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must + * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before + * other macros, as they assert that they are only used on BP's of the correct + * "embedded-ness". + */ + +#define BPE_GET_ETYPE(bp) \ + (assert(BP_IS_EMBEDDED(bp)), \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BPE_SET_ETYPE(bp, t) do { \ + assert(BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, t); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_LSIZE(bp) \ + (assert(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) +#define BPE_SET_LSIZE(bp, x) do { \ + assert(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_PSIZE(bp) \ + (assert(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) +#define BPE_SET_PSIZE(bp, x) do { \ + assert(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +typedef enum bp_embedded_type { + BP_EMBEDDED_TYPE_DATA, + BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. 
*/ + NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED +} bp_embedded_type_t; + +#define BPE_NUM_WORDS 14 +#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) +#define BPE_IS_PAYLOADWORD(bp, wp) \ + ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ + +typedef struct blkptr { + dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_phys_birth; /* txg when block was allocated */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + +/* + * Macros to get and set fields in a bp or DVA. + */ +#define DVA_GET_ASIZE(dva) \ + BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_ASIZE(dva, x) \ + BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ + SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) +#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) + +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) +#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) + +#define DVA_GET_OFFSET(dva) \ + BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_OFFSET(dva, x) \ + BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) +#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) + +#define BP_GET_LSIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? \ + (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ + BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) do { \ + assert(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_PSIZE(bp) \ + BF64_GET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define BP_SET_PSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) + +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) + +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) + +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) + +#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) + +#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) +#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) + +#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_PHYSICAL_BIRTH(bp) \ + ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + assert(!BP_IS_EMBEDDED(bp)); \ + (bp)->blk_birth = (logical); \ + (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ +} + +#define BP_GET_FILL(bp) \ + ((BP_IS_EMBEDDED(bp)) ? 
1 : (bp)->blk_fill) + +#define BP_SET_FILL(bp, fill) \ +{ \ + (bp)->blk_fill = fill; \ +} + +#define BP_GET_ASIZE(bp) \ + (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_GET_UCSIZE(bp) \ + ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + +#define BP_GET_NDVAS(bp) \ + (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define DVA_EQUAL(dva1, dva2) \ + ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ + (dva1)->dva_word[0] == (dva2)->dva_word[0]) + +#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ + ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ + ((zc1).zc_word[3] - (zc2).zc_word[3]))) + + +#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) + +#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ +{ \ + (zcp)->zc_word[0] = w0; \ + (zcp)->zc_word[1] = w1; \ + (zcp)->zc_word[2] = w2; \ + (zcp)->zc_word[3] = w3; \ +} + +#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) +#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ + (dva)->dva_word[1] == 0ULL) +#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp)) +#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) + +#define BP_ZERO(bp) \ +{ \ + (bp)->blk_dva[0].dva_word[0] = 0; \ + (bp)->blk_dva[0].dva_word[1] = 0; \ + (bp)->blk_dva[1].dva_word[0] = 0; \ + (bp)->blk_dva[1].dva_word[1] = 0; \ + (bp)->blk_dva[2].dva_word[0] = 0; \ + (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_prop = 0; \ + (bp)->blk_pad[0] = 0; \ + (bp)->blk_pad[1] = 0; \ + (bp)->blk_phys_birth = 0; \ + (bp)->blk_birth = 0; \ + (bp)->blk_fill = 0; \ + ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ +} + +#if BYTE_ORDER == _BIG_ENDIAN +#define ZFS_HOST_BYTEORDER (0ULL) +#else +#define ZFS_HOST_BYTEORDER (1ULL) +#endif + +#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) +#define BPE_NUM_WORDS 14 +#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) +#define BPE_IS_PAYLOADWORD(bp, wp) \ + ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + +#define TXG_INITIAL 4 + +/* + * Embedded checksum + */ +#define ZEC_MAGIC 0x210da7ab10c7a11ULL + +typedef struct zio_eck { + uint64_t zec_magic; /* for validation, endianness */ + zio_cksum_t zec_cksum; /* 256-bit checksum */ +} zio_eck_t; + +/* + * Gang block headers are self-checksumming and contain an array + * of block pointers. + */ +#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE +#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_eck_t)) / sizeof (blkptr_t)) +#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_eck_t) - \ + (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ + sizeof (uint64_t)) + +typedef struct zio_gbh { + blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; + uint64_t zg_filler[SPA_GBH_FILLER]; + zio_eck_t zg_tail; +} zio_gbh_phys_t; + +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_PAD_SIZE (8 << 10) +/* 2 padding areas (vl_pad1 and vl_be) to skip */ +#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 +#define VDEV_PHYS_SIZE (112 << 10) +#define VDEV_UBERBLOCK_RING (128 << 10) + +/* + * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock + * ring when MMP is enabled. + */ +#define MMP_BLOCKS_PER_LABEL 1 + +/* The largest uberblock we support is 8k. 
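At the maximum
+ * shift, the 128K uberblock ring holds 128K >> 13 = 16 slots per label.
+ 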
*/ +#define MAX_UBERBLOCK_SHIFT (13) +#define VDEV_UBERBLOCK_SHIFT(vd) \ + MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT) +#define VDEV_UBERBLOCK_COUNT(vd) \ + (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) +#define VDEV_UBERBLOCK_OFFSET(vd, n) \ + offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) +#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) + +typedef struct vdev_phys { + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; + zio_eck_t vp_zbt; +} vdev_phys_t; + +typedef enum vbe_vers { + /* The bootenv file is stored as ascii text in the envblock */ + VB_RAW = 0, + + /* + * The bootenv file is converted to an nvlist and then packed into the + * envblock. + */ + VB_NVLIST = 1 +} vbe_vers_t; + +typedef struct vdev_boot_envblock { + uint64_t vbe_version; + char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - + sizeof (zio_eck_t)]; + zio_eck_t vbe_zbt; +} vdev_boot_envblock_t; + +_Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE, + "incorrect vdev_boot_envblock size"); + +typedef struct vdev_label { + char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ + vdev_boot_envblock_t vl_be; /* 8K */ + vdev_phys_t vl_vdev_phys; /* 112K */ + char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ +} vdev_label_t; /* 256K total */ + +/* + * vdev_dirty() flags + */ +#define VDD_METASLAB 0x01 +#define VDD_DTL 0x02 + +/* + * Size and offset of embedded boot loader region on each label. + * The total size of the first two labels plus the boot area is 4MB. + */ +#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) +#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ + +/* + * Size of label regions at the start and end of each leaf device. + */ +#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) +#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) +#define VDEV_LABELS 4 + +enum zio_checksum { + ZIO_CHECKSUM_INHERIT = 0, + ZIO_CHECKSUM_ON, + ZIO_CHECKSUM_OFF, + ZIO_CHECKSUM_LABEL, + ZIO_CHECKSUM_GANG_HEADER, + ZIO_CHECKSUM_ZILOG, + ZIO_CHECKSUM_FLETCHER_2, + ZIO_CHECKSUM_FLETCHER_4, + ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_ZILOG2, + ZIO_CHECKSUM_NOPARITY, + ZIO_CHECKSUM_SHA512, + ZIO_CHECKSUM_SKEIN, + ZIO_CHECKSUM_EDONR, + ZIO_CHECKSUM_FUNCTIONS +}; + +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 +#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON + +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, + ZIO_COMPRESS_LZ4, + ZIO_COMPRESS_ZSTD, + ZIO_COMPRESS_FUNCTIONS +}; + +enum zio_zstd_levels { + ZIO_ZSTD_LEVEL_INHERIT = 0, + ZIO_ZSTD_LEVEL_1, +#define ZIO_ZSTD_LEVEL_MIN ZIO_ZSTD_LEVEL_1 + ZIO_ZSTD_LEVEL_2, + ZIO_ZSTD_LEVEL_3, +#define ZIO_ZSTD_LEVEL_DEFAULT ZIO_ZSTD_LEVEL_3 + ZIO_ZSTD_LEVEL_4, + ZIO_ZSTD_LEVEL_5, + ZIO_ZSTD_LEVEL_6, + ZIO_ZSTD_LEVEL_7, + ZIO_ZSTD_LEVEL_8, + ZIO_ZSTD_LEVEL_9, + ZIO_ZSTD_LEVEL_10, + ZIO_ZSTD_LEVEL_11, + ZIO_ZSTD_LEVEL_12, + ZIO_ZSTD_LEVEL_13, + ZIO_ZSTD_LEVEL_14, + ZIO_ZSTD_LEVEL_15, + ZIO_ZSTD_LEVEL_16, + ZIO_ZSTD_LEVEL_17, + ZIO_ZSTD_LEVEL_18, + ZIO_ZSTD_LEVEL_19, +#define ZIO_ZSTD_LEVEL_MAX ZIO_ZSTD_LEVEL_19 + ZIO_ZSTD_LEVEL_RESERVE = 101, /* Leave room for new positive levels */ + ZIO_ZSTD_LEVEL_FAST, /* Fast levels are negative */ + ZIO_ZSTD_LEVEL_FAST_1, +#define 
ZIO_ZSTD_LEVEL_FAST_DEFAULT ZIO_ZSTD_LEVEL_FAST_1 + ZIO_ZSTD_LEVEL_FAST_2, + ZIO_ZSTD_LEVEL_FAST_3, + ZIO_ZSTD_LEVEL_FAST_4, + ZIO_ZSTD_LEVEL_FAST_5, + ZIO_ZSTD_LEVEL_FAST_6, + ZIO_ZSTD_LEVEL_FAST_7, + ZIO_ZSTD_LEVEL_FAST_8, + ZIO_ZSTD_LEVEL_FAST_9, + ZIO_ZSTD_LEVEL_FAST_10, + ZIO_ZSTD_LEVEL_FAST_20, + ZIO_ZSTD_LEVEL_FAST_30, + ZIO_ZSTD_LEVEL_FAST_40, + ZIO_ZSTD_LEVEL_FAST_50, + ZIO_ZSTD_LEVEL_FAST_60, + ZIO_ZSTD_LEVEL_FAST_70, + ZIO_ZSTD_LEVEL_FAST_80, + ZIO_ZSTD_LEVEL_FAST_90, + ZIO_ZSTD_LEVEL_FAST_100, + ZIO_ZSTD_LEVEL_FAST_500, + ZIO_ZSTD_LEVEL_FAST_1000, +#define ZIO_ZSTD_LEVEL_FAST_MAX ZIO_ZSTD_LEVEL_FAST_1000 + ZIO_ZSTD_LEVEL_AUTO = 251, /* Reserved for future use */ + ZIO_ZSTD_LEVEL_LEVELS +}; + +#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB +#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF + +/* + * On-disk version number. + */ +#define SPA_VERSION_1 1ULL +#define SPA_VERSION_2 2ULL +#define SPA_VERSION_3 3ULL +#define SPA_VERSION_4 4ULL +#define SPA_VERSION_5 5ULL +#define SPA_VERSION_6 6ULL +#define SPA_VERSION_7 7ULL +#define SPA_VERSION_8 8ULL +#define SPA_VERSION_9 9ULL +#define SPA_VERSION_10 10ULL +#define SPA_VERSION_11 11ULL +#define SPA_VERSION_12 12ULL +#define SPA_VERSION_13 13ULL +#define SPA_VERSION_14 14ULL +#define SPA_VERSION_15 15ULL +#define SPA_VERSION_16 16ULL +#define SPA_VERSION_17 17ULL +#define SPA_VERSION_18 18ULL +#define SPA_VERSION_19 19ULL +#define SPA_VERSION_20 20ULL +#define SPA_VERSION_21 21ULL +#define SPA_VERSION_22 22ULL +#define SPA_VERSION_23 23ULL +#define SPA_VERSION_24 24ULL +#define SPA_VERSION_25 25ULL +#define SPA_VERSION_26 26ULL +#define SPA_VERSION_27 27ULL +#define SPA_VERSION_28 28ULL +#define SPA_VERSION_5000 5000ULL + +/* + * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk + * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, + * and do the appropriate changes. Also bump the version number in + * usr/src/grub/capability. + */ +#define SPA_VERSION SPA_VERSION_5000 +#define SPA_VERSION_STRING "5000" + +/* + * Symbolic names for the changes that caused a SPA_VERSION switch. + * Used in the code when checking for presence or absence of a feature. + * Feel free to define multiple symbolic names for each version if there + * were multiple changes to on-disk structures during that version. + * + * NOTE: When checking the current SPA_VERSION in your code, be sure + * to use spa_version() since it reports the version of the + * last synced uberblock. Checking the in-flight version can + * be dangerous in some cases. 
+ */ +#define SPA_VERSION_INITIAL SPA_VERSION_1 +#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 +#define SPA_VERSION_SPARES SPA_VERSION_3 +#define SPA_VERSION_RAID6 SPA_VERSION_3 +#define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3 +#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 +#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 +#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 +#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 +#define SPA_VERSION_BOOTFS SPA_VERSION_6 +#define SPA_VERSION_SLOGS SPA_VERSION_7 +#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 +#define SPA_VERSION_FUID SPA_VERSION_9 +#define SPA_VERSION_REFRESERVATION SPA_VERSION_9 +#define SPA_VERSION_REFQUOTA SPA_VERSION_9 +#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 +#define SPA_VERSION_L2CACHE SPA_VERSION_10 +#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 +#define SPA_VERSION_ORIGIN SPA_VERSION_11 +#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 +#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 +#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 +#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 +#define SPA_VERSION_USERSPACE SPA_VERSION_15 +#define SPA_VERSION_STMF_PROP SPA_VERSION_16 +#define SPA_VERSION_RAIDZ3 SPA_VERSION_17 +#define SPA_VERSION_USERREFS SPA_VERSION_18 +#define SPA_VERSION_HOLES SPA_VERSION_19 +#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 +#define SPA_VERSION_DEDUP SPA_VERSION_21 +#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 +#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 +#define SPA_VERSION_SA SPA_VERSION_24 +#define SPA_VERSION_SCAN SPA_VERSION_25 +#define SPA_VERSION_DIR_CLONES SPA_VERSION_26 +#define SPA_VERSION_DEADLISTS SPA_VERSION_26 +#define SPA_VERSION_FAST_SNAP SPA_VERSION_27 +#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 +#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 +#define SPA_VERSION_FEATURES SPA_VERSION_5000 + +#define SPA_VERSION_IS_SUPPORTED(v) \ + (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ + ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) + +/* + * The following are configuration names used in the nvlist describing a pool's + * configuration. 
+ */ +#define ZPOOL_CONFIG_VERSION "version" +#define ZPOOL_CONFIG_POOL_NAME "name" +#define ZPOOL_CONFIG_POOL_STATE "state" +#define ZPOOL_CONFIG_POOL_TXG "txg" +#define ZPOOL_CONFIG_POOL_GUID "pool_guid" +#define ZPOOL_CONFIG_CREATE_TXG "create_txg" +#define ZPOOL_CONFIG_TOP_GUID "top_guid" +#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" +#define ZPOOL_CONFIG_TYPE "type" +#define ZPOOL_CONFIG_CHILDREN "children" +#define ZPOOL_CONFIG_ID "id" +#define ZPOOL_CONFIG_GUID "guid" +#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" +#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" +#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" +#define ZPOOL_CONFIG_PATH "path" +#define ZPOOL_CONFIG_DEVID "devid" +#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" +#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" +#define ZPOOL_CONFIG_ASHIFT "ashift" +#define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_DTL "DTL" +#define ZPOOL_CONFIG_STATS "stats" +#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" +#define ZPOOL_CONFIG_ERRCOUNT "error_count" +#define ZPOOL_CONFIG_NOT_PRESENT "not_present" +#define ZPOOL_CONFIG_SPARES "spares" +#define ZPOOL_CONFIG_IS_SPARE "is_spare" +#define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_HOSTID "hostid" +#define ZPOOL_CONFIG_HOSTNAME "hostname" +#define ZPOOL_CONFIG_IS_LOG "is_log" +#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ +#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" +#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" + +/* + * The persistent vdev state is stored as separate values rather than a single + * 'vdev_state' entry. This is because a device can be in multiple states, such + * as offline and degraded. + */ +#define ZPOOL_CONFIG_OFFLINE "offline" +#define ZPOOL_CONFIG_FAULTED "faulted" +#define ZPOOL_CONFIG_DEGRADED "degraded" +#define ZPOOL_CONFIG_REMOVED "removed" +#define ZPOOL_CONFIG_FRU "fru" +#define ZPOOL_CONFIG_AUX_STATE "aux_state" + +#define VDEV_TYPE_ROOT "root" +#define VDEV_TYPE_MIRROR "mirror" +#define VDEV_TYPE_REPLACING "replacing" +#define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DISK "disk" +#define VDEV_TYPE_FILE "file" +#define VDEV_TYPE_MISSING "missing" +#define VDEV_TYPE_HOLE "hole" +#define VDEV_TYPE_SPARE "spare" +#define VDEV_TYPE_LOG "log" +#define VDEV_TYPE_L2CACHE "l2cache" +#define VDEV_TYPE_INDIRECT "indirect" + +/* + * This is needed in userland to report the minimum necessary device size. + */ +#define SPA_MINDEVSIZE (64ULL << 20) + +/* + * The location of the pool configuration repository, shared between kernel and + * userland. + */ +#define ZPOOL_CACHE "/boot/zfs/zpool.cache" + +/* + * vdev states are ordered from least to most healthy. + * A vdev that's CANT_OPEN or below is considered unusable. + */ +typedef enum vdev_state { + VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ + VDEV_STATE_CLOSED, /* Not currently open */ + VDEV_STATE_OFFLINE, /* Not allowed to open */ + VDEV_STATE_REMOVED, /* Explicitly removed from system */ + VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ + VDEV_STATE_FAULTED, /* External request to fault device */ + VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ + VDEV_STATE_HEALTHY /* Presumed good */ +} vdev_state_t; + +/* + * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field + * of the vdev stats structure uses these constants to distinguish why. 
+ */ +typedef enum vdev_aux { + VDEV_AUX_NONE, /* no error */ + VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ + VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ + VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ + VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ + VDEV_AUX_TOO_SMALL, /* vdev size is too small */ + VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ + VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ + VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ + VDEV_AUX_SPARED /* hot spare used in another pool */ +} vdev_aux_t; + +/* + * pool state. The following states are written to disk as part of the normal + * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are + * software abstractions used at various levels to communicate pool state. + */ +typedef enum pool_state { + POOL_STATE_ACTIVE = 0, /* In active use */ + POOL_STATE_EXPORTED, /* Explicitly exported */ + POOL_STATE_DESTROYED, /* Explicitly destroyed */ + POOL_STATE_SPARE, /* Reserved for hot spare use */ + POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ + POOL_STATE_UNAVAIL, /* Internal libzfs state */ + POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ +} pool_state_t; + +/* + * The uberblock version is incremented whenever an incompatible on-disk + * format change is made to the SPA, DMU, or ZAP. + * + * Note: the first two fields should never be moved. When a storage pool + * is opened, the uberblock must be read off the disk before the version + * can be checked. If the ub_version field is moved, we may not detect + * version mismatch. If the ub_magic field is moved, applications that + * expect the magic number in the first word won't work. + */ +#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ +#define UBERBLOCK_SHIFT 10 /* up to 1K */ + +#define MMP_MAGIC 0xa11cea11 /* all-see-all */ + +#define MMP_INTERVAL_VALID_BIT 0x01 +#define MMP_SEQ_VALID_BIT 0x02 +#define MMP_FAIL_INT_VALID_BIT 0x04 + +#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ + ubp->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_INTERVAL_VALID_BIT)) +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_SEQ_VALID_BIT)) +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_FAIL_INT_VALID_BIT)) + +#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ + >> 8) +#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ + >> 32) +#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ + >> 48) + +typedef struct uberblock { + uint64_t ub_magic; /* UBERBLOCK_MAGIC */ + uint64_t ub_version; /* SPA_VERSION */ + uint64_t ub_txg; /* txg of last sync */ + uint64_t ub_guid_sum; /* sum of all vdev guids */ + uint64_t ub_timestamp; /* UTC time of last sync */ + blkptr_t ub_rootbp; /* MOS objset_phys_t */ + /* highest SPA_VERSION supported by software that wrote this txg */ + uint64_t ub_software_version; + /* Maybe missing in uberblocks we read, but always written */ + uint64_t ub_mmp_magic; + /* + * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off. + * Otherwise, nanosec since last MMP write. + */ + uint64_t ub_mmp_delay; + + /* + * The ub_mmp_config contains the multihost write interval, multihost + * fail intervals, sequence number for sub-second granularity, and + * valid bit mask. 
This layout is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * This allows a write_interval of (2^24/1000)s, over 4.5 hours + * + * VALID Bits: + * - 0x01 - Write Interval (ms) + * - 0x02 - Sequence number exists + * - 0x04 - Fail Intervals + * - 0xf8 - Reserved + */ + uint64_t ub_mmp_config; + + /* + * ub_checkpoint_txg indicates two things about the current uberblock: + * + * 1] If it is not zero then this uberblock is a checkpoint. If it is + * zero, then this uberblock is not a checkpoint. + * + * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is + * the ub_txg that the uberblock had at the time we moved it to + * the MOS config. + * + * The field is set when we checkpoint the uberblock and continues to + * hold that value even after we've rewound (unlike the ub_txg that + * is reset to a higher value). + * + * Besides checks used to determine whether we are reopening the + * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], + * the value of the field is used to determine which ZIL blocks have + * been allocated according to the ms_sm when we are rewinding to a + * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then + * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. + */ + uint64_t ub_checkpoint_txg; +} uberblock_t; + +/* + * Flags. + */ +#define DNODE_MUST_BE_ALLOCATED 1 +#define DNODE_MUST_BE_FREE 2 + +/* + * Fixed constants. + */ +#define DNODE_SHIFT 9 /* 512 bytes */ +#define DN_MIN_INDBLKSHIFT 12 /* 4k */ +#define DN_MAX_INDBLKSHIFT 17 /* 128k */ +#define DNODE_BLOCK_SHIFT 14 /* 16k */ +#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ +#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ +#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ + +/* + * Derived constants. + */ +#define DNODE_MIN_SIZE (1 << DNODE_SHIFT) +#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT) +#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT) +#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT) +#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT) +#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \ + (1 << SPA_BLKPTRSHIFT)) +#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT) +#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE)) +#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> \ + SPA_BLKPTRSHIFT) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1) + +#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) +#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) +#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) + +/* The +2 here is a cheesy way to round up */ +#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ + (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) + +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ + (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) + +#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ + (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) + +#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) + +/* Is dn_used in bytes? 
if not, it's in multiples of SPA_MINBLOCKSIZE */ +#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) + +/* Does dnode have a SA spill blkptr in bonus? */ +#define DNODE_FLAG_SPILL_BLKPTR (1<<2) + +typedef struct dnode_phys { + uint8_t dn_type; /* dmu_object_type_t */ + uint8_t dn_indblkshift; /* ln2(indirect block size) */ + uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ + uint8_t dn_nblkptr; /* length of dn_blkptr */ + uint8_t dn_bonustype; /* type of data in bonus buffer */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_flags; /* DNODE_FLAG_* */ + uint16_t dn_datablkszsec; /* data block size in 512b sectors */ + uint16_t dn_bonuslen; /* length of dn_bonus */ + uint8_t dn_extra_slots; /* # of subsequent slots consumed */ + uint8_t dn_pad2[3]; + + /* accounting is protected by dn_dirty_mtx */ + uint64_t dn_maxblkid; /* largest allocated block ID */ + uint64_t dn_used; /* bytes (or sectors) of disk space */ + + uint64_t dn_pad3[4]; + + /* + * The tail region is 448 bytes for a 512 byte dnode, and + * correspondingly larger for larger dnode sizes. The spill + * block pointer, when present, is always at the end of the tail + * region. There are three ways this space may be used, using + * a 512 byte dnode for this diagram: + * + * 0 64 128 192 256 320 384 448 (offset) + * +---------------+---------------+---------------+-------+ + * | dn_blkptr[0] | dn_blkptr[1] | dn_blkptr[2] | / | + * +---------------+---------------+---------------+-------+ + * | dn_blkptr[0] | dn_bonus[0..319] | + * +---------------+-----------------------+---------------+ + * | dn_blkptr[0] | dn_bonus[0..191] | dn_spill | + * +---------------+-----------------------+---------------+ + */ + union { + blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)]; + struct { + blkptr_t __dn_ignore1; + uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN]; + }; + struct { + blkptr_t __dn_ignore2; + uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN - + sizeof (blkptr_t)]; + blkptr_t dn_spill; + }; + }; +} dnode_phys_t; + +#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \ + (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT)) + +typedef enum dmu_object_byteswap { + DMU_BSWAP_UINT8, + DMU_BSWAP_UINT16, + DMU_BSWAP_UINT32, + DMU_BSWAP_UINT64, + DMU_BSWAP_ZAP, + DMU_BSWAP_DNODE, + DMU_BSWAP_OBJSET, + DMU_BSWAP_ZNODE, + DMU_BSWAP_OLDACL, + DMU_BSWAP_ACL, + /* + * Allocating a new byteswap type number makes the on-disk format + * incompatible with any other format that uses the same number. + * + * Data can usually be structured to work with one of the + * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. + */ + DMU_BSWAP_NUMFUNCS +} dmu_object_byteswap_t; + +#define DMU_OT_NEWTYPE 0x80 +#define DMU_OT_METADATA 0x40 +#define DMU_OT_BYTESWAP_MASK 0x3f + +/* + * Defines a uint8_t object type. Object types specify if the data + * in the object is metadata (boolean) and how to byteswap the data + * (dmu_object_byteswap_t). + */ +#define DMU_OT(byteswap, metadata) \ + (DMU_OT_NEWTYPE | \ + ((metadata) ? 
DMU_OT_METADATA : 0) | \
+	((byteswap) & DMU_OT_BYTESWAP_MASK))
+
+typedef enum dmu_object_type {
+	DMU_OT_NONE,
+	/* general: */
+	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
+	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
+	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
+	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
+	DMU_OT_BPOBJ,			/* UINT64 */
+	DMU_OT_BPOBJ_HDR,		/* UINT64 */
+	/* spa: */
+	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
+	DMU_OT_SPACE_MAP,		/* UINT64 */
+	/* zil: */
+	DMU_OT_INTENT_LOG,		/* UINT64 */
+	/* dmu: */
+	DMU_OT_DNODE,			/* DNODE */
+	DMU_OT_OBJSET,			/* OBJSET */
+	/* dsl: */
+	DMU_OT_DSL_DIR,			/* UINT64 */
+	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
+	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
+	DMU_OT_DSL_PROPS,		/* ZAP */
+	DMU_OT_DSL_DATASET,		/* UINT64 */
+	/* zpl: */
+	DMU_OT_ZNODE,			/* ZNODE */
+	DMU_OT_OLDACL,			/* Old ACL */
+	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
+	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
+	DMU_OT_MASTER_NODE,		/* ZAP */
+	DMU_OT_UNLINKED_SET,		/* ZAP */
+	/* zvol: */
+	DMU_OT_ZVOL,			/* UINT8 */
+	DMU_OT_ZVOL_PROP,		/* ZAP */
+	/* other; for testing only! */
+	DMU_OT_PLAIN_OTHER,		/* UINT8 */
+	DMU_OT_UINT64_OTHER,		/* UINT64 */
+	DMU_OT_ZAP_OTHER,		/* ZAP */
+	/* new object types: */
+	DMU_OT_ERROR_LOG,		/* ZAP */
+	DMU_OT_SPA_HISTORY,		/* UINT8 */
+	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
+	DMU_OT_POOL_PROPS,		/* ZAP */
+	DMU_OT_DSL_PERMS,		/* ZAP */
+	DMU_OT_ACL,			/* ACL */
+	DMU_OT_SYSACL,			/* SYSACL */
+	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
+	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
+	DMU_OT_NEXT_CLONES,		/* ZAP */
+	DMU_OT_SCAN_QUEUE,		/* ZAP */
+	DMU_OT_USERGROUP_USED,		/* ZAP */
+	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
+	DMU_OT_USERREFS,		/* ZAP */
+	DMU_OT_DDT_ZAP,			/* ZAP */
+	DMU_OT_DDT_STATS,		/* ZAP */
+	DMU_OT_SA,			/* System attr */
+	DMU_OT_SA_MASTER_NODE,		/* ZAP */
+	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
+	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
+	DMU_OT_SCAN_XLATE,		/* ZAP */
+	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
+	DMU_OT_DEADLIST,		/* ZAP */
+	DMU_OT_DEADLIST_HDR,		/* UINT64 */
+	DMU_OT_DSL_CLONES,		/* ZAP */
+	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
+	DMU_OT_NUMTYPES,
+
+	/*
+	 * Names for valid types declared with DMU_OT().
+	 */
+	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
+	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
+	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
+	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
+	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
+	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
+	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
+	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
+	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
+	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE)
+} dmu_object_type_t;
+
+typedef enum dmu_objset_type {
+	DMU_OST_NONE,
+	DMU_OST_META,
+	DMU_OST_ZFS,
+	DMU_OST_ZVOL,
+	DMU_OST_OTHER,			/* For testing only! */
+	DMU_OST_ANY,			/* Be careful! */
+	DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+#define	ZAP_MAXVALUELEN	(1024 * 8)
+
+/*
+ * header for all bonus and spill buffers.
+ * The header has a fixed portion with a variable number
+ * of "lengths" depending on the number of variable sized
+ * attributes which are determined by the "layout number"
+ */
+
+#define	SA_MAGIC	0x2F505A /* ZFS SA */
+typedef struct sa_hdr_phys {
+	uint32_t sa_magic;
+	uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
+	uint16_t sa_lengths[1];	/* optional sizes for variable length attrs */
+	/* ...
Data follows the lengths. */ +} sa_hdr_phys_t; + +/* + * sa_hdr_phys -> sa_layout_info + * + * 16 10 0 + * +--------+-------+ + * | hdrsz |layout | + * +--------+-------+ + * + * Bits 0-10 are the layout number + * Bits 11-16 are the size of the header. + * The hdrsize is the number * 8 + * + * For example. + * hdrsz of 1 ==> 8 byte header + * 2 ==> 16 byte header + * + */ + +#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0) +#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ +{ \ + BF32_SET_SB(x, 10, 6, 3, 0, size); \ + BF32_SET(x, 0, 10, num); \ +} + +#define SA_ATTR_BSWAP(x) BF32_GET(x, 16, 8) +#define SA_ATTR_LENGTH(x) BF32_GET(x, 24, 16) +#define SA_ATTR_NUM(x) BF32_GET(x, 0, 16) +#define SA_ATTR_ENCODE(x, attr, length, bswap) \ +{ \ + BF64_SET(x, 24, 16, length); \ + BF64_SET(x, 16, 8, bswap); \ + BF64_SET(x, 0, 16, attr); \ +} + +#define SA_MODE_OFFSET 0 +#define SA_SIZE_OFFSET 8 +#define SA_GEN_OFFSET 16 +#define SA_UID_OFFSET 24 +#define SA_GID_OFFSET 32 +#define SA_PARENT_OFFSET 40 +#define SA_FLAGS_OFFSET 48 +#define SA_ATIME_OFFSET 56 +#define SA_MTIME_OFFSET 72 +#define SA_CTIME_OFFSET 88 +#define SA_CRTIME_OFFSET 104 +#define SA_LINKS_OFFSET 120 +//#define SA_PROJID_OFFSET 128 + +#define SA_REGISTRY "REGISTRY" +#define SA_LAYOUTS "LAYOUTS" + +typedef enum sa_bswap_type { + SA_UINT64_ARRAY, + SA_UINT32_ARRAY, + SA_UINT16_ARRAY, + SA_UINT8_ARRAY, + SA_ACL, +} sa_bswap_type_t; + +typedef uint16_t sa_attr_type_t; + +#define ZIO_OBJSET_MAC_LEN 32 + +/* + * Intent log header - this on disk structure holds fields to manage + * the log. All fields are 64 bit to easily handle cross architectures. + */ +typedef struct zil_header { + uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ + uint64_t zh_replay_seq; /* highest replayed sequence number */ + blkptr_t zh_log; /* log chain */ + uint64_t zh_claim_seq; /* highest claimed sequence number */ + uint64_t zh_pad[5]; +} zil_header_t; + +#define OBJSET_PHYS_SIZE_V2 2048 +#define OBJSET_PHYS_SIZE_V3 4096 + +typedef struct objset_phys { + dnode_phys_t os_meta_dnode; + zil_header_t os_zil_header; + uint64_t os_type; + uint64_t os_flags; + uint8_t os_portable_mac[ZIO_OBJSET_MAC_LEN]; + uint8_t os_local_mac[ZIO_OBJSET_MAC_LEN]; + char os_pad0[OBJSET_PHYS_SIZE_V2 - sizeof (dnode_phys_t)*3 - + sizeof (zil_header_t) - sizeof (uint64_t)*2 - + 2*ZIO_OBJSET_MAC_LEN]; + dnode_phys_t os_userused_dnode; + dnode_phys_t os_groupused_dnode; + dnode_phys_t os_projectused_dnode; + char os_pad1[OBJSET_PHYS_SIZE_V3 - OBJSET_PHYS_SIZE_V2 - + sizeof (dnode_phys_t)]; +} objset_phys_t; + +#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t)) +#define SPACE_MAP_HISTOGRAM_SIZE 32 + +typedef struct space_map_phys { + /* object number: not needed but kept for backwards compatibility */ + uint64_t smp_object; + + /* length of the object in bytes */ + uint64_t smp_length; + + /* space allocated from the map */ + int64_t smp_alloc; + + /* reserved */ + uint64_t smp_pad[5]; + + /* + * The smp_histogram maintains a histogram of free regions. Each + * bucket, smp_histogram[i], contains the number of free regions + * whose size is: + * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) + * + * Note that, if log space map feature is enabled, histograms of + * space maps that belong to metaslabs will take into account any + * unflushed changes for their metaslabs, even though the actual + * space map doesn't have entries for these changes. 
+ */ + uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; +} space_map_phys_t; + +typedef enum { + SM_ALLOC, + SM_FREE +} maptype_t; + +typedef struct space_map_entry { + maptype_t sme_type; + uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ + uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ + uint64_t sme_run; /* max is 2^36; units of sm_shift */ + + /* + * The following fields are not part of the actual space map entry + * on-disk and they are populated with the values from the debug + * entry most recently visited starting from the beginning to the + * end of the space map. + */ + uint64_t sme_txg; + uint64_t sme_sync_pass; +} space_map_entry_t; + +/* one-word entry constants */ +#define SM_DEBUG_PREFIX 2 +#define SM_OFFSET_BITS 47 +#define SM_RUN_BITS 15 + +/* two-word entry constants */ +#define SM2_PREFIX 3 +#define SM2_OFFSET_BITS 63 +#define SM2_RUN_BITS 36 + +#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) +#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) +#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) +#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) +#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) +#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) + +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) +#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) + +#define SM2_RUN_DECODE(x) (BF64_DECODE(x, 24, SM2_RUN_BITS) + 1) +#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 24, SM2_RUN_BITS) +#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, 24) +#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, 24) +#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) +#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) +#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) +#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS) +#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) +#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) + +typedef struct dsl_dir_phys { + uint64_t dd_creation_time; /* not actually used */ + uint64_t dd_head_dataset_obj; + uint64_t dd_parent_obj; + uint64_t dd_clone_parent_obj; + uint64_t dd_child_dir_zapobj; + /* + * how much space our children are accounting for; for leaf + * datasets, == physical space used by fs + snaps + */ + uint64_t dd_used_bytes; + uint64_t dd_compressed_bytes; + uint64_t dd_uncompressed_bytes; + /* Administrative quota setting */ + uint64_t dd_quota; + /* Administrative reservation setting */ + uint64_t dd_reserved; + uint64_t dd_props_zapobj; + uint64_t dd_pad[7]; + uint64_t dd_clones; + uint64_t dd_pad1[13]; /* pad out to 256 bytes for good measure */ +} dsl_dir_phys_t; + +typedef struct dsl_dataset_phys { + uint64_t ds_dir_obj; + uint64_t ds_prev_snap_obj; + uint64_t ds_prev_snap_txg; + uint64_t ds_next_snap_obj; + uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */ + uint64_t ds_num_children; /* clone/snap children; ==0 for head */ + uint64_t ds_creation_time; /* seconds since 1970 */ + uint64_t ds_creation_txg; + uint64_t ds_deadlist_obj; + uint64_t ds_used_bytes; + uint64_t ds_compressed_bytes; + uint64_t 
ds_uncompressed_bytes;
+	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
+	/*
+	 * The ds_fsid_guid is a 56-bit ID that can change to avoid
+	 * collisions.  The ds_guid is a 64-bit ID that will never
+	 * change, so there is a small probability that it will collide.
+	 */
+	uint64_t ds_fsid_guid;
+	uint64_t ds_guid;
+	uint64_t ds_flags;
+	blkptr_t ds_bp;
+	uint64_t ds_pad[8];	/* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_deadlist_phys {
+	uint64_t dl_used;
+	uint64_t dl_comp;
+	uint64_t dl_uncomp;
+	uint64_t dl_pad[37];	/* pad out to 320b for future expansion */
+} dsl_deadlist_phys_t;
+
+#define	BPOBJ_SIZE_V2	(6 * sizeof (uint64_t))
+
+typedef struct bpobj_phys {
+	uint64_t	bpo_num_blkptrs;
+	uint64_t	bpo_bytes;
+	uint64_t	bpo_comp;
+	uint64_t	bpo_uncomp;
+	uint64_t	bpo_subobjs;
+	uint64_t	bpo_num_subobjs;
+	uint64_t	bpo_num_freed;
+} bpobj_phys_t;
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define	DMU_POOL_DIRECTORY_OBJECT	1
+#define	DMU_POOL_CONFIG			"config"
+#define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
+#define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
+#define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
+#define	DMU_POOL_ROOT_DATASET		"root_dataset"
+#define	DMU_POOL_SYNC_BPLIST		"sync_bplist"
+#define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
+#define	DMU_POOL_ERRLOG_LAST		"errlog_last"
+#define	DMU_POOL_SPARES			"spares"
+#define	DMU_POOL_DEFLATE		"deflate"
+#define	DMU_POOL_HISTORY		"history"
+#define	DMU_POOL_PROPS			"pool_props"
+#define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
+#define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
+#define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
+#define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
+#define	DMU_POOL_CHECKSUM_SALT		"org.illumos:checksum_salt"
+#define	DMU_POOL_REMOVING		"com.delphix:removing"
+#define	DMU_POOL_OBSOLETE_BPOBJ		"com.delphix:obsolete_bpobj"
+#define	DMU_POOL_CONDENSING_INDIRECT	"com.delphix:condensing_indirect"
+#define	DMU_POOL_ZPOOL_CHECKPOINT	"com.delphix:zpool_checkpoint"
+
+#define	ZAP_MAGIC 0x2F52AB2ABULL
+
+#define	FZAP_BLOCK_SHIFT(zap)	((zap)->zap_block_shift)
+
+#define	ZAP_MAXCD		(uint32_t)(-1)
+#define	ZAP_HASHBITS		28
+#define	MZAP_ENT_LEN		64
+#define	MZAP_ENT_MAX	\
+	((MZAP_MAX_BLKSZ - sizeof(mzap_phys_t)) / sizeof(mzap_ent_phys_t) + 1)
+#define	MZAP_NAME_LEN		(MZAP_ENT_LEN - 8 - 4 - 2)
+#define	MZAP_MAX_BLKSZ		SPA_OLDMAXBLOCKSIZE
+
+typedef struct mzap_ent_phys {
+	uint64_t mze_value;
+	uint32_t mze_cd;
+	uint16_t mze_pad;	/* in case we want to chain them someday */
+	char mze_name[MZAP_NAME_LEN];
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+	uint64_t mz_block_type;	/* ZBT_MICRO */
+	uint64_t mz_salt;
+	uint64_t mz_normflags;
+	uint64_t mz_pad[5];
+	mzap_ent_phys_t mz_chunk[1];
+	/* actually variable size depending on block size */
+} mzap_phys_t;
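+
+/*
+ * Reviewer illustration, not part of the original header: a micro-ZAP is
+ * a single mzap_phys_t whose mz_chunk[] array runs to the end of the
+ * block, so one 64-byte chunk is built into the 128-byte header itself.
+ * A hypothetical helper showing the arithmetic behind MZAP_ENT_MAX:
+ */
+#if 0 /* example only */
+static size_t
+mzap_ents_per_block(size_t blksz)
+{
+	/* e.g. blksz == 4096 gives (4096 - 128) / 64 + 1 == 63 entries */
+	return ((blksz - sizeof(mzap_phys_t)) / sizeof(mzap_ent_phys_t) + 1);
+}
+#endif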
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ *	[zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ *	[zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+#define	ZBT_LEAF		((1ULL << 63) + 0)
+#define	ZBT_HEADER		((1ULL << 63) + 1)
+#define	ZBT_MICRO		((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/*
+ * the embedded pointer table takes up half a block:
+ * block size / entry size (2^3) / 2
+ */
+#define	ZAP_EMBEDDED_PTRTBL_SHIFT(zap)	(FZAP_BLOCK_SHIFT(zap) - 3 - 1)
+
+/*
+ * The embedded pointer table starts half-way through the block.  Since
+ * the pointer table itself is half the block, it starts at (64-bit)
+ * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
+ */
+#define	ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
+	((uint64_t *)(zap)->zap_phys) \
+	[(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
+
+#define	ZAP_HASH_IDX(hash, n)	(((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+	uint64_t zap_block_type;	/* ZBT_HEADER */
+	uint64_t zap_magic;		/* ZAP_MAGIC */
+
+	struct zap_table_phys {
+		uint64_t zt_blk;	/* starting block number */
+		uint64_t zt_numblks;	/* number of blocks */
+		uint64_t zt_shift;	/* bits to index it */
+		uint64_t zt_nextblk;	/* next (larger) copy start block */
+		uint64_t zt_blks_copied; /* number source blocks copied */
+	} zap_ptrtbl;
+
+	uint64_t zap_freeblk;		/* the next free block */
+	uint64_t zap_num_leafs;		/* number of leafs */
+	uint64_t zap_num_entries;	/* number of entries */
+	uint64_t zap_salt;		/* salt to stir into hash function */
+	uint64_t zap_normflags;		/* flags for u8_textprep_str() */
+	uint64_t zap_flags;		/* zap_flags_t */
+	/*
+	 * This structure is followed by padding, and then the embedded
+	 * pointer table.  The embedded pointer table takes up second
+	 * half of the block.  It is accessed using the
+	 * ZAP_EMBEDDED_PTRTBL_ENT() macro.
+	 */
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+struct spa;
+typedef struct fat_zap {
+	int zap_block_shift;		/* block size shift */
+	zap_phys_t *zap_phys;
+	const struct spa *zap_spa;
+	const dnode_phys_t *zap_dnode;
+} fat_zap_t;
+
+#define	ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+#define	ZAP_LEAF_CHUNKSIZE 24
+
+/*
+ * The amount of space available for chunks is:
+ * block size (1<<l->l_bs) - hash entry size (2) * number of hash
+ * entries - header space (2*chunksize)
+ */
+#define	ZAP_LEAF_NUMCHUNKS(l) \
+	(((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
+	ZAP_LEAF_CHUNKSIZE - 2)
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2)
+ */
+#define	ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+#define	ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
+	(((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * Low water mark: when there are only this many chunks free, start
+ * growing the ptrtbl.  Ideally, this should be larger than a
+ * "reasonably-sized" entry.  20 chunks is more than enough for the
+ * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
+ * while still being only around 3% for 16k blocks.
+ */
+#define	ZAP_LEAF_LOW_WATER (20)
+
+/*
+ * The leaf hash table has block size / 2^5 (32) number of entries,
+ * which should be more than enough for the maximum number of entries,
+ * which is less than block size / CHUNKSIZE (24) / minimum number of
+ * chunks per entry (3).
+ */
+#define	ZAP_LEAF_HASH_SHIFT(l)		((l)->l_bs - 5)
+#define	ZAP_LEAF_HASH_NUMENTRIES(l)	(1 << ZAP_LEAF_HASH_SHIFT(l))
+
+/*
+ * The chunks start immediately after the hash table.  The end of the
+ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
+ * chunk_t.
+ */
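+
+/*
+ * Reviewer illustration, not part of the original header: for a
+ * hypothetical 16K leaf (l_bs == 14), the hash table holds
+ * 1 << (14 - 5) == 512 16-bit entries (1K of the block), leaving
+ * (16384 - 1024) / 24 - 2 == 638 24-byte chunks.
+ */
+#if 0 /* example only */
+static int
+zap_leaf_numchunks_example(void)
+{
+	struct { int l_bs; } l = { 14 };	/* stands in for zap_leaf_t */
+
+	return (ZAP_LEAF_NUMCHUNKS(&l));	/* 638 */
+}
+#endif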
+#define	ZAP_LEAF_CHUNK(l, idx) \
+	((zap_leaf_chunk_t *)(void *) \
+	((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+#define	ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
+
+#define	ZAP_LEAF_HASH(l, h) \
+	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+	((h) >> \
+	(64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len)))
+#define	ZAP_LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[ZAP_LEAF_HASH(l, h)])
+
+typedef enum zap_chunk_type {
+	ZAP_CHUNK_FREE = 253,
+	ZAP_CHUNK_ENTRY = 252,
+	ZAP_CHUNK_ARRAY = 251,
+	ZAP_CHUNK_TYPE_MAX = 250
+} zap_chunk_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+	struct zap_leaf_header {
+		uint64_t lh_block_type;		/* ZBT_LEAF */
+		uint64_t lh_pad1;
+		uint64_t lh_prefix;		/* hash prefix of this leaf */
+		uint32_t lh_magic;		/* ZAP_LEAF_MAGIC */
+		uint16_t lh_nfree;		/* number free chunks */
+		uint16_t lh_nentries;		/* number of entries */
+		uint16_t lh_prefix_len;		/* num bits used to id this */
+
+/* above is accessible to zap, below is zap_leaf private */
+
+		uint16_t lh_freelist;		/* chunk head of free list */
+		uint8_t lh_pad2[12];
+	} l_hdr; /* 2 24-byte chunks */
+
+	/*
+	 * The header is followed by a hash table with
+	 * ZAP_LEAF_HASH_NUMENTRIES(zap) entries.  The hash table is
+	 * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
+	 * zap_leaf_chunk structures.  These structures are accessed
+	 * with the ZAP_LEAF_CHUNK() macro.
+	 */
+
+	uint16_t l_hash[1];
+} zap_leaf_phys_t;
+
+typedef union zap_leaf_chunk {
+	struct zap_leaf_entry {
+		uint8_t le_type;		/* always ZAP_CHUNK_ENTRY */
+		uint8_t le_value_intlen;	/* size of ints */
+		uint16_t le_next;		/* next entry in hash chain */
+		uint16_t le_name_chunk;		/* first chunk of the name */
+		uint16_t le_name_numints;	/* bytes in name, incl null */
+		uint16_t le_value_chunk;	/* first chunk of the value */
+		uint16_t le_value_numints;	/* value length in ints */
+		uint32_t le_cd;			/* collision differentiator */
+		uint64_t le_hash;		/* hash value of the name */
+	} l_entry;
+	struct zap_leaf_array {
+		uint8_t la_type;		/* always ZAP_CHUNK_ARRAY */
+		uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+		uint16_t la_next;		/* next blk or CHAIN_END */
+	} l_array;
+	struct zap_leaf_free {
+		uint8_t lf_type;		/* always ZAP_CHUNK_FREE */
+		uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+		uint16_t lf_next;	/* next in free list, or CHAIN_END */
+	} l_free;
+} zap_leaf_chunk_t;
+
+typedef struct zap_leaf {
+	int l_bs;			/* block size shift */
+	zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+#define	ZAP_MAXNAMELEN 256
+
+#define	ACE_READ_DATA		0x00000001	/* file: read data */
+#define	ACE_LIST_DIRECTORY	0x00000001	/* dir: list files */
+#define	ACE_WRITE_DATA		0x00000002	/* file: write data */
+#define	ACE_ADD_FILE		0x00000002	/* dir: create file */
+#define	ACE_APPEND_DATA		0x00000004	/* file: append data */
+#define	ACE_ADD_SUBDIRECTORY	0x00000004	/* dir: create subdir */
+#define	ACE_READ_NAMED_ATTRS	0x00000008	/* FILE_READ_EA */
+#define	ACE_WRITE_NAMED_ATTRS	0x00000010	/* FILE_WRITE_EA */
+#define	ACE_EXECUTE		0x00000020	/* file: execute */
+#define	ACE_TRAVERSE		0x00000020	/* dir: lookup name */
+#define	ACE_DELETE_CHILD	0x00000040	/* dir: unlink child */
+#define	ACE_READ_ATTRIBUTES	0x00000080	/* (all) stat, etc. */
+#define	ACE_WRITE_ATTRIBUTES	0x00000100	/* (all) utimes, etc.
*/ +#define ACE_DELETE 0x00010000 /* (all) unlink self */ +#define ACE_READ_ACL 0x00020000 /* (all) getsecattr */ +#define ACE_WRITE_ACL 0x00040000 /* (all) setsecattr */ +#define ACE_WRITE_OWNER 0x00080000 /* (all) chown */ +#define ACE_SYNCHRONIZE 0x00100000 /* (all) */ + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +typedef struct zfs_ace_hdr { + uint16_t z_type; + uint16_t z_flags; + uint32_t z_access_mask; +} zfs_ace_hdr_t; + +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ +#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ +#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ +#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ +#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ + +#define ZFS_READONLY 0x0000000100000000ull +#define ZFS_HIDDEN 0x0000000200000000ull +#define ZFS_SYSTEM 0x0000000400000000ull +#define ZFS_ARCHIVE 0x0000000800000000ull +#define ZFS_IMMUTABLE 0x0000001000000000ull +#define ZFS_NOUNLINK 0x0000002000000000ull +#define ZFS_APPENDONLY 0x0000004000000000ull +#define ZFS_NODUMP 0x0000008000000000ull +#define ZFS_OPAQUE 0x0000010000000000ull +#define ZFS_AV_QUARANTINED 0x0000020000000000ull +#define ZFS_AV_MODIFIED 0x0000040000000000ull +#define ZFS_REPARSE 0x0000080000000000ull +#define ZFS_OFFLINE 0x0000100000000000ull +#define ZFS_SPARSE 0x0000200000000000ull + +#define MASTER_NODE_OBJ 1 + +/* + * special attributes for master node. + */ + +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_OBJ "VERSION" +#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE" +#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS" +#define ZFS_SA_ATTRS "SA_ATTRS" + +#define ZFS_FLAG_BLOCKPERPAGE 0x1 +#define ZFS_FLAG_NOGROWBLOCKS 0x2 + +/* + * ZPL version - rev'd whenever an incompatible on-disk format change + * occurs. Independent of SPA/DMU/ZAP versioning. + */ + +#define ZPL_VERSION 1ULL + +/* + * The directory entry has the type (currently unused on Solaris) in the + * top 4 bits, and the object number in the low 48 bits. The "middle" + * 12 bits are unused. + */ +#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) +#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) +#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj) + +typedef struct ace { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... 
*/ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ +} ace_t; + +#define ACE_SLOT_CNT 6 + +typedef struct zfs_znode_acl { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_count; /* Number of ACEs */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_pad; /* pad */ + ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +} zfs_znode_acl_t; + +/* + * This is the persistent portion of the znode. It is stored + * in the "bonus buffer" of the file. Short symbolic links + * are also stored in the bonus buffer. + */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_pad[4]; /* 144 - future */ + zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we only use this space to store symbolic links. + */ +} znode_phys_t; + +/* + * In-core vdev representation. + */ +struct vdev; +struct spa; +typedef int vdev_phys_read_t(struct vdev *, void *, off_t, void *, size_t); +typedef int vdev_phys_write_t(struct vdev *, off_t, void *, size_t); +typedef int vdev_read_t(struct vdev *, const blkptr_t *, void *, off_t, size_t); + +typedef STAILQ_HEAD(vdev_list, vdev) vdev_list_t; + +typedef struct vdev_indirect_mapping_entry_phys { + /* + * Decode with DVA_MAPPING_* macros. + * Contains: + * the source offset (low 63 bits) + * the one-bit "mark", used for garbage collection (by zdb) + */ + uint64_t vimep_src; + + /* + * Note: the DVA's asize is 24 bits, and can thus store ranges + * up to 8GB. + */ + dva_t vimep_dst; +} vdev_indirect_mapping_entry_phys_t; + +#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \ + BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \ + BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#if 0 +typedef struct vdev_indirect_mapping_entry { + vdev_indirect_mapping_entry_phys_t vime_mapping; + uint32_t vime_obsolete_count; + list_node_t vime_node; +} vdev_indirect_mapping_entry_t; +#endif + +/* + * This is stored in the bonus buffer of the mapping object, see comment of + * vdev_indirect_config for more details. + */ +typedef struct vdev_indirect_mapping_phys { + uint64_t vimp_max_offset; + uint64_t vimp_bytes_mapped; + uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */ + + /* + * For each entry in the mapping object, this object contains an + * entry representing the number of bytes of that mapping entry + * that were no longer in use by the pool at the time this indirect + * vdev was last condensed. 
+ */ + uint64_t vimp_counts_object; +} vdev_indirect_mapping_phys_t; + +#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t)) + +typedef struct vdev_indirect_mapping { + uint64_t vim_object; + boolean_t vim_havecounts; + + /* vim_entries segment offset currently in memory. */ + uint64_t vim_entry_offset; + /* vim_entries segment size. */ + size_t vim_num_entries; + + /* Needed by dnode_read() */ + const void *vim_spa; + dnode_phys_t *vim_dn; + + /* + * An ordered array of mapping entries, sorted by source offset. + * Note that vim_entries is needed during a removal (and contains + * mappings that have been synced to disk so far) to handle frees + * from the removing device. + */ + vdev_indirect_mapping_entry_phys_t *vim_entries; + objset_phys_t *vim_objset; + vdev_indirect_mapping_phys_t *vim_phys; +} vdev_indirect_mapping_t; + +/* + * On-disk indirect vdev state. + * + * An indirect vdev is described exclusively in the MOS config of a pool. + * The config for an indirect vdev includes several fields, which are + * accessed in memory by a vdev_indirect_config_t. + */ +typedef struct vdev_indirect_config { + /* + * Object (in MOS) which contains the indirect mapping. This object + * contains an array of vdev_indirect_mapping_entry_phys_t ordered by + * vimep_src. The bonus buffer for this object is a + * vdev_indirect_mapping_phys_t. This object is allocated when a vdev + * removal is initiated. + * + * Note that this object can be empty if none of the data on the vdev + * has been copied yet. + */ + uint64_t vic_mapping_object; + + /* + * Object (in MOS) which contains the birth times for the mapping + * entries. This object contains an array of + * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus + * buffer for this object is a vdev_indirect_birth_phys_t. This object + * is allocated when a vdev removal is initiated. + * + * Note that this object can be empty if none of the vdev has yet been + * copied. + */ + uint64_t vic_births_object; + +/* + * This is the vdev ID which was removed previous to this vdev, or + * UINT64_MAX if there are no previously removed vdevs. + */ + uint64_t vic_prev_indirect_vdev; +} vdev_indirect_config_t; + +typedef struct vdev { + STAILQ_ENTRY(vdev) v_childlink; /* link in parent's child list */ + STAILQ_ENTRY(vdev) v_alllink; /* link in global vdev list */ + vdev_list_t v_children; /* children of this vdev */ + const char *v_name; /* vdev name */ + uint64_t v_guid; /* vdev guid */ + uint64_t v_id; /* index in parent */ + uint64_t v_psize; /* physical device capacity */ + int v_ashift; /* offset to block shift */ + int v_nparity; /* # parity for raidz */ + struct vdev *v_top; /* parent vdev */ + size_t v_nchildren; /* # children */ + vdev_state_t v_state; /* current state */ + vdev_phys_read_t *v_phys_read; /* read from raw leaf vdev */ + vdev_phys_write_t *v_phys_write; /* write to raw leaf vdev */ + vdev_read_t *v_read; /* read from vdev */ + void *v_priv; /* data for read/write function */ + boolean_t v_islog; + struct spa *v_spa; /* link to spa */ + /* + * Values stored in the config for an indirect or removing vdev. + */ + vdev_indirect_config_t vdev_indirect_config; + vdev_indirect_mapping_t *v_mapping; +} vdev_t; + +/* + * In-core pool representation. 
+ */ +typedef STAILQ_HEAD(spa_list, spa) spa_list_t; + +typedef struct spa { + STAILQ_ENTRY(spa) spa_link; /* link in global pool list */ + char *spa_name; /* pool name */ + uint64_t spa_guid; /* pool guid */ + uint64_t spa_txg; /* most recent transaction */ + struct uberblock *spa_uberblock; /* best uberblock so far */ + vdev_t *spa_root_vdev; /* toplevel vdev container */ + objset_phys_t *spa_mos; /* MOS for this pool */ + zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ + void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; + boolean_t spa_with_log; /* this pool has log */ + + struct uberblock spa_uberblock_master; /* best uberblock so far */ + objset_phys_t spa_mos_master; /* MOS for this pool */ + struct uberblock spa_uberblock_checkpoint; /* checkpoint uberblock */ + objset_phys_t spa_mos_checkpoint; /* Checkpoint MOS */ + void *spa_bootenv; /* bootenv from pool label */ +} spa_t; + +spa_t *spa_create(uint64_t guid, const char *name); + +/* IO related arguments. */ +typedef struct zio { + spa_t *io_spa; + blkptr_t *io_bp; + void *io_data; + uint64_t io_size; + uint64_t io_offset; + + /* Stuff for the vdev stack */ + vdev_t *io_vd; + void *io_vsd; + + int io_error; +} zio_t; + +#if 0 /* XXXMJ */ +static void decode_embedded_bp_compressed(const blkptr_t *, void *); +#endif + +#endif /* _ZFSIMPL_H_ */
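
A worked example for reviewers (a sketch under stated assumptions, not part
of the patch itself): the VDEV_UBERBLOCK_* macros defined earlier in this
header determine where each uberblock copy lands inside a vdev label.  For
a top-level vdev with ashift 12, VDEV_UBERBLOCK_SHIFT() clamps to 12, so
each slot in the 128K uberblock ring is 1 << 12 == 4K and
VDEV_UBERBLOCK_COUNT() is 32 slots per label.  The helper name below is
hypothetical, and it assumes vd->v_top is initialized and <assert.h> is
included:

static uint64_t
uberblock_slot_offset(const vdev_t *vd, int n)
{
	/* byte offset of uberblock slot n from the start of a label */
	assert(n >= 0 && n < (int)VDEV_UBERBLOCK_COUNT(vd));
	return (VDEV_UBERBLOCK_OFFSET(vd, n));
}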