diff --git a/usr.sbin/makefs/zfs/dsl.c b/usr.sbin/makefs/zfs/dsl.c index 223f5941ef69..28560dd4a429 100644 --- a/usr.sbin/makefs/zfs/dsl.c +++ b/usr.sbin/makefs/zfs/dsl.c @@ -1,610 +1,611 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 The FreeBSD Foundation * * This software was developed by Mark Johnston under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include +#include #include #include #include "makefs.h" #include "zfs.h" typedef struct zfs_dsl_dataset { zfs_objset_t *os; /* referenced objset, may be null */ dsl_dataset_phys_t *phys; /* on-disk representation */ uint64_t dsid; /* DSL dataset dnode */ struct zfs_dsl_dir *dir; /* containing parent */ } zfs_dsl_dataset_t; typedef STAILQ_HEAD(zfs_dsl_dir_list, zfs_dsl_dir) zfs_dsl_dir_list_t; typedef struct zfs_dsl_dir { char *fullname; /* full dataset name */ char *name; /* basename(fullname) */ dsl_dir_phys_t *phys; /* on-disk representation */ nvlist_t *propsnv; /* properties saved in propszap */ zfs_dsl_dataset_t *headds; /* principal dataset, may be null */ uint64_t dirid; /* DSL directory dnode */ zfs_zap_t *propszap; /* dataset properties */ zfs_zap_t *childzap; /* child directories */ /* DSL directory tree linkage. */ struct zfs_dsl_dir *parent; zfs_dsl_dir_list_t children; STAILQ_ENTRY(zfs_dsl_dir) next; } zfs_dsl_dir_t; static zfs_dsl_dir_t *dsl_dir_alloc(zfs_opt_t *zfs, const char *name); static zfs_dsl_dataset_t *dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir); static int nvlist_find_string(nvlist_t *nvl, const char *key, char **retp) { char *str; int error, len; error = nvlist_find(nvl, key, DATA_TYPE_STRING, NULL, &str, &len); if (error == 0) { *retp = ecalloc(1, len + 1); memcpy(*retp, str, len); } return (error); } static int nvlist_find_uint64(nvlist_t *nvl, const char *key, uint64_t *retp) { return (nvlist_find(nvl, key, DATA_TYPE_UINT64, NULL, retp, NULL)); } /* * Return an allocated string containing the head dataset's mountpoint, * including the root path prefix. * * If the dataset has a mountpoint property, it is returned. Otherwise we have * to follow ZFS' inheritance rules. 
*/ char * dsl_dir_get_mountpoint(zfs_opt_t *zfs, zfs_dsl_dir_t *dir) { zfs_dsl_dir_t *pdir; char *mountpoint, *origmountpoint; if (nvlist_find_string(dir->propsnv, "mountpoint", &mountpoint) == 0) { if (strcmp(mountpoint, "none") == 0) return (NULL); /* * nvlist_find_string() does not make a copy. */ mountpoint = estrdup(mountpoint); } else { /* * If we don't have a mountpoint, it's inherited from one of our * ancestors. Walk up the hierarchy until we find it, building * up our mountpoint along the way. The mountpoint property is * always set for the root dataset. */ for (pdir = dir->parent, mountpoint = estrdup(dir->name);;) { origmountpoint = mountpoint; if (nvlist_find_string(pdir->propsnv, "mountpoint", &mountpoint) == 0) { easprintf(&mountpoint, "%s%s%s", mountpoint, mountpoint[strlen(mountpoint) - 1] == '/' ? "" : "/", origmountpoint); free(origmountpoint); break; } easprintf(&mountpoint, "%s/%s", pdir->name, origmountpoint); free(origmountpoint); pdir = pdir->parent; } } assert(mountpoint[0] == '/'); assert(strstr(mountpoint, zfs->rootpath) == mountpoint); return (mountpoint); } int dsl_dir_get_canmount(zfs_dsl_dir_t *dir, uint64_t *canmountp) { return (nvlist_find_uint64(dir->propsnv, "canmount", canmountp)); } /* * Handle dataset properties that we know about; stash them into an nvlist to be * written later to the properties ZAP object. * * If the set of properties we handle grows too much, we should probably explore * using libzfs to manage them. 
*/ static void dsl_dir_set_prop(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, const char *key, const char *val) { nvlist_t *nvl; nvl = dir->propsnv; if (val == NULL || val[0] == '\0') errx(1, "missing value for property `%s'", key); if (nvpair_find(nvl, key) != NULL) errx(1, "property `%s' already set", key); if (strcmp(key, "mountpoint") == 0) { if (strcmp(val, "none") != 0) { if (val[0] != '/') errx(1, "mountpoint `%s' is not absolute", val); if (strcmp(val, zfs->rootpath) != 0 && strcmp(zfs->rootpath, "/") != 0 && (strstr(val, zfs->rootpath) != val || val[strlen(zfs->rootpath)] != '/')) { errx(1, "mountpoint `%s' is not prefixed by " "the root path `%s'", val, zfs->rootpath); } } nvlist_add_string(nvl, key, val); } else if (strcmp(key, "atime") == 0 || strcmp(key, "exec") == 0 || strcmp(key, "setuid") == 0) { if (strcmp(val, "on") == 0) nvlist_add_uint64(nvl, key, 1); else if (strcmp(val, "off") == 0) nvlist_add_uint64(nvl, key, 0); else errx(1, "invalid value `%s' for %s", val, key); } else if (strcmp(key, "canmount") == 0) { if (strcmp(val, "noauto") == 0) nvlist_add_uint64(nvl, key, 2); else if (strcmp(val, "on") == 0) nvlist_add_uint64(nvl, key, 1); else if (strcmp(val, "off") == 0) nvlist_add_uint64(nvl, key, 0); else errx(1, "invalid value `%s' for %s", val, key); } else { errx(1, "unknown property `%s'", key); } } static zfs_dsl_dir_t * dsl_metadir_alloc(zfs_opt_t *zfs, const char *name) { zfs_dsl_dir_t *dir; char *path; easprintf(&path, "%s/%s", zfs->poolname, name); dir = dsl_dir_alloc(zfs, path); free(path); return (dir); } static void dsl_origindir_init(zfs_opt_t *zfs) { dnode_phys_t *clones; uint64_t clonesid; zfs->origindsldir = dsl_metadir_alloc(zfs, "$ORIGIN"); zfs->originds = dsl_dataset_alloc(zfs, zfs->origindsldir); zfs->snapds = dsl_dataset_alloc(zfs, zfs->origindsldir); clones = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_CLONES, &clonesid); zfs->cloneszap = zap_alloc(zfs->mos, clones); zfs->origindsldir->phys->dd_clones = clonesid; } void 
dsl_init(zfs_opt_t *zfs) { zfs_dsl_dir_t *dir; struct dataset_desc *d; const char *dspropdelim; dspropdelim = ";"; zfs->rootdsldir = dsl_dir_alloc(zfs, NULL); nvlist_add_uint64(zfs->rootdsldir->propsnv, "compression", ZIO_COMPRESS_OFF); zfs->rootds = dsl_dataset_alloc(zfs, zfs->rootdsldir); zfs->rootdsldir->headds = zfs->rootds; zfs->mosdsldir = dsl_metadir_alloc(zfs, "$MOS"); zfs->freedsldir = dsl_metadir_alloc(zfs, "$FREE"); dsl_origindir_init(zfs); /* * Go through the list of user-specified datasets and create DSL objects * for them. */ STAILQ_FOREACH(d, &zfs->datasetdescs, next) { char *dsname, *next, *params, *param, *nextparam; params = d->params; dsname = strsep(&params, dspropdelim); if (strcmp(dsname, zfs->poolname) == 0) { /* * This is the root dataset; it's already created, so * we're just setting options. */ dir = zfs->rootdsldir; } else { /* * This dataset must be a child of the root dataset. */ if (strstr(dsname, zfs->poolname) != dsname || (next = strchr(dsname, '/')) == NULL || (size_t)(next - dsname) != strlen(zfs->poolname)) { errx(1, "dataset `%s' must be a child of `%s'", dsname, zfs->poolname); } dir = dsl_dir_alloc(zfs, dsname); dir->headds = dsl_dataset_alloc(zfs, dir); } for (nextparam = param = params; nextparam != NULL;) { char *key, *val; param = strsep(&nextparam, dspropdelim); key = val = param; key = strsep(&val, "="); dsl_dir_set_prop(zfs, dir, key, val); } } /* * Set the root dataset's mount point if the user didn't override the * default. 
*/ if (nvpair_find(zfs->rootdsldir->propsnv, "mountpoint") == NULL) { nvlist_add_string(zfs->rootdsldir->propsnv, "mountpoint", zfs->rootpath); } } uint64_t dsl_dir_id(zfs_dsl_dir_t *dir) { return (dir->dirid); } uint64_t dsl_dir_dataset_id(zfs_dsl_dir_t *dir) { return (dir->headds->dsid); } static void dsl_dir_foreach_post(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg) { zfs_dsl_dir_t *cdsldir; STAILQ_FOREACH(cdsldir, &dsldir->children, next) { dsl_dir_foreach_post(zfs, cdsldir, cb, arg); } cb(zfs, dsldir, arg); } /* * Used when the caller doesn't care about the order one way or another. */ void dsl_dir_foreach(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg) { dsl_dir_foreach_post(zfs, dsldir, cb, arg); } const char * dsl_dir_fullname(const zfs_dsl_dir_t *dir) { return (dir->fullname); } /* * Create a DSL directory, which is effectively an entry in the ZFS namespace. * We always create a root DSL directory, whose name is the pool's name, and * several metadata directories. * * Each directory has two ZAP objects, one pointing to child directories, and * one for properties (which are inherited by children unless overridden). * Directories typically reference a DSL dataset, the "head dataset", which * points to an object set. 
*/ static zfs_dsl_dir_t * dsl_dir_alloc(zfs_opt_t *zfs, const char *name) { zfs_dsl_dir_list_t l, *lp; zfs_dsl_dir_t *dir, *parent; dnode_phys_t *dnode; char *dirname, *nextdir, *origname; uint64_t childid, propsid; dir = ecalloc(1, sizeof(*dir)); dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DIR, DMU_OT_DSL_DIR, sizeof(dsl_dir_phys_t), &dir->dirid); dir->phys = (dsl_dir_phys_t *)DN_BONUS(dnode); dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_PROPS, &propsid); dir->propszap = zap_alloc(zfs->mos, dnode); dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DIR_CHILD_MAP, &childid); dir->childzap = zap_alloc(zfs->mos, dnode); dir->propsnv = nvlist_create(NV_UNIQUE_NAME); STAILQ_INIT(&dir->children); dir->phys->dd_child_dir_zapobj = childid; dir->phys->dd_props_zapobj = propsid; if (name == NULL) { /* * This is the root DSL directory. */ dir->name = estrdup(zfs->poolname); dir->fullname = estrdup(zfs->poolname); dir->parent = NULL; dir->phys->dd_parent_obj = 0; assert(zfs->rootdsldir == NULL); zfs->rootdsldir = dir; return (dir); } /* * Insert the new directory into the hierarchy. Currently this must be * done in order, e.g., when creating pool/a/b, pool/a must already * exist. 
*/ STAILQ_INIT(&l); STAILQ_INSERT_HEAD(&l, zfs->rootdsldir, next); origname = dirname = nextdir = estrdup(name); for (lp = &l;; lp = &parent->children) { dirname = strsep(&nextdir, "/"); if (nextdir == NULL) break; STAILQ_FOREACH(parent, lp, next) { if (strcmp(parent->name, dirname) == 0) break; } if (parent == NULL) { errx(1, "no parent at `%s' for filesystem `%s'", dirname, name); } } dir->fullname = estrdup(name); dir->name = estrdup(dirname); free(origname); STAILQ_INSERT_TAIL(lp, dir, next); zap_add_uint64(parent->childzap, dir->name, dir->dirid); dir->parent = parent; dir->phys->dd_parent_obj = parent->dirid; return (dir); } void dsl_dir_size_add(zfs_dsl_dir_t *dir, uint64_t bytes) { dir->phys->dd_used_bytes += bytes; dir->phys->dd_compressed_bytes += bytes; dir->phys->dd_uncompressed_bytes += bytes; } /* * Convert dataset properties into entries in the DSL directory's properties * ZAP. */ static void dsl_dir_finalize_props(zfs_dsl_dir_t *dir) { for (nvp_header_t *nvh = NULL; (nvh = nvlist_next_nvpair(dir->propsnv, nvh)) != NULL;) { nv_string_t *nvname; nv_pair_data_t *nvdata; char *name; nvname = (nv_string_t *)(nvh + 1); nvdata = (nv_pair_data_t *)(&nvname->nv_data[0] + NV_ALIGN4(nvname->nv_size)); name = nvstring_get(nvname); switch (nvdata->nv_type) { case DATA_TYPE_UINT64: { uint64_t val; memcpy(&val, &nvdata->nv_data[0], sizeof(uint64_t)); zap_add_uint64(dir->propszap, name, val); break; } case DATA_TYPE_STRING: { nv_string_t *nvstr; char *val; nvstr = (nv_string_t *)&nvdata->nv_data[0]; val = nvstring_get(nvstr); zap_add_string(dir->propszap, name, val); free(val); break; } default: assert(0); } free(name); } } static void dsl_dir_finalize(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, void *arg __unused) { char key[32]; zfs_dsl_dir_t *cdir; dnode_phys_t *snapnames; zfs_dsl_dataset_t *headds; zfs_objset_t *os; uint64_t bytes, snapnamesid; dsl_dir_finalize_props(dir); zap_write(zfs, dir->propszap); zap_write(zfs, dir->childzap); headds = dir->headds; if (headds == 
NULL) return; os = headds->os; if (os == NULL) return; snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP, &snapnamesid); zap_write(zfs, zap_alloc(zfs->mos, snapnames)); dir->phys->dd_head_dataset_obj = headds->dsid; dir->phys->dd_clone_parent_obj = zfs->snapds->dsid; headds->phys->ds_prev_snap_obj = zfs->snapds->dsid; headds->phys->ds_snapnames_zapobj = snapnamesid; objset_root_blkptr_copy(os, &headds->phys->ds_bp); zfs->snapds->phys->ds_num_children++; snprintf(key, sizeof(key), "%jx", (uintmax_t)headds->dsid); zap_add_uint64(zfs->cloneszap, key, headds->dsid); bytes = objset_space(os); headds->phys->ds_used_bytes = bytes; headds->phys->ds_uncompressed_bytes = bytes; headds->phys->ds_compressed_bytes = bytes; STAILQ_FOREACH(cdir, &dir->children, next) { /* * The root directory needs a special case: the amount of * space used for the MOS isn't known until everything else is * finalized, so it can't be accounted in the MOS directory's * parent until then. */ if (dir == zfs->rootdsldir && cdir == zfs->mosdsldir) continue; bytes += cdir->phys->dd_used_bytes; } dsl_dir_size_add(dir, bytes); } void dsl_write(zfs_opt_t *zfs) { zfs_zap_t *snapnameszap; dnode_phys_t *snapnames; uint64_t snapmapid; /* * Perform accounting, starting from the leaves of the DSL directory * tree. Accounting for $MOS is done later, once we've finished * allocating space. 
*/ dsl_dir_foreach_post(zfs, zfs->rootdsldir, dsl_dir_finalize, NULL); snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP, &snapmapid); snapnameszap = zap_alloc(zfs->mos, snapnames); zap_add_uint64(snapnameszap, "$ORIGIN", zfs->snapds->dsid); zap_write(zfs, snapnameszap); zfs->origindsldir->phys->dd_head_dataset_obj = zfs->originds->dsid; zfs->originds->phys->ds_prev_snap_obj = zfs->snapds->dsid; zfs->originds->phys->ds_snapnames_zapobj = snapmapid; zfs->snapds->phys->ds_next_snap_obj = zfs->originds->dsid; assert(zfs->snapds->phys->ds_num_children > 0); zfs->snapds->phys->ds_num_children++; zap_write(zfs, zfs->cloneszap); /* XXX-MJ dirs and datasets are leaked */ } void dsl_dir_dataset_write(zfs_opt_t *zfs, zfs_objset_t *os, zfs_dsl_dir_t *dir) { dir->headds->os = os; objset_write(zfs, os); } bool dsl_dir_has_dataset(zfs_dsl_dir_t *dir) { return (dir->headds != NULL); } bool dsl_dir_dataset_has_objset(zfs_dsl_dir_t *dir) { return (dsl_dir_has_dataset(dir) && dir->headds->os != NULL); } static zfs_dsl_dataset_t * dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir) { zfs_dsl_dataset_t *ds; dnode_phys_t *dnode; uint64_t deadlistid; ds = ecalloc(1, sizeof(*ds)); dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DATASET, DMU_OT_DSL_DATASET, sizeof(dsl_dataset_phys_t), &ds->dsid); ds->phys = (dsl_dataset_phys_t *)DN_BONUS(dnode); dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, sizeof(dsl_deadlist_phys_t), &deadlistid); zap_write(zfs, zap_alloc(zfs->mos, dnode)); ds->phys->ds_dir_obj = dir->dirid; ds->phys->ds_deadlist_obj = deadlistid; ds->phys->ds_creation_txg = TXG - 1; if (ds != zfs->snapds) ds->phys->ds_prev_snap_txg = TXG - 1; ds->phys->ds_guid = ((uint64_t)random() << 32) | random(); ds->dir = dir; return (ds); } diff --git a/usr.sbin/makefs/zfs/fs.c b/usr.sbin/makefs/zfs/fs.c index bf8d5483d610..ecade55db52c 100644 --- a/usr.sbin/makefs/zfs/fs.c +++ b/usr.sbin/makefs/zfs/fs.c @@ -1,1047 +1,1053 @@ /*- * 
SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 The FreeBSD Foundation * * This software was developed by Mark Johnston under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include #include #include +#include #include +#include #include #include #include #include "makefs.h" #include "zfs.h" typedef struct { const char *name; unsigned int id; uint16_t size; sa_bswap_type_t bs; } zfs_sattr_t; typedef struct zfs_fs { zfs_objset_t *os; /* Offset table for system attributes, indexed by a zpl_attr_t. */ uint16_t *saoffs; size_t sacnt; const zfs_sattr_t *satab; } zfs_fs_t; /* * The order of the attributes doesn't matter, this is simply the one hard-coded * by OpenZFS, based on a zdb dump of the SA_REGISTRY table. 
*/ typedef enum zpl_attr { ZPL_ATIME, ZPL_MTIME, ZPL_CTIME, ZPL_CRTIME, ZPL_GEN, ZPL_MODE, ZPL_SIZE, ZPL_PARENT, ZPL_LINKS, ZPL_XATTR, ZPL_RDEV, ZPL_FLAGS, ZPL_UID, ZPL_GID, ZPL_PAD, ZPL_ZNODE_ACL, ZPL_DACL_COUNT, ZPL_SYMLINK, ZPL_SCANSTAMP, ZPL_DACL_ACES, ZPL_DXATTR, ZPL_PROJID, } zpl_attr_t; /* * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t. */ static const zfs_sattr_t zpl_attrs[] = { #define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b } _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY), _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY), _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY), _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL), _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY), _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY), #undef ZPL_ATTR }; /* * This layout matches that of a filesystem created using OpenZFS on FreeBSD. * It need not match in general, but FreeBSD's loader doesn't bother parsing the * layout and just hard-codes attribute offsets. 
*/ static const sa_attr_type_t zpl_attr_layout[] = { ZPL_MODE, ZPL_SIZE, ZPL_GEN, ZPL_UID, ZPL_GID, ZPL_PARENT, ZPL_FLAGS, ZPL_ATIME, ZPL_MTIME, ZPL_CTIME, ZPL_CRTIME, ZPL_LINKS, ZPL_DACL_COUNT, ZPL_DACL_ACES, ZPL_SYMLINK, }; /* * Keys for the ZPL attribute tables in the SA layout ZAP. The first two * indices are reserved for legacy attribute encoding. */ #define SA_LAYOUT_INDEX_DEFAULT 2 #define SA_LAYOUT_INDEX_SYMLINK 3 struct fs_populate_dir { SLIST_ENTRY(fs_populate_dir) next; int dirfd; uint64_t objid; zfs_zap_t *zap; }; struct fs_populate_arg { zfs_opt_t *zfs; zfs_fs_t *fs; /* owning filesystem */ uint64_t rootdirid; /* root directory dnode ID */ int rootdirfd; /* root directory fd */ SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */ }; static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int); static void eclose(int fd) { if (close(fd) != 0) err(1, "close"); } static bool fsnode_isroot(const fsnode *cur) { return (strcmp(cur->name, ".") == 0); } /* * Visit each node in a directory hierarchy, in pre-order depth-first order. 
*/ static void fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg) { assert(root->type == S_IFDIR); for (fsnode *cur = root; cur != NULL; cur = cur->next) { assert(cur->type == S_IFREG || cur->type == S_IFDIR || cur->type == S_IFLNK); if (cb(cur, arg) == 0) continue; if (cur->type == S_IFDIR && cur->child != NULL) fsnode_foreach(cur->child, cb, arg); } } static void fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid) { struct fs_populate_dir *dir; uint64_t type; switch (cur->type) { case S_IFREG: type = DT_REG; break; case S_IFDIR: type = DT_DIR; break; case S_IFLNK: type = DT_LNK; break; default: assert(0); } dir = SLIST_FIRST(&arg->dirs); zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid)); } static void fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind, size_t *szp) { assert(ind < fs->sacnt); assert(fs->saoffs[ind] != 0xffff); memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size); *szp += fs->satab[ind].size; } static void fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val, size_t valsz, size_t varoff, uint16_t ind, size_t *szp) { assert(ind < fs->sacnt); assert(fs->saoffs[ind] != 0xffff); assert(fs->satab[ind].size == 0); memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz); *szp += valsz; } /* * Derive the relative fd/path combo needed to access a file. Ideally we'd * always be able to use relative lookups (i.e., use the *at() system calls), * since they require less path translation and are more amenable to sandboxing, * but the handling of multiple staging directories makes that difficult. To * make matters worse, we have no choice but to use relative lookups when * dealing with an mtree manifest, so both mechanisms are implemented. 
*/ static void fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg, char *path, size_t sz, int *dirfdp) { if (cur->root == NULL) { size_t n; *dirfdp = SLIST_FIRST(&arg->dirs)->dirfd; n = strlcpy(path, cur->name, sz); assert(n < sz); } else { int n; *dirfdp = AT_FDCWD; n = snprintf(path, sz, "%s/%s/%s", cur->root, cur->path, cur->name); assert(n >= 0); assert((size_t)n < sz); } } static int fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags) { char path[PATH_MAX]; int fd; fs_populate_path(cur, arg, path, sizeof(path), &fd); fd = openat(fd, path, flags); if (fd < 0) err(1, "openat(%s)", path); return (fd); } static void fs_readlink(const fsnode *cur, struct fs_populate_arg *arg, char *buf, size_t bufsz) { char path[PATH_MAX]; ssize_t n; int fd; fs_populate_path(cur, arg, path, sizeof(path), &fd); n = readlinkat(fd, path, buf, bufsz - 1); if (n == -1) err(1, "readlinkat(%s)", cur->name); buf[n] = '\0'; } static void fs_populate_time(zfs_fs_t *fs, char *attrbuf, struct timespec *ts, uint16_t ind, size_t *szp) { uint64_t timebuf[2]; assert(ind < fs->sacnt); assert(fs->saoffs[ind] != 0xffff); assert(fs->satab[ind].size == sizeof(timebuf)); timebuf[0] = ts->tv_sec; timebuf[1] = ts->tv_nsec; fs_populate_attr(fs, attrbuf, timebuf, ind, szp); } static void fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur, dnode_phys_t *dnode) { char target[PATH_MAX]; zfs_fs_t *fs; zfs_ace_hdr_t aces[3]; struct stat *sb; sa_hdr_phys_t *sahdr; uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid; char *attrbuf; size_t bonussz, hdrsz; int layout; assert(dnode->dn_bonustype == DMU_OT_SA); assert(dnode->dn_nblkptr == 1); fs = arg->fs; sb = &cur->inode->st; switch (cur->type) { case S_IFREG: layout = SA_LAYOUT_INDEX_DEFAULT; links = cur->inode->nlink; objsize = sb->st_size; parent = SLIST_FIRST(&arg->dirs)->objid; break; case S_IFDIR: layout = SA_LAYOUT_INDEX_DEFAULT; links = 1; /* .. */ objsize = 1; /* .. 
*/ /* * The size of a ZPL directory is the number of entries * (including "." and ".."), and the link count is the number of * entries which are directories (including "." and ".."). */ for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child; c != NULL; c = c->next) { if (c->type == S_IFDIR) links++; objsize++; } /* The root directory is its own parent. */ parent = SLIST_EMPTY(&arg->dirs) ? arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid; break; case S_IFLNK: fs_readlink(cur, arg, target, sizeof(target)); layout = SA_LAYOUT_INDEX_SYMLINK; links = 1; objsize = strlen(target); parent = SLIST_FIRST(&arg->dirs)->objid; break; default: assert(0); } daclcount = nitems(aces); flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED | ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */ gen = 1; gid = sb->st_gid; mode = sb->st_mode; uid = sb->st_uid; memset(aces, 0, sizeof(aces)); aces[0].z_flags = ACE_OWNER; aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER | ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL | ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; if ((mode & S_IRUSR) != 0) aces[0].z_access_mask |= ACE_READ_DATA; if ((mode & S_IWUSR) != 0) aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; if ((mode & S_IXUSR) != 0) aces[0].z_access_mask |= ACE_EXECUTE; aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP; aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; if ((mode & S_IRGRP) != 0) aces[1].z_access_mask |= ACE_READ_DATA; if ((mode & S_IWGRP) != 0) aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; if ((mode & S_IXGRP) != 0) aces[1].z_access_mask |= ACE_EXECUTE; aces[2].z_flags = ACE_EVERYONE; aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; if ((mode & S_IROTH) != 0) 
aces[2].z_access_mask |= ACE_READ_DATA; if ((mode & S_IWOTH) != 0) aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; if ((mode & S_IXOTH) != 0) aces[2].z_access_mask |= ACE_EXECUTE; switch (layout) { case SA_LAYOUT_INDEX_DEFAULT: /* At most one variable-length attribute. */ hdrsz = sizeof(uint64_t); break; case SA_LAYOUT_INDEX_SYMLINK: /* At most five variable-length attributes. */ hdrsz = sizeof(uint64_t) * 2; break; default: assert(0); } sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode); sahdr->sa_magic = SA_MAGIC; SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz); bonussz = SA_HDR_SIZE(sahdr); attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr); fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz); fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz); fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz); fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz); fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz); fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz); fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz); fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz); fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz); /* * We deliberately set atime = mtime here to ensure that images are * reproducible. */ fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz); fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz); fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz); +#ifdef __linux__ + /* Linux has no st_birthtim; approximate with st_ctim */ + fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CRTIME, &bonussz); +#else fs_populate_time(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz); +#endif fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0, ZPL_DACL_ACES, &bonussz); sahdr->sa_lengths[0] = sizeof(aces); if (cur->type == S_IFLNK) { assert(layout == SA_LAYOUT_INDEX_SYMLINK); /* Need to use a spill block pointer if the target is long. 
*/ assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN); fs_populate_varszattr(fs, attrbuf, target, objsize, sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz); sahdr->sa_lengths[1] = (uint16_t)objsize; } dnode->dn_bonuslen = bonussz; } static void fs_populate_file(fsnode *cur, struct fs_populate_arg *arg) { struct dnode_cursor *c; dnode_phys_t *dnode; zfs_opt_t *zfs; char *buf; uint64_t dnid; ssize_t n; size_t bufsz; off_t size, target; int fd; assert(cur->type == S_IFREG); assert((cur->inode->flags & FI_ROOT) == 0); zfs = arg->zfs; assert(cur->inode->ino != 0); if ((cur->inode->flags & FI_ALLOCATED) != 0) { /* * This is a hard link of an existing file. * * XXX-MJ need to check whether it crosses datasets, add a test * case for that */ fs_populate_dirent(arg, cur, cur->inode->ino); return; } dnode = objset_dnode_bonus_alloc(arg->fs->os, DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); cur->inode->ino = dnid; cur->inode->flags |= FI_ALLOCATED; fd = fs_open(cur, arg, O_RDONLY); buf = zfs->filebuf; bufsz = sizeof(zfs->filebuf); size = cur->inode->st.st_size; c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0); for (off_t foff = 0; foff < size; foff += target) { off_t loc, sofar; /* * Fill up our buffer, handling partial reads. * * It might be profitable to use copy_file_range(2) here. 
 */
	/*
	 * (Tail of fs_populate_file.)  Read loop: fill "buf" from "fd",
	 * zero-pad the final partial block, then allocate space and write the
	 * block out through the dnode's indirect-block cursor.
	 */
		sofar = 0;
		target = MIN(size - foff, (off_t)bufsz);
		do {
			n = read(fd, buf + sofar, target);
			if (n < 0)
				err(1, "reading from '%s'", cur->name);
			if (n == 0)
				errx(1, "unexpected EOF reading '%s'", cur->name);
			sofar += n;
		} while (sofar < target);

		if (target < (off_t)bufsz)
			memset(buf + target, 0, bufsz - target);

		loc = objset_space_alloc(zfs, arg->fs->os, &target);
		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, target, loc,
		    dnode_cursor_next(zfs, c, foff));
	}
	eclose(fd);
	dnode_cursor_finish(zfs, c);

	fs_populate_sattrs(arg, cur, dnode);
	fs_populate_dirent(arg, cur, dnid);
}

/*
 * Allocate a directory dnode for "cur" and link it into its parent directory
 * (or record it as the dataset root).  Non-root directories are pushed onto
 * the in-progress stack so their ZAPs can be filled in by later callbacks;
 * dataset roots get an empty ZAP here and their children are handed off to a
 * new dataset via fs_build_one().
 */
static void
fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
{
	dnode_phys_t *dnode;
	zfs_objset_t *os;
	uint64_t dnid;
	int dirfd;

	assert(cur->type == S_IFDIR);
	assert((cur->inode->flags & FI_ALLOCATED) == 0);

	os = arg->fs->os;

	dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
	    DMU_OT_SA, 0, &dnid);

	/*
	 * Add an entry to the parent directory and open this directory.
	 */
	if (!SLIST_EMPTY(&arg->dirs)) {
		fs_populate_dirent(arg, cur, dnid);
		dirfd = fs_open(cur, arg, O_DIRECTORY | O_RDONLY);
	} else {
		/* This is the dataset root; the caller supplied the fd. */
		arg->rootdirid = dnid;
		dirfd = arg->rootdirfd;
		arg->rootdirfd = -1;
	}

	/*
	 * Set ZPL attributes.
	 */
	fs_populate_sattrs(arg, cur, dnode);

	/*
	 * If this is a root directory, then its children belong to a different
	 * dataset and this directory remains empty in the current objset.
	 */
	if ((cur->inode->flags & FI_ROOT) == 0) {
		struct fs_populate_dir *dir;

		dir = ecalloc(1, sizeof(*dir));
		dir->dirfd = dirfd;
		dir->objid = dnid;
		dir->zap = zap_alloc(os, dnode);
		SLIST_INSERT_HEAD(&arg->dirs, dir, next);
	} else {
		zap_write(arg->zfs, zap_alloc(os, dnode));
		fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
	}
}

/*
 * Allocate a dnode for the symlink "cur" and add it to the current directory.
 * The link target itself is handled by fs_populate_sattrs() — presumably
 * stored as the ZPL_SYMLINK attribute; confirm against that function.
 */
static void
fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
{
	dnode_phys_t *dnode;
	uint64_t dnid;

	assert(cur->type == S_IFLNK);
	assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);

	dnode = objset_dnode_bonus_alloc(arg->fs->os,
	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);

	fs_populate_dirent(arg, cur, dnid);

	fs_populate_sattrs(arg, cur, dnode);
}

/*
 * fsnode_foreach() callback: create the on-disk object for one staged node
 * and, upon reaching the end of a subtree, pop and write out the pending
 * directory ZAPs.  Returns 0 to stop descending (at a dataset boundary),
 * 1 to continue.
 */
static int
fs_foreach_populate(fsnode *cur, void *_arg)
{
	struct fs_populate_arg *arg;
	struct fs_populate_dir *dir;
	int ret;

	arg = _arg;
	switch (cur->type) {
	case S_IFREG:
		fs_populate_file(cur, arg);
		break;
	case S_IFDIR:
		if (fsnode_isroot(cur))
			break;
		fs_populate_dir(cur, arg);
		break;
	case S_IFLNK:
		fs_populate_symlink(cur, arg);
		break;
	default:
		assert(0);
	}

	ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;

	if (cur->next == NULL &&
	    (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
		/*
		 * We reached a terminal node in a subtree.  Walk back up and
		 * write out directories.  We're done once we hit the root of a
		 * dataset or find a level where we're not on the edge of the
		 * tree.
		 */
		do {
			dir = SLIST_FIRST(&arg->dirs);
			SLIST_REMOVE_HEAD(&arg->dirs, next);
			zap_write(arg->zfs, dir->zap);
			if (dir->dirfd != -1)
				eclose(dir->dirfd);
			free(dir);
			cur = cur->parent;
		} while (cur != NULL && cur->next == NULL &&
		    (cur->inode->flags & FI_ROOT) == 0);
	}
	return (ret);
}

/*
 * Add a system-attribute layout (an array of attribute indices) to the layout
 * ZAP under the numeric key "index".
 */
static void
fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
    const sa_attr_type_t layout[], size_t sacnt)
{
	char ti[16];

	assert(sizeof(layout[0]) == 2);
	snprintf(ti, sizeof(ti), "%u", index);
	zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
	    (const uint8_t *)layout);
}

/*
 * Initialize system attribute tables.
 *
 * There are two elements to this.  First, we write the zpl_attrs[] and
 * zpl_attr_layout[] tables to disk.  Then we create a lookup table which
 * allows us to set file attributes quickly.
 */
static uint64_t
fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
{
	zfs_zap_t *sazap, *salzap, *sarzap;
	zfs_objset_t *os;
	dnode_phys_t *saobj, *salobj, *sarobj;
	uint64_t saobjid, salobjid, sarobjid;
	uint16_t offset;

	os = fs->os;

	/*
	 * The on-disk tables are stored in two ZAP objects, the registry object
	 * and the layout object.  Individual attributes are described by
	 * entries in the registry object; for example, the value for the
	 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
	 * The attributes of a file are ordered according to one of the layouts
	 * defined in the layout object.  The master node object is simply used
	 * to locate the registry and layout objects.
	 */
	saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
	salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
	sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);

	sarzap = zap_alloc(os, sarobj);
	for (size_t i = 0; i < nitems(zpl_attrs); i++) {
		const zfs_sattr_t *sa;
		uint64_t attr;

		attr = 0;
		sa = &zpl_attrs[i];
		SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
		zap_add_uint64(sarzap, sa->name, attr);
	}
	zap_write(zfs, sarzap);

	/*
	 * Layouts are arrays of indices into the registry.  We define two
	 * layouts for use by the ZPL, one for non-symlinks and one for
	 * symlinks.  They are identical except that the symlink layout includes
	 * ZPL_SYMLINK as its final attribute.
	 */
	salzap = zap_alloc(os, salobj);
	assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT, zpl_attr_layout,
	    nitems(zpl_attr_layout) - 1);
	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK, zpl_attr_layout,
	    nitems(zpl_attr_layout));
	zap_write(zfs, salzap);

	sazap = zap_alloc(os, saobj);
	zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
	zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
	zap_write(zfs, sazap);

	/* Sanity check. */
	for (size_t i = 0; i < nitems(zpl_attrs); i++)
		assert(i == zpl_attrs[i].id);

	/*
	 * Build the offset table used when setting file attributes.  File
	 * attributes are stored in the object's bonus buffer; this table
	 * provides the buffer offset of attributes referenced by the layout
	 * table.  Unreferenced attributes keep the 0xffff sentinel.
	 */
	fs->sacnt = nitems(zpl_attrs);
	fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
	for (size_t i = 0; i < fs->sacnt; i++)
		fs->saoffs[i] = 0xffff;
	offset = 0;
	for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
		uint16_t size;

		assert(zpl_attr_layout[i] < fs->sacnt);

		fs->saoffs[zpl_attr_layout[i]] = offset;
		size = zpl_attrs[zpl_attr_layout[i]].size;
		offset += size;
	}
	fs->satab = zpl_attrs;

	return (saobjid);
}

/*
 * dsl_dir_foreach() callback: match a mountable dataset's mountpoint to a
 * directory in the staged file tree and mark that fsnode as the dataset root
 * (FI_ROOT, with inode->param pointing at the DSL directory).
 */
static void
fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
{
	char *mountpoint, *origmountpoint, *name, *next;
	fsnode *cur, *root;
	uint64_t canmount;

	if (!dsl_dir_has_dataset(dsldir))
		return;

	if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0)
		return;
	mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
	if (mountpoint == NULL)
		return;

	/*
	 * If we were asked to specify a bootfs, set it here.
	 */
	if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
	    dsl_dir_fullname(dsldir)) == 0) {
		zap_add_uint64(zfs->poolprops, "bootfs",
		    dsl_dir_dataset_id(dsldir));
	}

	origmountpoint = mountpoint;

	/*
	 * Figure out which fsnode corresponds to our mountpoint.
	 */
	root = arg;
	cur = root;
	if (strcmp(mountpoint, zfs->rootpath) != 0) {
		mountpoint += strlen(zfs->rootpath);

		/*
		 * Look up the directory in the staged tree.  For example, if
		 * the dataset's mount point is /foo/bar/baz, we'll search the
		 * root directory for "foo", search "foo" for "baz", and so on.
		 * Each intermediate name must refer to a directory; the final
		 * component need not exist.
		 */
		cur = root;
		for (next = name = mountpoint; next != NULL;) {
			/* Skip consecutive slashes before the component. */
			for (; *next == '/'; next++)
				;
			name = strsep(&next, "/");

			for (; cur != NULL && strcmp(cur->name, name) != 0;
			    cur = cur->next)
				;
			if (cur == NULL) {
				if (next == NULL)
					break;
				errx(1, "missing mountpoint directory for `%s'",
				    dsl_dir_fullname(dsldir));
			}
			if (cur->type != S_IFDIR) {
				errx(1,
				    "mountpoint for `%s' is not a directory",
				    dsl_dir_fullname(dsldir));
			}
			if (next != NULL)
				cur = cur->child;
		}
	}

	if (cur != NULL) {
		assert(cur->type == S_IFDIR);

		/*
		 * Multiple datasets shouldn't share a mountpoint.  It's
		 * technically allowed, but it's not clear what makefs should do
		 * in that case.
		 */
		assert((cur->inode->flags & FI_ROOT) == 0);
		if (cur != root)
			cur->inode->flags |= FI_ROOT;
		assert(cur->inode->param == NULL);
		cur->inode->param = dsldir;
	}

	free(origmountpoint);
}

/*
 * fsnode_foreach() callback: assign virtual dnode numbers and count links for
 * hard-linked nodes (which share an inode).  Returns 0 at dataset roots to
 * stop descending, 1 otherwise.
 */
static int
fs_foreach_mark(fsnode *cur, void *arg)
{
	uint64_t *countp;

	countp = arg;
	if (cur->type == S_IFDIR && fsnode_isroot(cur))
		return (1);

	if (cur->inode->ino == 0) {
		cur->inode->ino = ++(*countp);
		cur->inode->nlink = 1;
	} else {
		cur->inode->nlink++;
	}

	return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
}

/*
 * Create a filesystem dataset.  More specifically:
 * - create an object set for the dataset,
 * - add required metadata (SA tables, property definitions, etc.) to that
 *   object set,
 * - optionally populate the object set with file objects, using "root" as the
 *   root directory.
 *
 * "dirfd" is a directory descriptor for the directory referenced by "root".
 * It is closed before returning.
 */
static void
fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
{
	struct fs_populate_arg arg;
	zfs_fs_t fs;
	zfs_zap_t *masterzap;
	zfs_objset_t *os;
	dnode_phys_t *deleteq, *masterobj;
	uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
	bool fakedroot;

	/*
	 * This dataset's mountpoint doesn't exist in the staging tree, or the
	 * dataset doesn't have a mountpoint at all.  In either case we still
	 * need a root directory.  Fake up a root fsnode to handle this case.
	 */
	fakedroot = root == NULL;
	if (fakedroot) {
		struct stat *stp;

		assert(dirfd == -1);

		root = ecalloc(1, sizeof(*root));
		root->inode = ecalloc(1, sizeof(*root->inode));
		root->name = estrdup(".");
		root->type = S_IFDIR;

		stp = &root->inode->st;
		stp->st_uid = 0;
		stp->st_gid = 0;
		stp->st_mode = S_IFDIR | 0755;
	}
	assert(root->type == S_IFDIR);
	assert(fsnode_isroot(root));

	/*
	 * Initialize the object set for this dataset.
	 */
	os = objset_alloc(zfs, DMU_OST_ZFS);
	masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
	assert(moid == MASTER_NODE_OBJ);

	memset(&fs, 0, sizeof(fs));
	fs.os = os;

	/*
	 * Create the ZAP SA layout now since filesystem object dnodes will
	 * refer to those attributes.
	 */
	saobjid = fs_set_zpl_attrs(zfs, &fs);

	/*
	 * Make a pass over the staged directory to detect hard links and
	 * assign virtual dnode numbers.
	 */
	dnodecount = 1; /* root directory */
	fsnode_foreach(root, fs_foreach_mark, &dnodecount);

	/*
	 * Make a second pass to populate the dataset with files from the
	 * staged directory.  Most of our runtime is spent here.
	 */
	arg.rootdirfd = dirfd;
	arg.zfs = zfs;
	arg.fs = &fs;
	SLIST_INIT(&arg.dirs);
	fs_populate_dir(root, &arg);
	assert(!SLIST_EMPTY(&arg.dirs));
	fsnode_foreach(root, fs_foreach_populate, &arg);
	assert(SLIST_EMPTY(&arg.dirs));
	rootdirid = arg.rootdirid;

	/*
	 * Create an empty delete queue.  We don't do anything with it, but
	 * OpenZFS will refuse to mount filesystems that don't have one.
	 */
	deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
	zap_write(zfs, zap_alloc(os, deleteq));

	/*
	 * Populate and write the master node object.  This is a ZAP object
	 * containing various dataset properties and the object IDs of the root
	 * directory and delete queue.
	 */
	masterzap = zap_alloc(os, masterobj);
	zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
	zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
	zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
	zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
	zap_add_uint64(masterzap, "normalization", 0 /* off */);
	zap_add_uint64(masterzap, "utf8only", 0 /* off */);
	zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
	zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
	zap_write(zfs, masterzap);

	/*
	 * All finished with this object set, we may as well write it now.
	 * The DSL layer will sum up the bytes consumed by each dataset using
	 * information stored in the object set, so it can't be freed just yet.
	 */
	dsl_dir_dataset_write(zfs, os, dsldir);

	if (fakedroot) {
		free(root->inode);
		free(root->name);
		free(root);
	}
	free(fs.saoffs);
}

/*
 * Create an object set for each DSL directory which has a dataset and doesn't
 * already have an object set.
 */
static void
fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
{
	if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir))
		fs_build_one(zfs, dsldir, NULL, -1);
}

/*
 * Create our datasets and populate them with files.
 */
void
fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
{
	/*
	 * Run through our datasets and find the root fsnode for each one.
	 * Each root fsnode is flagged so that we can figure out which dataset
	 * it belongs to.
	 */
	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);

	/*
	 * Did we find our boot filesystem?
	 */
	if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs"))
		errx(1, "no mounted dataset matches bootfs property `%s'",
		    zfs->bootfs);

	/*
	 * Traverse the file hierarchy starting from the root fsnode.  One
	 * dataset, not necessarily the root dataset, must "own" the root
	 * directory by having its mountpoint be equal to the root path.
	 *
	 * As roots of other datasets are encountered during the traversal,
	 * fs_build_one() recursively creates the corresponding object sets and
	 * populates them.  Once this function has returned, all datasets will
	 * have been fully populated.
	 */
	fs_build_one(zfs, root->inode->param, root, dirfd);

	/*
	 * Now create object sets for datasets whose mountpoints weren't found
	 * in the staging directory, either because there is no mountpoint, or
	 * because the mountpoint doesn't correspond to an existing directory.
	 */
	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
}
diff --git a/usr.sbin/makefs/zfs/objset.c b/usr.sbin/makefs/zfs/objset.c
index edd0b66d6147..c22a4f6415fe 100644
--- a/usr.sbin/makefs/zfs/objset.c
+++ b/usr.sbin/makefs/zfs/objset.c
@@ -1,261 +1,262 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* NOTE(review): the <header> names below were lost in extraction; restore from upstream. */
#include
+#include
#include
#include "zfs.h"

/* Number of dnodes that fit in one fixed-size chunk of the dnode array. */
#define DNODES_PER_CHUNK (MAXBLOCKSIZE / sizeof(dnode_phys_t))

struct objset_dnode_chunk {
	dnode_phys_t buf[DNODES_PER_CHUNK];
	unsigned int nextfree;	/* index of the next unused dnode slot */
	STAILQ_ENTRY(objset_dnode_chunk) next;
};

typedef struct zfs_objset {
	/* Physical object set. */
	objset_phys_t *phys;
	off_t osloc;
	off_t osblksz;
	blkptr_t osbp;		/* set in objset_write() */

	/* Accounting. */
	off_t space;		/* bytes allocated to this objset */

	/* dnode allocator. */
	uint64_t dnodecount;
	STAILQ_HEAD(, objset_dnode_chunk) dnodechunks;
} zfs_objset_t;

/*
 * Fill in the invariant fields of a freshly allocated dnode.
 */
static void
dnode_init(dnode_phys_t *dnode, uint8_t type, uint8_t bonustype,
    uint16_t bonuslen)
{
	dnode->dn_indblkshift = MAXBLOCKSHIFT;
	dnode->dn_type = type;
	dnode->dn_bonustype = bonustype;
	dnode->dn_bonuslen = bonuslen;
	dnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4;
	dnode->dn_nlevels = 1;
	dnode->dn_nblkptr = 1;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
}

/*
 * Allocate an object set of the given DMU type, reserving space for its
 * physical representation and setting up the embedded meta-dnode.
 */
zfs_objset_t *
objset_alloc(zfs_opt_t *zfs, uint64_t type)
{
	struct objset_dnode_chunk *chunk;
	zfs_objset_t *os;

	os = ecalloc(1, sizeof(*os));
	os->osblksz = sizeof(objset_phys_t);
	os->osloc = objset_space_alloc(zfs, os, &os->osblksz);

	/*
	 * Object ID zero is always reserved for the meta dnode, which is
	 * embedded in the objset itself.
	 */
	STAILQ_INIT(&os->dnodechunks);
	chunk = ecalloc(1, sizeof(*chunk));
	chunk->nextfree = 1;
	STAILQ_INSERT_HEAD(&os->dnodechunks, chunk, next);
	os->dnodecount = 1;

	os->phys = ecalloc(1, os->osblksz);
	os->phys->os_type = type;

	dnode_init(&os->phys->os_meta_dnode, DMU_OT_DNODE, DMU_OT_NONE, 0);
	os->phys->os_meta_dnode.dn_datablkszsec =
	    DNODE_BLOCK_SIZE >> MINBLOCKSHIFT;

	return (os);
}

/*
 * Write the dnode array and physical object set to disk.
 */
static void
_objset_write(zfs_opt_t *zfs, zfs_objset_t *os, struct dnode_cursor *c,
    off_t loc)
{
	struct objset_dnode_chunk *chunk, *tmp;
	unsigned int total;

	/*
	 * Write out the dnode array, i.e., the meta-dnode.  For some reason
	 * its data blocks must be 16KB in size no matter how large the array
	 * is.
	 */
	total = 0;
	STAILQ_FOREACH_SAFE(chunk, &os->dnodechunks, next, tmp) {
		unsigned int i;

		assert(chunk->nextfree <= os->dnodecount);
		assert(chunk->nextfree <= DNODES_PER_CHUNK);

		for (i = 0; i < chunk->nextfree; i += DNODES_PER_BLOCK) {
			blkptr_t *bp;
			uint64_t fill;

			/*
			 * NOTE(review): "fill" here is the number of *unused*
			 * dnode slots in a partial final block and zero for
			 * full blocks; OpenZFS block pointers for dnode blocks
			 * normally carry the count of allocated dnodes.
			 * Confirm against upstream before changing.
			 */
			if (chunk->nextfree - i < DNODES_PER_BLOCK)
				fill = DNODES_PER_BLOCK - (chunk->nextfree - i);
			else
				fill = 0;
			bp = dnode_cursor_next(zfs, c,
			    (total + i) * sizeof(dnode_phys_t));
			vdev_pwrite_dnode_indir(zfs, &os->phys->os_meta_dnode,
			    0, fill, chunk->buf + i, DNODE_BLOCK_SIZE, loc, bp);
			loc += DNODE_BLOCK_SIZE;
		}
		total += i;
		free(chunk);
	}
	dnode_cursor_finish(zfs, c);
	STAILQ_INIT(&os->dnodechunks);

	/*
	 * Write the object set itself.  The saved block pointer will be copied
	 * into the referencing DSL dataset or the uberblocks.
	 */
	vdev_pwrite_data(zfs, DMU_OT_OBJSET, ZIO_CHECKSUM_FLETCHER_4, 0, 1,
	    os->phys, os->osblksz, os->osloc, &os->osbp);
}

/*
 * Allocate space for the object set's dnode array and write everything out.
 * The MOS additionally triggers the space-map flush since no further vdev
 * allocations may happen afterwards.
 */
void
objset_write(zfs_opt_t *zfs, zfs_objset_t *os)
{
	struct dnode_cursor *c;
	off_t dnodeloc, dnodesz;
	uint64_t dnodecount;

	/*
	 * There is a chicken-and-egg problem here when writing the MOS: we
	 * cannot write space maps before we're finished allocating space from
	 * the vdev, and we can't write the MOS without having allocated space
	 * for indirect dnode blocks.  Thus, rather than lazily allocating
	 * indirect blocks for the meta-dnode (which would be simpler), they
	 * are allocated up-front and before writing space maps.
	 */
	dnodecount = os->dnodecount;
	if (os == zfs->mos)
		dnodecount += zfs->mscount;
	dnodesz = dnodecount * sizeof(dnode_phys_t);
	c = dnode_cursor_init(zfs, os, &os->phys->os_meta_dnode, dnodesz,
	    DNODE_BLOCK_SIZE);
	dnodesz = roundup2(dnodesz, DNODE_BLOCK_SIZE);
	dnodeloc = objset_space_alloc(zfs, os, &dnodesz);

	if (os == zfs->mos) {
		vdev_spacemap_write(zfs);

		/*
		 * We've finished allocating space, account for it in $MOS and
		 * in the parent directory.
		 */
		dsl_dir_size_add(zfs->mosdsldir, os->space);
		dsl_dir_size_add(zfs->rootdsldir, os->space);
	}
	_objset_write(zfs, os, c, dnodeloc);
}

/*
 * Allocate a new dnode of the given type with a bonus buffer, growing the
 * chunk list as needed.  The object ID is returned in *idp.
 */
dnode_phys_t *
objset_dnode_bonus_alloc(zfs_objset_t *os, uint8_t type, uint8_t bonustype,
    uint16_t bonuslen, uint64_t *idp)
{
	struct objset_dnode_chunk *chunk;
	dnode_phys_t *dnode;

	assert(bonuslen <= DN_OLD_MAX_BONUSLEN);
	assert(!STAILQ_EMPTY(&os->dnodechunks));

	chunk = STAILQ_LAST(&os->dnodechunks, objset_dnode_chunk, next);
	if (chunk->nextfree == DNODES_PER_CHUNK) {
		chunk = ecalloc(1, sizeof(*chunk));
		STAILQ_INSERT_TAIL(&os->dnodechunks, chunk, next);
	}
	*idp = os->dnodecount++;
	dnode = &chunk->buf[chunk->nextfree++];
	dnode_init(dnode, type, bonustype, bonuslen);
	dnode->dn_datablkszsec = os->osblksz >> MINBLOCKSHIFT;
	return (dnode);
}

/*
 * Convenience wrapper: allocate a dnode with no bonus buffer.
 */
dnode_phys_t *
objset_dnode_alloc(zfs_objset_t *os, uint8_t type, uint64_t *idp)
{
	return (objset_dnode_bonus_alloc(os, type, DMU_OT_NONE, 0, idp));
}

/*
 * Look up a physical dnode by ID.  This is not used often so a linear search
 * is fine.
 */
dnode_phys_t *
objset_dnode_lookup(zfs_objset_t *os, uint64_t id)
{
	struct objset_dnode_chunk *chunk;

	assert(id > 0);
	assert(id < os->dnodecount);

	STAILQ_FOREACH(chunk, &os->dnodechunks, next) {
		if (id < DNODES_PER_CHUNK)
			return (&chunk->buf[id]);
		id -= DNODES_PER_CHUNK;
	}
	assert(0);
	return (NULL);
}

/*
 * Allocate vdev space and charge it to this object set's accounting.
 */
off_t
objset_space_alloc(zfs_opt_t *zfs, zfs_objset_t *os, off_t *lenp)
{
	off_t loc;

	loc = vdev_space_alloc(zfs, lenp);
	os->space += *lenp;
	return (loc);
}

/* Return the number of bytes allocated to this object set. */
uint64_t
objset_space(const zfs_objset_t *os)
{
	return (os->space);
}

/* Copy the object set's root block pointer (valid after objset_write()). */
void
objset_root_blkptr_copy(const zfs_objset_t *os, blkptr_t *bp)
{
	memcpy(bp, &os->osbp, sizeof(blkptr_t));
}
diff --git a/usr.sbin/makefs/zfs/vdev.c b/usr.sbin/makefs/zfs/vdev.c
index 1709a828b7c5..63a6e7289957 100644
--- a/usr.sbin/makefs/zfs/vdev.c
+++ b/usr.sbin/makefs/zfs/vdev.c
@@ -1,435 +1,436 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* NOTE(review): the <header> names below were lost in extraction; restore from upstream. */
#include
#include
+#include
#include
#include
#include
#include "zfs.h"

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-function"
#include "zfs/fletcher.c"
#include "zfs/sha256.c"
#pragma clang diagnostic pop

/*
 * Initialize a block pointer describing a single uncompressed, host-endian
 * on-disk block with one DVA on vdev 0.
 */
static void
blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level,
    uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum)
{
	dva_t *dva;

	assert(powerof2(size));

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_CHECKSUM(bp, cksumt);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	BP_SET_BIRTH(bp, TXG, TXG);
	BP_SET_LEVEL(bp, level);
	BP_SET_FILL(bp, fill);
	BP_SET_TYPE(bp, dntype);

	dva = BP_IDENTITY(bp);
	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, off);
	DVA_SET_ASIZE(dva, size);
	memcpy(&bp->blk_cksum, cksum, sizeof(*cksum));
}

/*
 * Write a block of data to the vdev.  The offset is always relative to the
 * end of the second leading vdev label.
 *
 * Consumers should generally use the helpers below, which provide block
 * pointers and update dnode accounting, rather than calling this function
 * directly.
 */
static void
vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
{
	ssize_t n;

	assert(off >= 0 && off < zfs->asize);
	assert(powerof2(len));
	assert((off_t)len > 0 && off + (off_t)len > off &&
	    off + (off_t)len < zfs->asize);
	if (zfs->spacemap != NULL) {
		/*
		 * Verify that the blocks being written were in fact allocated.
		 *
		 * The space map isn't available once the on-disk space map is
		 * finalized, so this check doesn't quite catch everything.
		 */
		assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
		    (off + len - 1) >> zfs->ashift, 1));
	}

	off += VDEV_LABEL_START_SIZE;
	for (size_t sofar = 0; sofar < len; sofar += n) {
		n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
		    off + sofar);
		if (n < 0)
			err(1, "pwrite");
		assert(n > 0);
	}
}

/*
 * Checksum a data block, fill in the caller's block pointer and write the
 * block to the vdev.
 */
void
vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
    blkptr_t *bp)
{
	zio_cksum_t cksum;

	assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);

	fletcher_4_native(data, sz, NULL, &cksum);
	blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum);
	vdev_pwrite(zfs, data, sz, loc);
}

/*
 * Like vdev_pwrite_data(), but also charges the written bytes to the owning
 * dnode's space accounting.
 */
void
vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
{
	vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill,
	    data, sz, loc, bp);

	assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);
	dnode->dn_used += sz;
}

/*
 * Write a dnode's single (level 0) data block, storing the block pointer in
 * the dnode itself.
 */
void
vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
    off_t sz, off_t loc)
{
	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc,
	    &dnode->dn_blkptr[0]);
}

/*
 * Embed a ZIO verifier checksum (zio_eck_t) at the end of a label sub-block.
 * "off" is the sub-block's absolute offset, used as the checksum seed.
 */
static void
vdev_label_set_checksum(void *buf, off_t off, off_t size)
{
	zio_cksum_t cksum;
	zio_eck_t *eck;

	assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));

	eck = (zio_eck_t *)((char *)buf + size) - 1;
	eck->zec_magic = ZEC_MAGIC;
	ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0);
	zio_checksum_SHA256(buf, size, NULL, &cksum);
	eck->zec_cksum = cksum;
}

/*
 * Set embedded checksums and write the label at the specified index.
 */
void
vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
{
	vdev_label_t *label;
	ssize_t n;
	off_t blksz, loff;

	assert(ind >= 0 && ind < VDEV_LABELS);

	/*
	 * Make a copy since we have to modify the label to set checksums.
	 */
	label = ecalloc(1, sizeof(*label));
	memcpy(label, labelp, sizeof(*label));

	/* Labels 0 and 1 lead the vdev; the rest trail it. */
	if (ind < 2)
		loff = ind * sizeof(*label);
	else
		loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);

	/*
	 * Set the verifier checksum for the boot block.  We don't use it, but
	 * the FreeBSD loader reads it and will complain if the checksum isn't
	 * valid.
	 */
	vdev_label_set_checksum(&label->vl_be,
	    loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));

	/*
	 * Set the verifier checksum for the label.
	 */
	vdev_label_set_checksum(&label->vl_vdev_phys,
	    loff + __offsetof(vdev_label_t, vl_vdev_phys),
	    sizeof(label->vl_vdev_phys));

	/*
	 * Set the verifier checksum for the uberblocks.  There is one
	 * uberblock per sector; for example, with an ashift of 12 we end up
	 * with 128KB/4KB=32 copies of the uberblock in the ring.
	 */
	blksz = 1 << zfs->ashift;
	assert(sizeof(label->vl_uberblock) % blksz == 0);
	for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
	    roff += blksz) {
		vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
		    loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
		    blksz);
	}

	n = pwrite(zfs->fd, label, sizeof(*label), loff);
	if (n < 0)
		err(1, "writing vdev label");
	assert(n == sizeof(*label));

	free(label);
}

/*
 * Find a chunk of contiguous free space of length *lenp, according to the
 * following rules:
 * 1. If the length is less than or equal to 128KB, the returned run's length
 *    will be the smallest power of 2 equal to or larger than the length.
 * 2. If the length is larger than 128KB, the returned run's length will be
 *    the smallest multiple of 128KB that is larger than the length.
 * 3. The returned run's length will be size-aligned up to 128KB.
 *
 * XXX-MJ the third rule isn't actually required, so this can just be a dumb
 * bump allocator.  Maybe there's some benefit to keeping large blocks
 * aligned, so let's keep it for now and hope we don't get too much
 * fragmentation.  Alternately we could try to allocate all blocks of a
 * certain size from the same metaslab.
 */
off_t
vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
{
	off_t len;
	int align, loc, minblksz, nbits;

	minblksz = 1 << zfs->ashift;
	len = roundup2(*lenp, minblksz);

	assert(len != 0);
	assert(len / minblksz <= INT_MAX);

	if (len < MAXBLOCKSIZE) {
		/* Round a non-power-of-2 length up to the next power of 2. */
		if ((len & (len - 1)) != 0)
			len = (off_t)1 << flsll(len);
		align = len / minblksz;
	} else {
		len = roundup2(len, MAXBLOCKSIZE);
		align = MAXBLOCKSIZE / minblksz;
	}

	for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) {
		bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
		    &loc);
		if (loc == -1) {
			errx(1, "failed to find %ju bytes of space",
			    (uintmax_t)len);
		}
		if ((loc & (align - 1)) == 0)
			break;
	}
	assert(loc + nbits > loc);
	bit_nset(zfs->spacemap, loc, loc + nbits - 1);
	*lenp = len;

	return ((off_t)loc << zfs->ashift);
}

/*
 * Size and allocate the in-memory allocation bitmap, one bit per
 * minimum-sized (1 << ashift) block.
 */
static void
vdev_spacemap_init(zfs_opt_t *zfs)
{
	uint64_t nbits;

	assert(powerof2(zfs->mssize));

	nbits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift;
	if (nbits > INT_MAX) {
		/*
		 * With the smallest block size of 512B, the limit on the
		 * image size is 2TB.  That should be enough for anyone.
		 */
		errx(1, "image size is too large");
	}
	zfs->spacemapbits = (int)nbits;
	zfs->spacemap = bit_alloc(zfs->spacemapbits);
	if (zfs->spacemap == NULL)
		err(1, "bitstring allocation failed");
}

/*
 * Convert the in-memory allocation bitmap into per-metaslab on-disk space
 * maps (SM2-encoded ALLOC records) and write them, together with the object
 * array that indexes them.
 */
void
vdev_spacemap_write(zfs_opt_t *zfs)
{
	dnode_phys_t *objarr;
	bitstr_t *spacemap;
	uint64_t *objarrblk;
	off_t smblksz, objarrblksz, objarrloc;

	struct {
		dnode_phys_t *dnode;
		uint64_t dnid;
		off_t loc;
	} *sma;

	objarrblksz = sizeof(uint64_t) * zfs->mscount;
	assert(objarrblksz <= MAXBLOCKSIZE);
	objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz);
	objarrblk = ecalloc(1, objarrblksz);

	objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid);
	objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;

	/*
	 * Use the smallest block size for space maps.  The space allocation
	 * algorithm should aim to minimize the number of holes.
	 */
	smblksz = 1 << zfs->ashift;

	/*
	 * First allocate dnodes and space for all of our space maps.  No more
	 * space can be allocated from the vdev after this point.
	 */
	sma = ecalloc(zfs->mscount, sizeof(*sma));
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos,
		    DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER,
		    sizeof(space_map_phys_t), &sma[i].dnid);
		sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz);
	}
	spacemap = zfs->spacemap;
	zfs->spacemap = NULL;

	/*
	 * Now that the set of allocated space is finalized, populate each
	 * space map and write it to the vdev.
	 */
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		space_map_phys_t *sm;
		uint64_t alloc, length, *smblk;
		int shift, startb, endb, srunb, erunb;

		/*
		 * We only allocate a single block for this space map, but
		 * OpenZFS assumes that a space map object with sufficient
		 * bonus space supports histograms.
		 */
		sma[i].dnode->dn_nblkptr = 3;
		sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;

		smblk = ecalloc(1, smblksz);

		alloc = length = 0;
		shift = zfs->msshift - zfs->ashift;
		for (srunb = startb = i * (1 << shift),
		    endb = (i + 1) * (1 << shift);
		    srunb < endb; srunb = erunb) {
			uint64_t runlen, runoff;

			/* Find a run of allocated space. */
			bit_ffs_at(spacemap, srunb, zfs->spacemapbits,
			    &srunb);
			if (srunb == -1 || srunb >= endb)
				break;

			bit_ffc_at(spacemap, srunb, zfs->spacemapbits,
			    &erunb);
			if (erunb == -1 || erunb > endb)
				erunb = endb;

			/*
			 * The space represented by [srunb, erunb) has been
			 * allocated.  Add a record to the space map to
			 * indicate this.  Run offsets are relative to the
			 * beginning of the metaslab.
			 */
			runlen = erunb - srunb;
			runoff = srunb - startb;

			assert(length * sizeof(uint64_t) <
			    (uint64_t)smblksz);
			smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
			smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
			    SM2_OFFSET_ENCODE(runoff);

			alloc += runlen << zfs->ashift;
			length += 2;
		}

		sm = DN_BONUS(sma[i].dnode);
		sm->smp_length = length * sizeof(uint64_t);
		sm->smp_alloc = alloc;

		vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
		    sma[i].loc);
		free(smblk);

		/* Record this space map in the space map object array. */
		objarrblk[i] = sma[i].dnid;
	}

	/*
	 * All of the space maps are written, now write the object array.
	 */
	vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz,
	    objarrloc);
	free(objarrblk);

	assert(zfs->spacemap == NULL);
	free(spacemap);

	free(sma);
}

/*
 * Create the image file at its final size and set up the allocation bitmap.
 */
void
vdev_init(zfs_opt_t *zfs, const char *image)
{
	assert(zfs->ashift >= MINBLOCKSHIFT);

	zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (zfs->fd == -1)
		err(1, "Can't open `%s' for writing", image);
	if (ftruncate(zfs->fd, zfs->vdevsize) != 0)
		err(1, "Failed to extend image file `%s'", image);

	vdev_spacemap_init(zfs);
}

/*
 * Close the image file; the space map must already have been handed off to
 * vdev_spacemap_write().
 */
void
vdev_fini(zfs_opt_t *zfs)
{
	assert(zfs->spacemap == NULL);

	if (zfs->fd != -1) {
		if (close(zfs->fd) != 0)
			err(1, "close");
		zfs->fd = -1;
	}
}
diff --git a/usr.sbin/makefs/zfs/zap.c b/usr.sbin/makefs/zfs/zap.c
index 398c0fbf029c..33ca2650cf3d 100644
--- a/usr.sbin/makefs/zfs/zap.c
+++ b/usr.sbin/makefs/zfs/zap.c
@@ -1,551 +1,552 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
#include
#include
#include
+#include
#include
#include

#include "makefs.h"

#include "zfs.h"

/* A single key-value pair staged for an in-memory ZAP under construction. */
typedef struct zfs_zap_entry {
    char *name;         /* entry key, private copy */
    uint64_t hash;      /* key hash */
    union {
        uint8_t *valp;
        uint16_t *val16p;
        uint32_t *val32p;
        uint64_t *val64p;
    };                  /* entry value, an integer array */
    uint64_t val64;     /* embedded value for a common case */
    size_t intsz;       /* array element size; 1, 2, 4 or 8 */
    size_t intcnt;      /* array size */
    STAILQ_ENTRY(zfs_zap_entry) next;
} zfs_zap_entry_t;

/* In-memory ZAP: entries are accumulated here, then serialized by zap_write(). */
struct zfs_zap {
    STAILQ_HEAD(, zfs_zap_entry) kvps;
    uint64_t hashsalt;      /* key hash input */
    unsigned long kvpcnt;   /* number of key-value pairs */
    unsigned long chunks;   /* count of chunks needed for fat ZAP */
    bool micro;             /* can this be a micro ZAP? */
    dnode_phys_t *dnode;    /* backpointer */
    zfs_objset_t *os;       /* backpointer */
};

/*
 * Number of fat ZAP leaf chunks needed to store this entry: one header chunk
 * plus enough array chunks for the name and for the value.
 */
static uint16_t
zap_entry_chunks(zfs_zap_entry_t *ent)
{
    return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) +
        howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES));
}

/*
 * Salted CRC64 hash of a ZAP key, matching the on-disk ZAP hash.  The CRC
 * table is built lazily on first use; the table-complete check relies on
 * crc64_table[128] ending up equal to the polynomial.
 */
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
    static uint64_t crc64_table[256];

    const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
    const uint8_t *cp;
    uint64_t crc;
    uint8_t c;

    assert(salt != 0);
    if (crc64_table[128] == 0) {
        for (int i = 0; i < 256; i++) {
            uint64_t *t;

            t = crc64_table + i;
            *t = i;
            for (int j = 8; j > 0; j--)
                *t = (*t >> 1) ^ (-(*t & 1) & crc64_poly);
        }
    }
    assert(crc64_table[128] == crc64_poly);

    for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++)
        crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF];

    /*
     * Only use 28 bits, since we need 4 bits in the cookie for the
     * collision differentiator.  We MUST use the high bits, since
     * those are the ones that we first pay attention to when
     * choosing the bucket.
     */
    crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

    return (crc);
}

/*
 * Allocate an empty in-memory ZAP backed by the given dnode.  The caller
 * populates it with zap_add*() and finalizes it with zap_write(), which also
 * frees it.
 */
zfs_zap_t *
zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode)
{
    zfs_zap_t *zap;

    zap = ecalloc(1, sizeof(*zap));
    STAILQ_INIT(&zap->kvps);
    zap->hashsalt = ((uint64_t)random() << 32) | random();
    zap->micro = true;
    zap->kvpcnt = 0;
    zap->chunks = 0;
    zap->dnode = dnode;
    zap->os = os;
    return (zap);
}

/*
 * Add a key-value pair; the value is an array of "intcnt" integers of width
 * "intsz" bytes.  The name and value are copied.  Adding an entry that cannot
 * be represented in a micro ZAP demotes the whole ZAP to a fat ZAP.
 */
void
zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
    const uint8_t *val)
{
    zfs_zap_entry_t *ent;

    assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
    assert(strlen(name) + 1 <= ZAP_MAXNAMELEN);
    assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN);

    ent = ecalloc(1, sizeof(*ent));
    ent->name = estrdup(name);
    ent->hash = zap_hash(zap->hashsalt, ent->name);
    ent->intsz = intsz;
    ent->intcnt = intcnt;
    if (intsz == sizeof(uint64_t) && intcnt == 1) {
        /*
         * Micro-optimization to elide a memory allocation in that most
         * common case where this is a directory entry.
         */
        ent->val64p = &ent->val64;
    } else {
        ent->valp = ecalloc(intcnt, intsz);
    }
    memcpy(ent->valp, val, intcnt * intsz);
    zap->kvpcnt++;
    zap->chunks += zap_entry_chunks(ent);
    STAILQ_INSERT_TAIL(&zap->kvps, ent, next);

    if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) ||
        strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX))
        zap->micro = false;
}

/* Convenience wrapper: add a single 64-bit value (micro-ZAP compatible). */
void
zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
{
    zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val);
}

/*
 * Convenience wrapper: add a NUL-terminated string as a byte array.
 * NOTE(review): passes a const char * where zap_add() takes const uint8_t *;
 * relies on an implicit pointer conversion — confirm this compiles warning-free
 * with the project's flags.
 */
void
zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
{
    zap_add(zap, name, 1, strlen(val) + 1, val);
}

/* Linear search over the staged entries for an exact key match. */
bool
zap_entry_exists(zfs_zap_t *zap, const char *name)
{
    zfs_zap_entry_t *ent;

    STAILQ_FOREACH(ent, &zap->kvps, next) {
        if (strcmp(ent->name, name) == 0)
            return (true);
    }
    return (false);
}

/*
 * Serialize a micro ZAP: a single block holding an mzap header followed by one
 * fixed-size chunk per entry.  Only reached when every entry is a single
 * uint64 with a short name (see zap_add()).
 */
static void
zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
    dnode_phys_t *dnode;
    zfs_zap_entry_t *ent;
    mzap_phys_t *mzap;
    mzap_ent_phys_t *ment;
    off_t bytes, loc;

    memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
    mzap = (mzap_phys_t *)&zfs->filebuf[0];
    mzap->mz_block_type = ZBT_MICRO;
    mzap->mz_salt = zap->hashsalt;
    mzap->mz_normflags = 0;

    /* The header struct already embeds one chunk, hence kvpcnt - 1. */
    bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment);
    assert(bytes <= (off_t)MZAP_MAX_BLKSZ);

    ment = &mzap->mz_chunk[0];
    STAILQ_FOREACH(ent, &zap->kvps, next) {
        memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt);
        ment->mze_cd = 0; /* XXX-MJ */
        strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name));
        ment++;
    }

    loc = objset_space_alloc(zfs, zap->os, &bytes);

    dnode = zap->dnode;
    dnode->dn_maxblkid = 0;
    dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;
    dnode->dn_flags = DNODE_FLAG_USED_BYTES;

    vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
}

/*
 * Write some data to the fat ZAP leaf chunk starting at index "li".
 *
 * Note that individual integers in the value may be split among consecutive
 * leaves.
*/
static void
zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
    const uint8_t *val)
{
    struct zap_leaf_array *la;

    assert(sz <= ZAP_MAXVALUELEN);

    for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
        n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);

        la = &ZAP_LEAF_CHUNK(l, li).l_array;
        assert(la->la_type == ZAP_CHUNK_FREE);
        la->la_type = ZAP_CHUNK_ARRAY;
        memcpy(la->la_array, val, n);
        la->la_next = li + 1;
    }
    /*
     * Terminate the chunk chain.  NOTE(review): "la" is only assigned inside
     * the loop, so this assumes sz > 0 — confirm no caller passes an empty
     * value.
     */
    la->la_next = 0xffff;
}

/*
 * Find the shortest hash prefix length which lets us distribute keys without
 * overflowing a leaf block.  This is not (space) optimal, but is simple, and
 * directories large enough to overflow a single 128KB leaf block are uncommon.
 */
static unsigned int
zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
{
    zfs_zap_entry_t *ent;
    unsigned int prefixlen;

    if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) {
        /*
         * All chunks will fit in a single leaf block.
         */
        return (0);
    }

    for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) {
        uint32_t *leafchunks;

        /* One chunk counter per candidate leaf (2^prefixlen leaves). */
        leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks));
        STAILQ_FOREACH(ent, &zap->kvps, next) {
            uint64_t li;
            uint16_t chunks;

            li = ZAP_HASH_IDX(ent->hash, prefixlen);
            chunks = zap_entry_chunks(ent);
            if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) {
                /*
                 * Not enough space, grow the prefix and retry.
                 */
                break;
            }
            leafchunks[li] += chunks;
        }
        free(leafchunks);

        /* ent == NULL means the FOREACH ran to completion without a break. */
        if (ent == NULL) {
            /*
             * Everything fits, we're done.
             */
            break;
        }
    }

    /*
     * If this fails, then we need to expand the pointer table.  For now
     * this situation is unhandled since it is hard to trigger.
     */
    assert(prefixlen < (unsigned int)l->l_bs);

    return (prefixlen);
}

/*
 * Initialize a fat ZAP leaf block.
*/
static void
zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
{
    zap_leaf_phys_t *leaf;

    leaf = l->l_phys;

    leaf->l_hdr.lh_block_type = ZBT_LEAF;
    leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
    leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
    leaf->l_hdr.lh_prefix = prefix;
    leaf->l_hdr.lh_prefix_len = prefixlen;

    /* Initialize the leaf hash table. */
    assert(leaf->l_hdr.lh_nfree < 0xffff);
    memset(leaf->l_hash, 0xff,
        ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash));

    /* Initialize the leaf chunks: thread them all onto the free list. */
    for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
        struct zap_leaf_free *lf;

        lf = &ZAP_LEAF_CHUNK(l, i).l_free;
        lf->lf_type = ZAP_CHUNK_FREE;
        if (i + 1 == ZAP_LEAF_NUMCHUNKS(l))
            lf->lf_next = 0xffff;
        else
            lf->lf_next = i + 1;
    }
}

/*
 * Serialize a fat ZAP: a header block (with an embedded pointer table in its
 * second half) followed by 2^prefixlen leaf blocks.  Destroys the entries'
 * value byte order (they are converted to big-endian in place), which is fine
 * since zap_write() frees them immediately afterwards.
 */
static void
zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
    struct dnode_cursor *c;
    zap_leaf_t l;
    zap_phys_t *zaphdr;
    struct zap_table_phys *zt;
    zfs_zap_entry_t *ent;
    dnode_phys_t *dnode;
    uint8_t *leafblks;
    uint64_t lblkcnt, *ptrhasht;
    off_t loc, blksz;
    size_t blkshift;
    unsigned int prefixlen;
    int ptrcnt;

    /*
     * For simplicity, always use the largest block size. This should be ok
     * since most directories will be micro ZAPs, but it's space inefficient
     * for small ZAPs and might need to be revisited.
     */
    blkshift = MAXBLOCKSHIFT;
    blksz = (off_t)1 << blkshift;

    /*
     * Embedded pointer tables give up to 8192 entries. This ought to be
     * enough for anything except massive directories.
     */
    ptrcnt = (blksz / 2) / sizeof(uint64_t);

    memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
    zaphdr = (zap_phys_t *)&zfs->filebuf[0];
    zaphdr->zap_block_type = ZBT_HEADER;
    zaphdr->zap_magic = ZAP_MAGIC;
    zaphdr->zap_num_entries = zap->kvpcnt;
    zaphdr->zap_salt = zap->hashsalt;

    l.l_bs = blkshift;
    l.l_phys = NULL;

    zt = &zaphdr->zap_ptrtbl;
    zt->zt_blk = 0;
    zt->zt_numblks = 0;
    zt->zt_shift = flsll(ptrcnt) - 1;
    zt->zt_nextblk = 0;
    zt->zt_blks_copied = 0;

    /*
     * How many leaf blocks do we need? Initialize them and update the
     * header.
     */
    prefixlen = zap_fat_write_prefixlen(zap, &l);
    lblkcnt = 1 << prefixlen;
    leafblks = ecalloc(lblkcnt, blksz);
    for (unsigned int li = 0; li < lblkcnt; li++) {
        l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
        zap_fat_write_leaf_init(&l, li, prefixlen);
    }
    zaphdr->zap_num_leafs = lblkcnt;
    zaphdr->zap_freeblk = lblkcnt + 1;

    /*
     * For each entry, figure out which leaf block it belongs to based on
     * the upper bits of its hash, allocate chunks from that leaf, and fill
     * them out.
     */
    ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
    STAILQ_FOREACH(ent, &zap->kvps, next) {
        struct zap_leaf_entry *le;
        uint16_t *lptr;
        uint64_t hi, li;
        uint16_t namelen, nchunks, nnamechunks, nvalchunks;

        hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
        li = ZAP_HASH_IDX(ent->hash, prefixlen);
        assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
        ptrhasht[hi] = li + 1;
        l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);

        namelen = strlen(ent->name) + 1;

        /*
         * How many leaf chunks do we need for this entry?
         */
        nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
        nvalchunks = howmany(ent->intcnt,
            ZAP_LEAF_ARRAY_BYTES / ent->intsz);
        nchunks = 1 + nnamechunks + nvalchunks;

        /*
         * Allocate a run of free leaf chunks for this entry,
         * potentially extending a hash chain.
         */
        assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
        l.l_phys->l_hdr.lh_nfree -= nchunks;
        l.l_phys->l_hdr.lh_nentries++;
        lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
        while (*lptr != 0xffff) {
            assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
            le = ZAP_LEAF_ENTRY(&l, *lptr);
            assert(le->le_type == ZAP_CHUNK_ENTRY);
            /* Same-bucket collision: bump the collision differentiator. */
            le->le_cd++;
            lptr = &le->le_next;
        }
        *lptr = l.l_phys->l_hdr.lh_freelist;
        l.l_phys->l_hdr.lh_freelist += nchunks;
        assert(l.l_phys->l_hdr.lh_freelist <= ZAP_LEAF_NUMCHUNKS(&l));
        if (l.l_phys->l_hdr.lh_freelist == ZAP_LEAF_NUMCHUNKS(&l))
            l.l_phys->l_hdr.lh_freelist = 0xffff;

        /*
         * Integer values must be stored in big-endian format.
         */
        switch (ent->intsz) {
        case 1:
            break;
        case 2:
            for (uint16_t *v = ent->val16p;
                v - ent->val16p < (ptrdiff_t)ent->intcnt; v++)
                *v = htobe16(*v);
            break;
        case 4:
            for (uint32_t *v = ent->val32p;
                v - ent->val32p < (ptrdiff_t)ent->intcnt; v++)
                *v = htobe32(*v);
            break;
        case 8:
            for (uint64_t *v = ent->val64p;
                v - ent->val64p < (ptrdiff_t)ent->intcnt; v++)
                *v = htobe64(*v);
            break;
        default:
            assert(0);
        }

        /*
         * Finally, write out the leaf chunks for this entry.
         */
        le = ZAP_LEAF_ENTRY(&l, *lptr);
        assert(le->le_type == ZAP_CHUNK_FREE);
        le->le_type = ZAP_CHUNK_ENTRY;
        le->le_next = 0xffff;
        le->le_name_chunk = *lptr + 1;
        le->le_name_numints = namelen;
        le->le_value_chunk = *lptr + 1 + nnamechunks;
        le->le_value_intlen = ent->intsz;
        le->le_value_numints = ent->intcnt;
        le->le_hash = ent->hash;
        zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name);
        zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
            ent->intcnt * ent->intsz, ent->valp);
    }

    /*
     * Initialize unused slots of the pointer table.
     */
    for (int i = 0; i < ptrcnt; i++)
        if (ptrhasht[i] == 0)
            ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;

    /*
     * Write the whole thing to disk.
     */
    dnode = zap->dnode;
    dnode->dn_nblkptr = 1;
    dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
    /*
     * NOTE(review): there are lblkcnt + 1 blocks in total (header plus
     * leaves), so block ids run 0..lblkcnt; confirm the extra +1 here is
     * intentional.
     */
    dnode->dn_maxblkid = lblkcnt + 1;
    dnode->dn_flags = DNODE_FLAG_USED_BYTES;

    c = dnode_cursor_init(zfs, zap->os, zap->dnode,
        (lblkcnt + 1) * blksz, blksz);

    /* Block 0 is the header (with the embedded pointer table). */
    loc = objset_space_alloc(zfs, zap->os, &blksz);
    vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
        dnode_cursor_next(zfs, c, 0));

    /* Blocks 1..lblkcnt are the leaves. */
    for (uint64_t i = 0; i < lblkcnt; i++) {
        loc = objset_space_alloc(zfs, zap->os, &blksz);
        vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
            blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
    }

    dnode_cursor_finish(zfs, c);
    free(leafblks);
}

/*
 * Serialize the ZAP (micro or fat as appropriate) and free it along with all
 * of its staged entries.  Embedded single-uint64 values (val64p pointing at
 * val64) are not separately allocated and so are not freed.
 */
void
zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
    zfs_zap_entry_t *ent;

    if (zap->micro) {
        zap_micro_write(zfs, zap);
    } else {
        assert(!STAILQ_EMPTY(&zap->kvps));
        assert(zap->kvpcnt > 0);
        zap_fat_write(zfs, zap);
    }

    while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) {
        STAILQ_REMOVE_HEAD(&zap->kvps, next);
        if (ent->val64p != &ent->val64)
            free(ent->valp);
        free(ent->name);
        free(ent);
    }
    free(zap);
}
diff --git a/usr.sbin/makefs/zfs/zfs.h b/usr.sbin/makefs/zfs/zfs.h
index 6b743b40b3ab..1e37bc54395a 100644
--- a/usr.sbin/makefs/zfs/zfs.h
+++ b/usr.sbin/makefs/zfs/zfs.h
@@ -1,171 +1,172 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
*
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _MAKEFS_ZFS_H_
#define _MAKEFS_ZFS_H_

#include
+#include
#include
#include
#include
#include

#include "makefs.h"

#include "zfs/nvlist.h"
#define ASSERT assert
#include "zfs/zfsimpl.h"

#define MAXBLOCKSHIFT 17 /* 128KB */
#define MAXBLOCKSIZE ((off_t)(1 << MAXBLOCKSHIFT))
_Static_assert(MAXBLOCKSIZE == SPA_OLDMAXBLOCKSIZE, "");
#define MINBLOCKSHIFT 9 /* 512B */
#define MINBLOCKSIZE ((off_t)(1 << MINBLOCKSHIFT))
_Static_assert(MINBLOCKSIZE == SPA_MINBLOCKSIZE, "");
#define MINDEVSIZE ((off_t)SPA_MINDEVSIZE)

/* All data was written in this transaction group. */
#define TXG 4

typedef struct zfs_dsl_dataset zfs_dsl_dataset_t;
typedef struct zfs_dsl_dir zfs_dsl_dir_t;
typedef struct zfs_objset zfs_objset_t;
typedef struct zfs_zap zfs_zap_t;

/* One "-o fs=..." style dataset description, parsed later by dsl.c. */
struct dataset_desc {
    char *params;
    STAILQ_ENTRY(dataset_desc) next;
};

/*
 * Top-level state for building a ZFS image: user-supplied pool parameters
 * plus all of the in-progress pool, MOS, DSL and vdev bookkeeping.
 */
typedef struct {
    /*
     * Block buffer, needs to be aligned for various on-disk structures,
     * ZAPs, etc..
     */
    char filebuf[MAXBLOCKSIZE] __aligned(alignof(uint64_t));

    bool nowarn;

    /* Pool parameters. */
    const char *poolname;
    char *rootpath;         /* implicit mount point prefix */
    char *bootfs;           /* bootable dataset, pool property */
    int ashift;             /* vdev block size */
    uint64_t mssize;        /* metaslab size */
    STAILQ_HEAD(, dataset_desc) datasetdescs; /* non-root dataset descrs */

    /* Pool state. */
    uint64_t poolguid;      /* pool and root vdev GUID */
    zfs_zap_t *poolprops;

    /* MOS state. */
    zfs_objset_t *mos;      /* meta object set */
    uint64_t objarrid;      /* space map object array */

    /* DSL state. */
    zfs_dsl_dir_t *rootdsldir; /* root DSL directory */
    zfs_dsl_dataset_t *rootds;
    zfs_dsl_dir_t *origindsldir; /* $ORIGIN */
    zfs_dsl_dataset_t *originds;
    zfs_dsl_dataset_t *snapds;
    zfs_zap_t *cloneszap;
    zfs_dsl_dir_t *freedsldir; /* $FREE */
    zfs_dsl_dir_t *mosdsldir; /* $MOS */

    /* vdev state. */
    int fd;                 /* vdev disk fd */
    uint64_t vdevguid;      /* disk vdev GUID */
    off_t vdevsize;         /* vdev size, including labels */
    off_t asize;            /* vdev size, excluding labels */
    bitstr_t *spacemap;     /* space allocation tracking */
    int spacemapbits;       /* one bit per ashift-sized block */
    uint64_t msshift;       /* log2(metaslab size) */
    uint64_t mscount;       /* number of metaslabs for this vdev */
} zfs_opt_t;

/* dsl.c */
void dsl_init(zfs_opt_t *);
const char *dsl_dir_fullname(const zfs_dsl_dir_t *);
uint64_t dsl_dir_id(zfs_dsl_dir_t *);
uint64_t dsl_dir_dataset_id(zfs_dsl_dir_t *);
void dsl_dir_foreach(zfs_opt_t *, zfs_dsl_dir_t *,
    void (*)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *);
int dsl_dir_get_canmount(zfs_dsl_dir_t *, uint64_t *);
char *dsl_dir_get_mountpoint(zfs_opt_t *, zfs_dsl_dir_t *);
bool dsl_dir_has_dataset(zfs_dsl_dir_t *);
bool dsl_dir_dataset_has_objset(zfs_dsl_dir_t *);
void dsl_dir_dataset_write(zfs_opt_t *, zfs_objset_t *, zfs_dsl_dir_t *);
void dsl_dir_size_add(zfs_dsl_dir_t *, uint64_t);
void dsl_write(zfs_opt_t *);

/* fs.c */
void fs_build(zfs_opt_t *, int, fsnode *);

/* objset.c */
zfs_objset_t *objset_alloc(zfs_opt_t *zfs, uint64_t type);
off_t objset_space_alloc(zfs_opt_t *, zfs_objset_t *, off_t *);
dnode_phys_t *objset_dnode_alloc(zfs_objset_t *, uint8_t, uint64_t *);
dnode_phys_t *objset_dnode_bonus_alloc(zfs_objset_t *, uint8_t, uint8_t,
    uint16_t, uint64_t *);
dnode_phys_t *objset_dnode_lookup(zfs_objset_t *, uint64_t);
void objset_root_blkptr_copy(const zfs_objset_t *, blkptr_t *);
uint64_t objset_space(const zfs_objset_t *);
void objset_write(zfs_opt_t *zfs, zfs_objset_t *os);

/* vdev.c */
void vdev_init(zfs_opt_t *, const char *);
off_t vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp);
void vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
    blkptr_t *bp);
void vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp);
void vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode,
    const void *data, off_t sz, off_t loc);
void vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp);
void vdev_spacemap_write(zfs_opt_t *);
void vdev_fini(zfs_opt_t *zfs);

/* zap.c */
zfs_zap_t *zap_alloc(zfs_objset_t *, dnode_phys_t *);
void zap_add(zfs_zap_t *, const char *, size_t, size_t, const uint8_t *);
void zap_add_uint64(zfs_zap_t *, const char *, uint64_t);
void zap_add_string(zfs_zap_t *, const char *, const char *);
bool zap_entry_exists(zfs_zap_t *, const char *);
void zap_write(zfs_opt_t *, zfs_zap_t *);

/* zfs.c */
struct dnode_cursor *dnode_cursor_init(zfs_opt_t *, zfs_objset_t *,
    dnode_phys_t *, off_t, off_t);
blkptr_t *dnode_cursor_next(zfs_opt_t *, struct dnode_cursor *, off_t);
void dnode_cursor_finish(zfs_opt_t *, struct dnode_cursor *);

#endif /* !_MAKEFS_ZFS_H_ */