diff --git a/usr.sbin/makefs/Makefile b/usr.sbin/makefs/Makefile index 3fea648f9383..fe472d7e7309 100644 --- a/usr.sbin/makefs/Makefile +++ b/usr.sbin/makefs/Makefile @@ -1,42 +1,53 @@ # $FreeBSD$ SRCDIR:=${.PARSEDIR:tA} .include <src.opts.mk> PROG= makefs CFLAGS+=-I${SRCDIR} SRCS= cd9660.c \ ffs.c \ makefs.c \ msdos.c \ mtree.c \ walk.c MAN= makefs.8 NO_WCAST_ALIGN= CSTD= c11 +.if ${MK_ZFS} != "no" +SRCS+= zfs.c +CFLAGS+=-I${SRCDIR}/zfs \ + -I${SRCTOP}/stand/libsa \ + -I${SRCTOP}/sys/cddl/boot + +CFLAGS+= -DHAVE_ZFS + +.include "${SRCDIR}/zfs/Makefile.inc" +.endif + .include "${SRCDIR}/cd9660/Makefile.inc" .include "${SRCDIR}/ffs/Makefile.inc" .include "${SRCDIR}/msdos/Makefile.inc" CFLAGS+=-DHAVE_STRUCT_STAT_ST_FLAGS=1 .PATH: ${SRCTOP}/contrib/mtree CFLAGS+=-I${SRCTOP}/contrib/mtree SRCS+= getid.c misc.c spec.c .PATH: ${SRCTOP}/contrib/mknod CFLAGS+=-I${SRCTOP}/contrib/mknod SRCS+= pack_dev.c CFLAGS+= -I${SRCTOP}/lib/libnetbsd LIBADD= netbsd util sbuf HAS_TESTS= SUBDIR.${MK_TESTS}+= tests .include <bsd.prog.mk> diff --git a/usr.sbin/makefs/makefs.8 b/usr.sbin/makefs/makefs.8 index fdf8d532b69f..464583eab3a1 100644 --- a/usr.sbin/makefs/makefs.8 +++ b/usr.sbin/makefs/makefs.8 @@ -1,521 +1,612 @@ .\" $NetBSD: makefs.8,v 1.33 2011/05/22 21:51:39 christos Exp $ .\" .\" Copyright (c) 2001-2003 Wasabi Systems, Inc. .\" All rights reserved. .\" .\" Written by Luke Mewburn for Wasabi Systems, Inc. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. 
All advertising materials mentioning features or use of this software .\" must display the following acknowledgement: .\" This product includes software developed for the NetBSD Project by .\" Wasabi Systems, Inc. .\" 4. The name of Wasabi Systems, Inc. may not be used to endorse .\" or promote products derived from this software without specific prior .\" written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED .\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR .\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC .\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd September 17, 2020 +.Dd August 5, 2022 .Dt MAKEFS 8 .Os .Sh NAME .Nm makefs .Nd create a file system image from a directory tree or a mtree manifest .Sh SYNOPSIS .Nm .Op Fl DxZ .Op Fl B Ar endian .Op Fl b Ar free-blocks .Op Fl d Ar debug-mask .Op Fl F Ar mtree-specfile .Op Fl f Ar free-files .Op Fl M Ar minimum-size .Op Fl m Ar maximum-size .Op Fl N Ar userdb-dir .Op Fl O Ar offset .Op Fl o Ar fs-options .Op Fl R Ar roundup-size .Op Fl S Ar sector-size .Op Fl s Ar image-size .Op Fl T Ar timestamp .Op Fl t Ar fs-type .Ar image-file .Ar directory | manifest .Op Ar extra-directory ... .Sh DESCRIPTION The utility .Nm creates a file system image into .Ar image-file from the directory tree .Ar directory or from the mtree manifest .Ar manifest . 
If any optional directory trees are passed in the .Ar extra-directory arguments, then the directory tree of each argument will be merged into the .Ar directory or .Ar manifest first before creating .Ar image-file . No special devices or privileges are required to perform this task. .Pp The options are as follows: .Bl -tag -width flag .It Fl B Ar endian Set the byte order of the image to .Ar endian . Valid byte orders are .Ql 4321 , .Ql big , or .Ql be for big endian, and .Ql 1234 , .Ql little , or .Ql le for little endian. Some file systems may have a fixed byte order; in those cases this argument will be ignored. .It Fl b Ar free-blocks Ensure that a minimum of .Ar free-blocks free blocks exist in the image. An optional .Ql % suffix may be provided to indicate that .Ar free-blocks indicates a percentage of the calculated image size. .It Fl D Treat duplicate paths in an mtree manifest as warnings rather than errors. .It Fl d Ar debug-mask Enable various levels of debugging, depending upon which bits are set in .Ar debug-mask . XXX: document these .It Fl F Ar mtree-specfile .Em This is almost certainly not the option you are looking for. To create an image from a list of files in an mtree format manifest, specify it as the last argument on the command line, not as the argument to .Fl F . .Pp Use .Ar mtree-specfile as an .Xr mtree 8 .Sq specfile specification. This option has no effect when the image is created from an mtree manifest rather than a directory. .Pp If a specfile entry exists in the underlying file system, its permissions and modification time will be used unless specifically overridden by the specfile. An error will be raised if the type of entry in the specfile conflicts with that of an existing entry. .Pp In the opposite case (where a specfile entry does not have an entry in the underlying file system) the following occurs: If the specfile entry is marked .Sy optional , the specfile entry is ignored. 
Otherwise, the entry will be created in the image, and it is necessary to specify at least the following parameters in the specfile: .Sy type , .Sy mode , .Sy gname , or .Sy gid , and .Sy uname or .Sy uid , and .Sy link (in the case of symbolic links). If .Sy time is not provided, the current time will be used. If .Sy flags is not provided, the current file flags will be used. Missing regular file entries will be created as zero-length files. .It Fl f Ar free-files Ensure that a minimum of .Ar free-files free files (inodes) exist in the image. An optional .Ql % suffix may be provided to indicate that .Ar free-files indicates a percentage of the calculated image size. .It Fl M Ar minimum-size Set the minimum size of the file system image to .Ar minimum-size . .It Fl m Ar maximum-size Set the maximum size of the file system image to .Ar maximum-size . An error will be raised if the target file system needs to be larger than this to accommodate the provided directory tree. .It Fl N Ar userdb-dir Use the user database text file .Pa master.passwd and group database text file .Pa group from .Ar userdb-dir , rather than using the results from the system's .Xr getpwnam 3 and .Xr getgrnam 3 (and related) library calls. .It Fl O Ar offset Instead of creating the filesystem at the beginning of the file, start at offset. Valid only for .Sy ffs and .Sy msdos . .It Fl o Ar fs-options Set file system specific options. .Ar fs-options is a comma separated list of options. Valid file system specific options are detailed below. .It Fl p Deprecated. See the .Fl Z flag. .It Fl R Ar roundup-size Round the image up to .Ar roundup-size . .Ar roundup-size should be a multiple of the file system block size. This option only applies to the .Sy ffs file system type. .It Fl S Ar sector-size Set the file system sector size to .Ar sector-size . .\" XXX: next line also true for cd9660? Defaults to 512. .It Fl s Ar image-size Set the size of the file system image to .Ar image-size . 
This is equivalent to setting both the minimum .Fl ( M ) and the maximum .Fl ( m ) sizes to the same value. For .Sy ffs and .Sy msdos the .Ar image-size does not include the .Ar offset . .It Fl T Ar timestamp Specify a timestamp to be set for all filesystem files and directories created so that repeatable builds are possible. The .Ar timestamp can be a .Pa pathname , where the timestamps are derived from that file, or an integer value interpreted as the number of seconds from the Epoch. Note that timestamps specified in an .Xr mtree 5 spec file override the default timestamp. .It Fl t Ar fs-type Create an .Ar fs-type file system image. The following file system types are supported: .Bl -tag -width cd9660 -offset indent .It Sy ffs BSD fast file system (default). .It Sy cd9660 ISO 9660 file system. .It Sy msdos FAT12, FAT16, or FAT32 file system. +.It Sy zfs +ZFS pool containing one or more file systems. .El .It Fl x Exclude file system nodes not explicitly listed in the specfile. .It Fl Z Create a sparse file for .Sy ffs . This is useful for virtual machine images. .El .Pp Where sizes are specified, a decimal number of bytes is expected. Two or more numbers may be separated by an .Dq x to indicate a product. Each number may have one of the following optional suffixes: .Bl -tag -width 3n -offset indent -compact .It b Block; multiply by 512 .It k Kibi; multiply by 1024 (1 KiB) .It m Mebi; multiply by 1048576 (1 MiB) .It g Gibi; multiply by 1073741824 (1 GiB) .It t Tebi; multiply by 1099511627776 (1 TiB) .It w Word; multiply by the number of bytes in an integer .El .\" .\" .Ss FFS-specific options .Sy ffs images have ffs-specific optional parameters that may be provided. Each of the options consists of a keyword, an equal sign .Pq Ql = , and a value. The following keywords are supported: .Pp .Bl -tag -width optimization -offset indent -compact .It Sy avgfilesize Expected average file size. 
.It Sy avgfpdir Expected number of files per directory. .It Sy bsize Block size. .It Sy density Bytes per inode. If unset, will allocate the minimum number of inodes to represent the filesystem if no free space has been requested (free blocks or minimum size set); otherwise the larger of the newfs defaults or what is required by the free inode parameters if set. .It Sy fsize Fragment size. .It Sy label Label name of the image. .It Sy maxbpg Maximum blocks per file in a cylinder group. .It Sy minfree Minimum % free. .It Sy optimization Optimization preference; one of .Ql space or .Ql time . .It Sy extent Maximum extent size. .It Sy maxbpcg Maximum total number of blocks in a cylinder group. .It Sy version UFS version. 1 for FFS (default), 2 for UFS2. .It Sy softupdates 0 for disable (default), 1 for enable .El .Ss CD9660-specific options .Sy cd9660 images have ISO9660-specific optional parameters that may be provided. The arguments consist of a keyword and, optionally, an equal sign .Pq Ql = , and a value. The following keywords are supported: .Pp .Bl -tag -width omit-trailing-period -offset indent -compact .It Sy allow-deep-trees Allow the directory structure to exceed the maximum specified in the spec. .It Sy allow-illegal-chars Allow illegal characters in filenames. This option is not implemented. .It Sy allow-lowercase Allow lowercase characters in filenames. This option is not implemented. .It Sy allow-max-name Allow 37 instead of 33 characters for filenames by omitting the version id. .It Sy allow-multidot Allow multiple dots in a filename. .It Sy applicationid Application ID of the image. .It Sy archimedes Use the .Ql ARCHIMEDES extension to encode .Tn RISC OS metadata. .It Sy bootimagedir Boot image directory. This option is not implemented. .It Sy chrp-boot Write an MBR partition table to the image to allow older CHRP hardware to boot. .It Sy boot-load-segment Set load segment for the boot image. 
.It Sy bootimage Filename of a boot image in the format .Dq sysid;filename , where .Dq sysid is one of .Ql efi , .Ql i386 , .Ql mac68k , .Ql macppc , or .Ql powerpc . .It Sy generic-bootimage Load a generic boot image into the first 32K of the cd9660 image. .It Sy hard-disk-boot Boot image is a hard disk image. .It Sy isolevel An integer representing the ISO 9660 interchange level where .Dq level is either .Ql 1 or .Ql 2 . .Dq level .Ql 3 is not implemented. .It Sy keep-bad-images Do not discard images whose write was aborted due to an error. For debugging purposes. .It Sy label Label name of the image. .It Sy no-boot Boot image is not bootable. .It Sy no-emul-boot Boot image is a .Dq no emulation ElTorito image. .It Sy no-trailing-padding Do not pad the image (apparently Linux needs the padding). .It Sy omit-trailing-period Omit trailing periods in filenames. .It Sy platformid Set platform ID of section header entry of the boot image. .It Sy preparer Preparer ID of the image. .It Sy publisher Publisher ID of the image. .It Sy rockridge Use RockRidge extensions (for longer filenames, etc.). .It Sy verbose Turns on verbose output. .It Sy volumeid Volume set identifier of the image. .El .Ss msdos-specific options .Sy msdos images have MS-DOS-specific optional parameters that may be provided. The arguments consist of a keyword, an equal sign .Pq Ql = , and a value. The following keywords are supported (see .Xr newfs_msdos 8 for more details): .Pp .Bl -tag -width omit-trailing-period -offset indent -compact .It Cm backup_sector Location of the backup boot sector. .It Cm block_size Block size. .It Cm bootstrap Bootstrap file. .It Cm bytes_per_sector Bytes per sector. .It Cm create_size Create file size. .It Cm directory_entries Directory entries. .It Cm drive_heads Drive heads. .It Cm fat_type FAT type (12, 16, or 32). .It Cm floppy Preset drive parameters for standard format floppy disks (160, 180, 320, 360, 640, 720, 1200, 1232, 1440, or 2880). 
.It Cm hidden_sectors Hidden sectors. .It Cm info_sector Location of the info sector. .It Cm media_descriptor Media descriptor. .It Cm num_FAT Number of FATs. .It Cm OEM_string OEM string. .It Cm offset Offset in device. This option will be ignored if .Fl O is set to a positive number. .It Cm reserved_sectors Reserved sectors. .It Cm sectors_per_cluster Sectors per cluster. .It Cm sectors_per_fat Sectors per FAT. .It Cm sectors_per_track Sectors per track. .It Cm size File System size. .It Cm volume_id Volume ID. .It Cm volume_label Volume Label. .El +.Ss zfs-specific options +Note: ZFS support is currently considered experimental. +Do not use it for anything critical. +.Pp +The image created by +.Nm +contains a ZFS pool with a single vdev of type +.Ql disk . +The root dataset is always created implicitly and contains the entire input +directory tree unless additional datasets are specified using the options +described below. +.Pp +The arguments consist of a keyword, an equal sign +.Pq Ql = , +and a value. +The following keywords are supported: +.Pp +.Bl -tag -width omit-trailing-period -offset indent -compact +.It ashift +The base-2 logarithm of the minimum block size. +Typical values are 9 (512B blocks) and 12 (4KB blocks). +The default value is 12. +.It bootfs +The name of the bootable dataset for the pool. +Specifying this option causes the +.Ql bootfs +property to be set in the created pool. +.It mssize +The size of metaslabs in the created pool. +By default, +.Nm +allocates large (up to 512MB) metaslabs with the expectation that +the image will be auto-expanded upon first use. +This option allows the default heuristic to be overridden. +.It poolname +The name of the ZFS pool. +This option must be specified. +.It rootpath +An implicit path prefix added to dataset mountpoints. +By default it is +.Pa /<poolname> . +For creating bootable pools, the +.Va rootpath +should be set to +.Pa / . +At least one dataset must have a mountpoint equal to +.Va rootpath . 
+.It fs +Create an additional dataset. +This option may be specified multiple times. +The argument value must be of the form +.Ar <dataset>[;<prop1=v1>[;<prop2=v2>[;...]]] , +where +.Ar dataset +is the name of the dataset and must belong to the pool's namespace. +For example, with a pool name of +.Ql test +all dataset names must be prefixed by +.Ql test/ . +A dataset must exist at each level of the pool's namespace. +For example, to create +.Ql test/foo/bar , +.Ql test/foo +must be created as well. +.Pp +The dataset mountpoints determine how the datasets are populated with +files from the staged directory tree. +Conceptually, all datasets are mounted before any are populated with files. +The root of the staged directory tree is mapped to +.Va rootpath . +.Pp +Dataset properties, as described in +.Xr zfsprops 8 , +may be specified following the dataset name. +The following properties may be set for a dataset: +.Pp +.Bl -tag -compact -offset indent +.It atime +.It canmount +.It exec +.It mountpoint +.It setuid +.El +.El .Sh SEE ALSO .Xr mtree 5 , .Xr mtree 8 , -.Xr newfs 8 +.Xr newfs 8 , +.Xr zfsconcepts 8 , +.Xr zfsprops 8 , +.Xr zpoolprops 8 .Sh HISTORY The .Nm utility appeared in .Nx 1.6 . It was ported to .Fx and first appeared in .Fx 8.0 . .Sh AUTHORS .An Luke Mewburn .Aq Mt lukem@NetBSD.org (original program), .An Daniel Watt , .An Walter Deignan , .An Ryan Gabrys , .An Alan Perez-Rathke , .An Ram Vedam (cd9660 support), .An Christos Zoulas -(msdos support). +(msdos support), +.An Mark Johnston +(zfs support). diff --git a/usr.sbin/makefs/makefs.c b/usr.sbin/makefs/makefs.c index 888a2b3edea7..2a50768d3152 100644 --- a/usr.sbin/makefs/makefs.c +++ b/usr.sbin/makefs/makefs.c @@ -1,507 +1,510 @@ /* $NetBSD: makefs.c,v 1.26 2006/10/22 21:11:56 christos Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2001-2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Luke Mewburn for Wasabi Systems, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "makefs.h" #include "mtree.h" /* * list of supported file systems and dispatch functions */ typedef struct { const char *type; void (*prepare_options)(fsinfo_t *); int (*parse_options)(const char *, fsinfo_t *); void (*cleanup_options)(fsinfo_t *); void (*make_fs)(const char *, const char *, fsnode *, fsinfo_t *); } fstype_t; static fstype_t fstypes[] = { #define ENTRY(name) { \ # name, name ## _prep_opts, name ## _parse_opts, \ name ## _cleanup_opts, name ## _makefs \ } ENTRY(cd9660), ENTRY(ffs), ENTRY(msdos), +#ifdef HAVE_ZFS + ENTRY(zfs), +#endif { .type = NULL }, }; u_int debug; int dupsok; struct timespec start_time; struct stat stampst; static fstype_t *get_fstype(const char *); static int get_tstamp(const char *, struct stat *); static void usage(fstype_t *, fsinfo_t *); int main(int argc, char *argv[]) { struct stat sb; struct timeval start; fstype_t *fstype; fsinfo_t fsoptions; fsnode *root; int ch, i, len; const char *subtree; const char *specfile; setprogname(argv[0]); debug = 0; if ((fstype = get_fstype(DEFAULT_FSTYPE)) == NULL) errx(1, "Unknown default fs type `%s'.", DEFAULT_FSTYPE); /* set default fsoptions */ (void)memset(&fsoptions, 0, sizeof(fsoptions)); fsoptions.fd = -1; fsoptions.sectorsize = -1; if (fstype->prepare_options) fstype->prepare_options(&fsoptions); specfile = NULL; #ifdef CLOCK_REALTIME ch = clock_gettime(CLOCK_REALTIME, &start_time); #else ch = gettimeofday(&start, NULL); start_time.tv_sec = start.tv_sec; start_time.tv_nsec = start.tv_usec * 1000; #endif if (ch == -1) err(1, "Unable to get system time"); while ((ch = getopt(argc, argv, "B:b:Dd:f:F:M:m:N:O:o:pR:s:S:t:T:xZ")) != -1) { switch (ch) { case 'B': if (strcmp(optarg, "be") == 0 || strcmp(optarg, "4321") == 0 || strcmp(optarg, "big") == 0) { #if BYTE_ORDER == LITTLE_ENDIAN fsoptions.needswap = 1; #endif } else if 
(strcmp(optarg, "le") == 0 || strcmp(optarg, "1234") == 0 || strcmp(optarg, "little") == 0) { #if BYTE_ORDER == BIG_ENDIAN fsoptions.needswap = 1; #endif } else { warnx("Invalid endian `%s'.", optarg); usage(fstype, &fsoptions); } break; case 'b': len = strlen(optarg) - 1; if (optarg[len] == '%') { optarg[len] = '\0'; fsoptions.freeblockpc = strsuftoll("free block percentage", optarg, 0, 99); } else { fsoptions.freeblocks = strsuftoll("free blocks", optarg, 0, LLONG_MAX); } break; case 'D': dupsok = 1; break; case 'd': debug = strtoll(optarg, NULL, 0); break; case 'f': len = strlen(optarg) - 1; if (optarg[len] == '%') { optarg[len] = '\0'; fsoptions.freefilepc = strsuftoll("free file percentage", optarg, 0, 99); } else { fsoptions.freefiles = strsuftoll("free files", optarg, 0, LLONG_MAX); } break; case 'F': specfile = optarg; break; case 'M': fsoptions.minsize = strsuftoll("minimum size", optarg, 1LL, LLONG_MAX); break; case 'N': if (! setup_getid(optarg)) errx(1, "Unable to use user and group databases in `%s'", optarg); break; case 'm': fsoptions.maxsize = strsuftoll("maximum size", optarg, 1LL, LLONG_MAX); break; case 'O': fsoptions.offset = strsuftoll("offset", optarg, 0LL, LLONG_MAX); break; case 'o': { char *p; while ((p = strsep(&optarg, ",")) != NULL) { if (*p == '\0') errx(1, "Empty option"); if (! fstype->parse_options(p, &fsoptions)) usage(fstype, &fsoptions); } break; } case 'p': /* Deprecated in favor of 'Z' */ fsoptions.sparse = 1; break; case 'R': /* Round image size up to specified block size */ fsoptions.roundup = strsuftoll("roundup-size", optarg, 0, LLONG_MAX); break; case 's': fsoptions.minsize = fsoptions.maxsize = strsuftoll("size", optarg, 1LL, LLONG_MAX); break; case 'S': fsoptions.sectorsize = (int)strsuftoll("sector size", optarg, 1LL, INT_MAX); break; case 't': /* Check current one and cleanup if necessary. 
*/ if (fstype->cleanup_options) fstype->cleanup_options(&fsoptions); fsoptions.fs_specific = NULL; if ((fstype = get_fstype(optarg)) == NULL) errx(1, "Unknown fs type `%s'.", optarg); fstype->prepare_options(&fsoptions); break; case 'T': if (get_tstamp(optarg, &stampst) == -1) errx(1, "Cannot get timestamp from `%s'", optarg); break; case 'x': fsoptions.onlyspec = 1; break; case 'Z': /* Superscedes 'p' for compatibility with NetBSD makefs(8) */ fsoptions.sparse = 1; break; case '?': default: usage(fstype, &fsoptions); /* NOTREACHED */ } } if (debug) { printf("debug mask: 0x%08x\n", debug); printf("start time: %ld.%ld, %s", (long)start_time.tv_sec, (long)start_time.tv_nsec, ctime(&start_time.tv_sec)); } argc -= optind; argv += optind; if (argc < 2) usage(fstype, &fsoptions); /* -x must be accompanied by -F */ if (fsoptions.onlyspec != 0 && specfile == NULL) errx(1, "-x requires -F mtree-specfile."); /* Accept '-' as meaning "read from standard input". */ if (strcmp(argv[1], "-") == 0) sb.st_mode = S_IFREG; else { if (stat(argv[1], &sb) == -1) err(1, "Can't stat `%s'", argv[1]); } switch (sb.st_mode & S_IFMT) { case S_IFDIR: /* walk the tree */ subtree = argv[1]; TIMER_START(start); root = walk_dir(subtree, ".", NULL, NULL); TIMER_RESULTS(start, "walk_dir"); break; case S_IFREG: /* read the manifest file */ subtree = "."; TIMER_START(start); root = read_mtree(argv[1], NULL); TIMER_RESULTS(start, "manifest"); break; default: errx(1, "%s: not a file or directory", argv[1]); /* NOTREACHED */ } /* append extra directory */ for (i = 2; i < argc; i++) { if (stat(argv[i], &sb) == -1) err(1, "Can't stat `%s'", argv[i]); if (!S_ISDIR(sb.st_mode)) errx(1, "%s: not a directory", argv[i]); TIMER_START(start); root = walk_dir(argv[i], ".", NULL, root); TIMER_RESULTS(start, "walk_dir2"); } if (specfile) { /* apply a specfile */ TIMER_START(start); apply_specfile(specfile, subtree, root, fsoptions.onlyspec); TIMER_RESULTS(start, "apply_specfile"); } if (debug & DEBUG_DUMP_FSNODES) 
{ printf("\nparent: %s\n", subtree); dump_fsnodes(root); putchar('\n'); } /* build the file system */ TIMER_START(start); fstype->make_fs(argv[0], subtree, root, &fsoptions); TIMER_RESULTS(start, "make_fs"); free_fsnodes(root); exit(0); /* NOTREACHED */ } int set_option(const option_t *options, const char *option, char *buf, size_t len) { char *var, *val; int retval; assert(option != NULL); var = estrdup(option); for (val = var; *val; val++) if (*val == '=') { *val++ = '\0'; break; } retval = set_option_var(options, var, val, buf, len); free(var); return retval; } int set_option_var(const option_t *options, const char *var, const char *val, char *buf, size_t len) { char *s; size_t i; #define NUM(type) \ if (!*val) { \ *(type *)options[i].value = 1; \ break; \ } \ *(type *)options[i].value = (type)strsuftoll(options[i].desc, val, \ options[i].minimum, options[i].maximum); break for (i = 0; options[i].name != NULL; i++) { if (var[1] == '\0') { if (options[i].letter != var[0]) continue; } else if (strcmp(options[i].name, var) != 0) continue; switch (options[i].type) { case OPT_BOOL: *(bool *)options[i].value = 1; break; case OPT_STRARRAY: strlcpy((void *)options[i].value, val, (size_t) options[i].maximum); break; case OPT_STRPTR: s = estrdup(val); *(char **)options[i].value = s; break; case OPT_STRBUF: if (buf == NULL) abort(); strlcpy(buf, val, len); break; case OPT_INT64: NUM(uint64_t); case OPT_INT32: NUM(uint32_t); case OPT_INT16: NUM(uint16_t); case OPT_INT8: NUM(uint8_t); default: warnx("Unknown type %d in option %s", options[i].type, val); return 0; } return i; } warnx("Unknown option `%s'", var); return -1; } static fstype_t * get_fstype(const char *type) { int i; for (i = 0; fstypes[i].type != NULL; i++) if (strcmp(fstypes[i].type, type) == 0) return (&fstypes[i]); return (NULL); } option_t * copy_opts(const option_t *o) { size_t i; for (i = 0; o[i].name; i++) continue; i++; return memcpy(ecalloc(i, sizeof(*o)), o, i * sizeof(*o)); } static int 
get_tstamp(const char *b, struct stat *st) { time_t when; char *eb; long long l; if (stat(b, st) != -1) return 0; { errno = 0; l = strtoll(b, &eb, 0); if (b == eb || *eb || errno) return -1; when = (time_t)l; } st->st_ino = 1; #ifdef HAVE_STRUCT_STAT_BIRTHTIME st->st_birthtime = #endif st->st_mtime = st->st_ctime = st->st_atime = when; return 0; } static void usage(fstype_t *fstype, fsinfo_t *fsoptions) { const char *prog; prog = getprogname(); fprintf(stderr, "Usage: %s [-xZ] [-B endian] [-b free-blocks] [-d debug-mask]\n" "\t[-F mtree-specfile] [-f free-files] [-M minimum-size] [-m maximum-size]\n" "\t[-N userdb-dir] [-O offset] [-o fs-options] [-R roundup-size]\n" "\t[-S sector-size] [-s image-size] [-T ] [-t fs-type]\n" "\timage-file directory | manifest [extra-directory ...]\n", prog); if (fstype) { size_t i; option_t *o = fsoptions->fs_options; fprintf(stderr, "\n%s specific options:\n", fstype->type); for (i = 0; o[i].name != NULL; i++) fprintf(stderr, "\t%c%c%20.20s\t%s\n", o[i].letter ? o[i].letter : ' ', o[i].letter ? ',' : ' ', o[i].name, o[i].desc); } exit(1); } diff --git a/usr.sbin/makefs/makefs.h b/usr.sbin/makefs/makefs.h index 68dc0362dd21..e88313e8366d 100644 --- a/usr.sbin/makefs/makefs.h +++ b/usr.sbin/makefs/makefs.h @@ -1,306 +1,311 @@ /* $NetBSD: makefs.h,v 1.20 2008/12/28 21:51:46 christos Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Luke Mewburn for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MAKEFS_H #define _MAKEFS_H #include #include /* * fsnode - * a component of the tree; contains a filename, a pointer to * fsinode, optional symlink name, and tree pointers * * fsinode - * equivalent to an inode, containing target file system inode number, * refcount (nlink), and stat buffer * * A tree of fsnodes looks like this: * * name "." "bin" "netbsd" * type S_IFDIR S_IFDIR S_IFREG * next > > NULL * parent NULL NULL NULL * child NULL v * * name "." "ls" * type S_IFDIR S_IFREG * next > NULL * parent ^ ^ (to "bin") * child NULL NULL * * Notes: * - first always points to first entry, at current level, which * must be "." 
when the tree has been built; during build it may * not be if "." hasn't yet been found by readdir(2). */ enum fi_flags { FI_SIZED = 1<<0, /* inode sized */ FI_ALLOCATED = 1<<1, /* fsinode->ino allocated */ FI_WRITTEN = 1<<2, /* inode written */ + FI_ROOT = 1<<3, /* root of a ZFS dataset */ }; typedef struct { uint32_t ino; /* inode number used on target fs */ uint32_t nlink; /* number of links to this entry */ enum fi_flags flags; /* flags used by fs specific code */ + void *param; /* for use by individual fs impls */ struct stat st; /* stat entry */ } fsinode; typedef struct _fsnode { struct _fsnode *parent; /* parent (NULL if root) */ struct _fsnode *child; /* child (if type == S_IFDIR) */ struct _fsnode *next; /* next */ struct _fsnode *first; /* first node of current level (".") */ uint32_t type; /* type of entry */ fsinode *inode; /* actual inode data */ char *symlink; /* symlink target */ char *contents; /* file to provide contents */ const char *root; /* root path */ char *path; /* directory name */ char *name; /* file name */ int flags; /* misc flags */ } fsnode; #define FSNODE_F_HASSPEC 0x01 /* fsnode has a spec entry */ #define FSNODE_F_OPTIONAL 0x02 /* fsnode is optional */ /* * option_t - contains option name, description, pointer to location to store * result, and range checks for the result. 
Used to simplify fs specific * option setting */ typedef enum { OPT_STRARRAY, OPT_STRPTR, OPT_STRBUF, OPT_BOOL, OPT_INT8, OPT_INT16, OPT_INT32, OPT_INT64 } opttype_t; typedef struct { char letter; /* option letter NUL for none */ const char *name; /* option name */ void *value; /* where to stuff the value */ opttype_t type; /* type of entry */ long long minimum; /* minimum for value */ long long maximum; /* maximum for value */ const char *desc; /* option description */ } option_t; /* * fsinfo_t - contains various settings and parameters pertaining to * the image, including current settings, global options, and fs * specific options */ typedef struct makefs_fsinfo { /* current settings */ off_t size; /* total size */ off_t inodes; /* number of inodes */ uint32_t curinode; /* current inode */ /* image settings */ int fd; /* file descriptor of image */ void *superblock; /* superblock */ int onlyspec; /* only add entries in specfile */ /* global options */ off_t minsize; /* minimum size image should be */ off_t maxsize; /* maximum size image can be */ off_t freefiles; /* free file entries to leave */ off_t freeblocks; /* free blocks to leave */ off_t offset; /* offset from start of file */ off_t roundup; /* round image size up to this value */ int freefilepc; /* free file % */ int freeblockpc; /* free block % */ int needswap; /* non-zero if byte swapping needed */ int sectorsize; /* sector size */ int sparse; /* sparse image, don't fill it with zeros */ void *fs_specific; /* File system specific additions. 
*/ option_t *fs_options; /* File system specific options */ } fsinfo_t; void apply_specfile(const char *, const char *, fsnode *, int); void dump_fsnodes(fsnode *); const char * inode_type(mode_t); fsnode * read_mtree(const char *, fsnode *); int set_option(const option_t *, const char *, char *, size_t); int set_option_var(const option_t *, const char *, const char *, char *, size_t); fsnode * walk_dir(const char *, const char *, fsnode *, fsnode *); void free_fsnodes(fsnode *); option_t * copy_opts(const option_t *); #define DECLARE_FUN(fs) \ void fs ## _prep_opts(fsinfo_t *); \ int fs ## _parse_opts(const char *, fsinfo_t *); \ void fs ## _cleanup_opts(fsinfo_t *); \ void fs ## _makefs(const char *, const char *, fsnode *, fsinfo_t *) DECLARE_FUN(cd9660); DECLARE_FUN(ffs); DECLARE_FUN(msdos); +#ifdef HAVE_ZFS +DECLARE_FUN(zfs); +#endif extern u_int debug; extern int dupsok; extern struct timespec start_time; extern struct stat stampst; /* * If -x is specified, we want to exclude nodes which do not appear * in the spec file. 
*/ #define FSNODE_EXCLUDE_P(opts, fsnode) \ ((opts)->onlyspec != 0 && ((fsnode)->flags & FSNODE_F_HASSPEC) == 0) #define DEBUG_TIME 0x00000001 /* debug bits 1..3 unused at this time */ #define DEBUG_WALK_DIR 0x00000010 #define DEBUG_WALK_DIR_NODE 0x00000020 #define DEBUG_WALK_DIR_LINKCHECK 0x00000040 #define DEBUG_DUMP_FSNODES 0x00000080 #define DEBUG_DUMP_FSNODES_VERBOSE 0x00000100 #define DEBUG_FS_PARSE_OPTS 0x00000200 #define DEBUG_FS_MAKEFS 0x00000400 #define DEBUG_FS_VALIDATE 0x00000800 #define DEBUG_FS_CREATE_IMAGE 0x00001000 #define DEBUG_FS_SIZE_DIR 0x00002000 #define DEBUG_FS_SIZE_DIR_NODE 0x00004000 #define DEBUG_FS_SIZE_DIR_ADD_DIRENT 0x00008000 #define DEBUG_FS_POPULATE 0x00010000 #define DEBUG_FS_POPULATE_DIRBUF 0x00020000 #define DEBUG_FS_POPULATE_NODE 0x00040000 #define DEBUG_FS_WRITE_FILE 0x00080000 #define DEBUG_FS_WRITE_FILE_BLOCK 0x00100000 #define DEBUG_FS_MAKE_DIRBUF 0x00200000 #define DEBUG_FS_WRITE_INODE 0x00400000 #define DEBUG_BUF_BREAD 0x00800000 #define DEBUG_BUF_BWRITE 0x01000000 #define DEBUG_BUF_GETBLK 0x02000000 #define DEBUG_APPLY_SPECFILE 0x04000000 #define DEBUG_APPLY_SPECENTRY 0x08000000 #define DEBUG_APPLY_SPECONLY 0x10000000 #define DEBUG_MSDOSFS 0x20000000 #define TIMER_START(x) \ if (debug & DEBUG_TIME) \ gettimeofday(&(x), NULL) #define TIMER_RESULTS(x,d) \ if (debug & DEBUG_TIME) { \ struct timeval end, td; \ gettimeofday(&end, NULL); \ timersub(&end, &(x), &td); \ printf("%s took %lld.%06ld seconds\n", \ (d), (long long)td.tv_sec, \ (long)td.tv_usec); \ } #ifndef DEFAULT_FSTYPE #define DEFAULT_FSTYPE "ffs" #endif /* * ffs specific settings * --------------------- */ #define FFS_EI /* for opposite endian support in ffs headers */ /* * Write-arounds/compat shims for endian-agnostic support. * These belong in the kernel if/when it's possible to mount * filesystems w/ either byte order. */ /* * File system internal flags, also in fs_flags. 
* (Pick highest number to avoid conflicts with others) */ #define FS_SWAPPED 0x80000000 /* file system is endian swapped */ #define FS_INTERNAL 0x80000000 /* mask for internal flags */ #define FS_ISCLEAN 1 #define DINODE1_SIZE (sizeof(struct ufs1_dinode)) #define DINODE2_SIZE (sizeof(struct ufs2_dinode)) #define UFS1_MAXSYMLINKLEN ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t)) #define UFS2_MAXSYMLINKLEN ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)) #if (BYTE_ORDER == LITTLE_ENDIAN) #define DIRSIZ_SWAP(oldfmt, dp, needswap) \ (((oldfmt) && !(needswap)) ? \ DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen)) #else #define DIRSIZ_SWAP(oldfmt, dp, needswap) \ (((oldfmt) && (needswap)) ? \ DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen)) #endif #define cg_chkmagic_swap(cgp, ns) \ (ufs_rw32((cgp)->cg_magic, (ns)) == CG_MAGIC) #define cg_inosused_swap(cgp, ns) \ ((u_int8_t *)((u_int8_t *)(cgp) + ufs_rw32((cgp)->cg_iusedoff, (ns)))) #define cg_blksfree_swap(cgp, ns) \ ((u_int8_t *)((u_int8_t *)(cgp) + ufs_rw32((cgp)->cg_freeoff, (ns)))) #define cg_clustersfree_swap(cgp, ns) \ ((u_int8_t *)((u_int8_t *)(cgp) + ufs_rw32((cgp)->cg_clusteroff, (ns)))) #define cg_clustersum_swap(cgp, ns) \ ((int32_t *)((uintptr_t)(cgp) + ufs_rw32((cgp)->cg_clustersumoff, ns))) struct fs; void ffs_fragacct_swap(struct fs *, int, uint32_t [], int, int); fsinode *link_check(fsinode *); #endif /* _MAKEFS_H */ diff --git a/usr.sbin/makefs/tests/Makefile b/usr.sbin/makefs/tests/Makefile index 85e4b233aea7..c2c9f6bea5b6 100644 --- a/usr.sbin/makefs/tests/Makefile +++ b/usr.sbin/makefs/tests/Makefile @@ -1,18 +1,19 @@ # $FreeBSD$ ATF_TESTS_SH+= makefs_cd9660_tests ATF_TESTS_SH+= makefs_ffs_tests +ATF_TESTS_SH+= makefs_zfs_tests BINDIR= ${TESTSDIR} # XXX: PACKAGE support for SCRIPTS SCRIPTS+= makefs_tests_common.sh SCRIPTSNAME_makefs_tests_common.sh= makefs_tests_common.sh TEST_METADATA.makefs_cd9660_tests+= required_files="/sbin/mount_cd9660" .for t in ${ATF_TESTS_SH} TEST_METADATA.$t+= 
required_user="root" .endfor .include diff --git a/usr.sbin/makefs/tests/makefs_zfs_tests.sh b/usr.sbin/makefs/tests/makefs_zfs_tests.sh new file mode 100644 index 000000000000..8cd79966c49a --- /dev/null +++ b/usr.sbin/makefs/tests/makefs_zfs_tests.sh @@ -0,0 +1,634 @@ +#- +# SPDX-License-Identifier: BSD-2-Clause-FreeBSD +# +# Copyright (c) 2022 The FreeBSD Foundation +# +# This software was developed by Mark Johnston under sponsorship from +# the FreeBSD Foundation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +MAKEFS="makefs -t zfs -o nowarn=true" +ZFS_POOL_NAME="makefstest$$" +TEST_ZFS_POOL_NAME="$TMPDIR/poolname" + +. 
"$(dirname "$0")/makefs_tests_common.sh" + +common_cleanup() +{ + local pool md + + # Try to force a TXG, this can help catch bugs by triggering a panic. + sync + + pool=$(cat $TEST_ZFS_POOL_NAME) + if zpool list "$pool" >/dev/null; then + zpool destroy "$pool" + fi + + md=$(cat $TEST_MD_DEVICE_FILE) + if [ -c /dev/"$md" ]; then + mdconfig -d -u "$md" + fi +} + +import_image() +{ + atf_check -e empty -o save:$TEST_MD_DEVICE_FILE -s exit:0 \ + mdconfig -a -f $TEST_IMAGE + atf_check zpool import -R $TEST_MOUNT_DIR $ZFS_POOL_NAME + echo "$ZFS_POOL_NAME" > $TEST_ZFS_POOL_NAME +} + +# +# Test autoexpansion of the vdev. +# +# The pool is initially 10GB, so we get 10GB minus one metaslab's worth of +# usable space for data. Then the pool is expanded to 50GB, and the amount of +# usable space is 50GB minus one metaslab. +# +atf_test_case autoexpand cleanup +autoexpand_body() +{ + local mssize poolsize poolsize1 newpoolsize + + create_test_inputs + + mssize=$((128 * 1024 * 1024)) + poolsize=$((10 * 1024 * 1024 * 1024)) + atf_check $MAKEFS -s $poolsize -o mssize=$mssize -o rootpath=/ \ + -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + newpoolsize=$((50 * 1024 * 1024 * 1024)) + truncate -s $newpoolsize $TEST_IMAGE + + import_image + + check_image_contents + + poolsize1=$(zpool list -Hp -o size $ZFS_POOL_NAME) + atf_check [ $((poolsize1 + $mssize)) -eq $poolsize ] + + atf_check zpool online -e $ZFS_POOL_NAME /dev/$(cat $TEST_MD_DEVICE_FILE) + + check_image_contents + + poolsize1=$(zpool list -Hp -o size $ZFS_POOL_NAME) + atf_check [ $((poolsize1 + $mssize)) -eq $newpoolsize ] +} +autoexpand_cleanup() +{ + common_cleanup +} + +# +# Test with some default layout defined by the common code. 
+# +atf_test_case basic cleanup +basic_body() +{ + create_test_inputs + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +basic_cleanup() +{ + common_cleanup +} + +atf_test_case dataset_removal cleanup +dataset_removal_body() +{ + create_test_dirs + + cd $TEST_INPUTS_DIR + mkdir dir + cd - + + atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + atf_check zfs destroy ${ZFS_POOL_NAME}/dir +} +dataset_removal_cleanup() +{ + common_cleanup +} + +# +# Make sure that we can create and remove an empty directory. +# +atf_test_case empty_dir cleanup +empty_dir_body() +{ + create_test_dirs + + cd $TEST_INPUTS_DIR + mkdir dir + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + atf_check rmdir ${TEST_MOUNT_DIR}/dir +} +empty_dir_cleanup() +{ + common_cleanup +} + +atf_test_case empty_fs cleanup +empty_fs_body() +{ + create_test_dirs + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +empty_fs_cleanup() +{ + common_cleanup +} + +atf_test_case file_sizes cleanup +file_sizes_body() +{ + local i + + create_test_dirs + cd $TEST_INPUTS_DIR + + i=1 + while [ $i -lt $((1 << 20)) ]; do + truncate -s $i ${i}.1 + truncate -s $(($i - 1)) ${i}.2 + truncate -s $(($i + 1)) ${i}.3 + i=$(($i << 1)) + done + + cd - + + # XXXMJ this creates sparse files, make sure makefs doesn't + # preserve the sparseness. 
+ # XXXMJ need to test with larger files (at least 128MB for L2 indirs) + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +file_sizes_cleanup() +{ + common_cleanup +} + +atf_test_case hard_links cleanup +hard_links_body() +{ + local f + + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir + echo "hello" > 1 + ln 1 2 + ln 1 dir/1 + + echo "goodbye" > dir/a + ln dir/a dir/b + ln dir/a a + + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + stat -f '%i' ${TEST_MOUNT_DIR}/1 > ./ino + stat -f '%l' ${TEST_MOUNT_DIR}/1 > ./nlink + for f in 1 2 dir/1; do + atf_check -o file:./nlink -e empty -s exit:0 \ + stat -f '%l' ${TEST_MOUNT_DIR}/${f} + atf_check -o file:./ino -e empty -s exit:0 \ + stat -f '%i' ${TEST_MOUNT_DIR}/${f} + atf_check cmp -s ${TEST_INPUTS_DIR}/1 ${TEST_MOUNT_DIR}/${f} + done + + stat -f '%i' ${TEST_MOUNT_DIR}/dir/a > ./ino + stat -f '%l' ${TEST_MOUNT_DIR}/dir/a > ./nlink + for f in dir/a dir/b a; do + atf_check -o file:./nlink -e empty -s exit:0 \ + stat -f '%l' ${TEST_MOUNT_DIR}/${f} + atf_check -o file:./ino -e empty -s exit:0 \ + stat -f '%i' ${TEST_MOUNT_DIR}/${f} + atf_check cmp -s ${TEST_INPUTS_DIR}/dir/a ${TEST_MOUNT_DIR}/${f} + done +} +hard_links_cleanup() +{ + common_cleanup +} + +# Allocate enough dnodes from an object set that the meta dnode needs to use +# indirect blocks. +atf_test_case indirect_dnode_array cleanup +indirect_dnode_array_body() +{ + local count i + + # How many dnodes do we need to allocate? Well, the data block size + # for meta dnodes is always 16KB, so with a dnode size of 512B we get + # 32 dnodes per direct block. The maximum indirect block size is 128KB + # and that can fit 1024 block pointers, so we need at least 32 * 1024 + # files to force the use of two levels of indirection. 
+ # + # Unfortunately that number of files makes the test run quite slowly, + # so we settle for a single indirect block for now... + count=$(jot -r 1 32 1024) + + create_test_dirs + cd $TEST_INPUTS_DIR + for i in $(seq 1 $count); do + touch $i + done + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +indirect_dnode_array_cleanup() +{ + common_cleanup +} + +# +# Create some files with long names, so as to test fat ZAP handling. +# +atf_test_case long_file_name cleanup +long_file_name_body() +{ + local dir i + + create_test_dirs + cd $TEST_INPUTS_DIR + + # micro ZAP keys can be at most 50 bytes. + for i in $(seq 1 60); do + touch $(jot -s '' $i 1 1) + done + dir=$(jot -s '' 61 1 1) + mkdir $dir + for i in $(seq 1 60); do + touch ${dir}/$(jot -s '' $i 1 1) + done + + cd - + + atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + # Add a directory entry in the hope that OpenZFS might catch a bug + # in makefs' fat ZAP encoding. + touch ${TEST_MOUNT_DIR}/foo +} +long_file_name_cleanup() +{ + common_cleanup +} + +# +# Exercise handling of multiple datasets. +# +atf_test_case multi_dataset_1 cleanup +multi_dataset_1_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir1 + echo a > dir1/a + mkdir dir2 + echo b > dir2/b + + cd - + + atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir1 -o fs=${ZFS_POOL_NAME}/dir2 \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents + + # Make sure that we have three datasets with the expected mount points. 
+ atf_check -o inline:${ZFS_POOL_NAME}\\n -e empty -s exit:0 \ + zfs list -H -o name ${ZFS_POOL_NAME} + atf_check -o inline:${TEST_MOUNT_DIR}\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME} + + atf_check -o inline:${ZFS_POOL_NAME}/dir1\\n -e empty -s exit:0 \ + zfs list -H -o name ${ZFS_POOL_NAME}/dir1 + atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1 + + atf_check -o inline:${ZFS_POOL_NAME}/dir2\\n -e empty -s exit:0 \ + zfs list -H -o name ${ZFS_POOL_NAME}/dir2 + atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2 +} +multi_dataset_1_cleanup() +{ + common_cleanup +} + +# +# Create a pool with two datasets, where the root dataset is mounted below +# the child dataset. +# +atf_test_case multi_dataset_2 cleanup +multi_dataset_2_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir1 + echo a > dir1/a + mkdir dir2 + echo b > dir2/b + + cd - + + atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir1\;mountpoint=/ \ + -o fs=${ZFS_POOL_NAME}\;mountpoint=/dir1 \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + check_image_contents +} +multi_dataset_2_cleanup() +{ + common_cleanup +} + +# +# Create a dataset with a non-existent mount point. +# +atf_test_case multi_dataset_3 cleanup +multi_dataset_3_body() +{ + create_test_dirs + cd $TEST_INPUTS_DIR + + mkdir dir1 + echo a > dir1/a + + cd - + + atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \ + -o fs=${ZFS_POOL_NAME}/dir1 \ + -o fs=${ZFS_POOL_NAME}/dir2 \ + $TEST_IMAGE $TEST_INPUTS_DIR + + import_image + + atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \ + zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2 + + # Mounting dir2 should have created a directory called dir2. Go + # back and create it in the staging tree before comparing. 
+	atf_check mkdir ${TEST_INPUTS_DIR}/dir2
+
+	check_image_contents
+}
+multi_dataset_3_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Create an unmounted dataset.
+#
+atf_test_case multi_dataset_4 cleanup
+multi_dataset_4_body()
+{
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir1
+	echo a > dir1/a
+
+	cd -
+
+	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    -o fs=${ZFS_POOL_NAME}/dir1\;canmount=noauto\;mountpoint=none \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	atf_check -o inline:none\\n -e empty -s exit:0 \
+	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
+
+	check_image_contents
+
+	atf_check zfs set mountpoint=/dir1 ${ZFS_POOL_NAME}/dir1
+	atf_check zfs mount ${ZFS_POOL_NAME}/dir1
+	atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \
+	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
+
+	# dir1/a belongs to the root dataset, so it must not show up in dir1.
+	atf_check -s not-exit:0 -e not-empty stat ${TEST_MOUNT_DIR}/dir1/a
+}
+multi_dataset_4_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Rudimentary test to verify that two ZFS images created using the same
+# parameters and input hierarchy are byte-identical. In particular, makefs(1)
+# does not preserve file access times.
+#
+atf_test_case reproducible cleanup
+reproducible_body()
+{
+	create_test_inputs
+
+	atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    ${TEST_IMAGE}.1 $TEST_INPUTS_DIR
+
+	atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    ${TEST_IMAGE}.2 $TEST_INPUTS_DIR
+
+	# XXX-MJ cmp(1) is really slow
+	atf_check cmp ${TEST_IMAGE}.1 ${TEST_IMAGE}.2
+}
+reproducible_cleanup()
+{
+}
+
+#
+# Verify that we can take a snapshot of a generated dataset.
+#
+atf_test_case snapshot cleanup
+snapshot_body()
+{
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir
+	echo "hello" > dir/hello
+	echo "goodbye" > goodbye
+
+	cd -
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	atf_check zfs snapshot ${ZFS_POOL_NAME}@1
+}
+snapshot_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Check handling of symbolic links.
+#
+atf_test_case soft_links cleanup
+soft_links_body()
+{
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir
+	ln -s a a
+	ln -s dir/../a a
+	ln -s dir/b b
+	echo 'c' > dir/c
+	ln -s dir/c c
+	# XXX-MJ overflows bonus buffer ln -s $(jot -s '' 320 1 1) 1
+
+	cd -
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+}
+soft_links_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Verify that we can set properties on the root dataset.
+#
+atf_test_case root_props cleanup
+root_props_body()
+{
+	create_test_inputs
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    -o fs=${ZFS_POOL_NAME}\;atime=off\;setuid=off \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+
+	atf_check -o inline:off\\n -e empty -s exit:0 \
+	    zfs get -H -o value atime $ZFS_POOL_NAME
+	atf_check -o inline:local\\n -e empty -s exit:0 \
+	    zfs get -H -o source atime $ZFS_POOL_NAME
+	atf_check -o inline:off\\n -e empty -s exit:0 \
+	    zfs get -H -o value setuid $ZFS_POOL_NAME
+	atf_check -o inline:local\\n -e empty -s exit:0 \
+	    zfs get -H -o source setuid $ZFS_POOL_NAME
+}
+root_props_cleanup()
+{
+	common_cleanup
+}
+
+atf_init_test_cases()
+{
+	atf_add_test_case autoexpand
+	atf_add_test_case basic
+	atf_add_test_case dataset_removal
+	atf_add_test_case empty_dir
+	atf_add_test_case empty_fs
+	atf_add_test_case file_sizes
+	atf_add_test_case hard_links
+	atf_add_test_case indirect_dnode_array
+	atf_add_test_case long_file_name
+	atf_add_test_case
multi_dataset_1 + atf_add_test_case multi_dataset_2 + atf_add_test_case multi_dataset_3 + atf_add_test_case multi_dataset_4 + atf_add_test_case reproducible + atf_add_test_case snapshot + atf_add_test_case soft_links + atf_add_test_case root_props + + # XXXMJ tests: + # - test with different ashifts (at least, 9 and 12), different image sizes + # - create datasets in imported pool +} diff --git a/usr.sbin/makefs/zfs.c b/usr.sbin/makefs/zfs.c new file mode 100644 index 000000000000..08689a558870 --- /dev/null +++ b/usr.sbin/makefs/zfs.c @@ -0,0 +1,758 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "makefs.h" +#include "zfs.h" + +#define VDEV_LABEL_SPACE \ + ((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) +_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, ""); + +#define MINMSSIZE ((off_t)1 << 24) /* 16MB */ +#define DFLTMSSIZE ((off_t)1 << 29) /* 512MB */ +#define MAXMSSIZE ((off_t)1 << 34) /* 16GB */ + +#define INDIR_LEVELS 6 +/* Indirect blocks are always 128KB. 
*/ +#define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t)) + +struct dnode_cursor { + char inddir[INDIR_LEVELS][MAXBLOCKSIZE]; + off_t indloc; + off_t indspace; + dnode_phys_t *dnode; + off_t dataoff; + off_t datablksz; +}; + +void +zfs_prep_opts(fsinfo_t *fsopts) +{ + zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs)); + + const option_t zfs_options[] = { + { '\0', "bootfs", &zfs->bootfs, OPT_STRPTR, + 0, 0, "Bootable dataset" }, + { '\0', "mssize", &zfs->mssize, OPT_INT64, + MINMSSIZE, MAXMSSIZE, "Metaslab size" }, + { '\0', "poolname", &zfs->poolname, OPT_STRPTR, + 0, 0, "ZFS pool name" }, + { '\0', "rootpath", &zfs->rootpath, OPT_STRPTR, + 0, 0, "Prefix for all dataset mount points" }, + { '\0', "ashift", &zfs->ashift, OPT_INT32, + MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" }, + { '\0', "nowarn", &zfs->nowarn, OPT_BOOL, + 0, 0, "Suppress warning about experimental ZFS support" }, + { .name = NULL } + }; + + STAILQ_INIT(&zfs->datasetdescs); + + fsopts->fs_specific = zfs; + fsopts->fs_options = copy_opts(zfs_options); +} + +int +zfs_parse_opts(const char *option, fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + struct dataset_desc *dsdesc; + char buf[BUFSIZ], *opt, *val; + int rv; + + zfs = fsopts->fs_specific; + + opt = val = estrdup(option); + opt = strsep(&val, "="); + if (strcmp(opt, "fs") == 0) { + if (val == NULL) + errx(1, "invalid filesystem parameters `%s'", option); + + /* + * Dataset descriptions will be parsed later, in dsl_init(). + * Just stash them away for now. + */ + dsdesc = ecalloc(1, sizeof(*dsdesc)); + dsdesc->params = estrdup(val); + free(opt); + STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next); + return (1); + } + free(opt); + + rv = set_option(fsopts->fs_options, option, buf, sizeof(buf)); + return (rv == -1 ? 
0 : 1); +} + +static void +zfs_size_vdev(fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + off_t asize, mssize, vdevsize, vdevsize1; + + zfs = fsopts->fs_specific; + + assert(fsopts->maxsize != 0); + assert(zfs->ashift != 0); + + /* + * Figure out how big the vdev should be. + */ + vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift); + if (vdevsize < MINDEVSIZE) + errx(1, "maximum image size is too small"); + if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) { + errx(1, "image size bounds must be multiples of %d", + 1 << zfs->ashift); + } + asize = vdevsize - VDEV_LABEL_SPACE; + + /* + * Size metaslabs according to the following heuristic: + * - provide at least 8 metaslabs, + * - without using a metaslab size larger than 512MB. + * This approximates what OpenZFS does without being complicated. In + * practice we expect pools to be expanded upon first use, and OpenZFS + * does not resize metaslabs in that case, so there is no right answer + * here. In general we want to provide large metaslabs even if the + * image size is small, and 512MB is a reasonable size for pools up to + * several hundred gigabytes. + * + * The user may override this heuristic using the "-o mssize" option. + */ + mssize = zfs->mssize; + if (mssize == 0) { + mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE); + if (!powerof2(mssize)) + mssize = 1l << (flsll(mssize) - 1); + } + if (!powerof2(mssize)) + errx(1, "metaslab size must be a power of 2"); + + /* + * If we have some slop left over, try to cover it by resizing the vdev, + * subject to the maxsize and minsize parameters. 
+ */ + if (asize % mssize != 0) { + vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE; + if (vdevsize1 < fsopts->minsize) + vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE; + if (vdevsize1 <= fsopts->maxsize) + vdevsize = vdevsize1; + } + asize = vdevsize - VDEV_LABEL_SPACE; + + zfs->asize = asize; + zfs->vdevsize = vdevsize; + zfs->mssize = mssize; + zfs->msshift = flsll(mssize) - 1; + zfs->mscount = asize / mssize; +} + +/* + * Validate options and set some default values. + */ +static void +zfs_check_opts(fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + + zfs = fsopts->fs_specific; + + if (fsopts->offset != 0) + errx(1, "unhandled offset option"); + if (fsopts->maxsize == 0) + errx(1, "an image size must be specified"); + + if (zfs->poolname == NULL) + errx(1, "a pool name must be specified"); + + if (zfs->rootpath == NULL) + easprintf(&zfs->rootpath, "/%s", zfs->poolname); + if (zfs->rootpath[0] != '/') + errx(1, "mountpoint `%s' must be absolute", zfs->rootpath); + + if (zfs->ashift == 0) + zfs->ashift = 12; + + zfs_size_vdev(fsopts); +} + +void +zfs_cleanup_opts(fsinfo_t *fsopts) +{ + struct dataset_desc *d, *tmp; + zfs_opt_t *zfs; + + zfs = fsopts->fs_specific; + free(zfs->rootpath); + free(zfs->bootfs); + free(__DECONST(void *, zfs->poolname)); + STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) { + free(d->params); + free(d); + } + free(zfs); + free(fsopts->fs_options); +} + +static size_t +nvlist_size(const nvlist_t *nvl) +{ + return (sizeof(nvl->nv_header) + nvl->nv_size); +} + +static void +nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz) +{ + assert(sz >= nvlist_size(nvl)); + + memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header)); + memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size); +} + +static nvlist_t * +pool_config_nvcreate(zfs_opt_t *zfs) +{ + nvlist_t *featuresnv, *poolnv; + + poolnv = nvlist_create(NV_UNIQUE_NAME); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG); + nvlist_add_uint64(poolnv, 
ZPOOL_CONFIG_VERSION, SPA_VERSION); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED); + nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); + nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1); + + featuresnv = nvlist_create(NV_UNIQUE_NAME); + nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv); + nvlist_destroy(featuresnv); + + return (poolnv); +} + +static nvlist_t * +pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs) +{ + nvlist_t *diskvdevnv; + + assert(zfs->objarrid != 0); + + diskvdevnv = nvlist_create(NV_UNIQUE_NAME); + nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0); + nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null"); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY, + zfs->objarrid); + nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT, + zfs->msshift); + + return (diskvdevnv); +} + +static nvlist_t * +pool_root_vdev_config_nvcreate(zfs_opt_t *zfs) +{ + nvlist_t *diskvdevnv, *rootvdevnv; + + diskvdevnv = pool_disk_vdev_config_nvcreate(zfs); + rootvdevnv = nvlist_create(NV_UNIQUE_NAME); + + nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0); + nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid); + nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); + nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG); + nvlist_add_nvlist_array(rootvdevnv, 
ZPOOL_CONFIG_CHILDREN, &diskvdevnv, + 1); + nvlist_destroy(diskvdevnv); + + return (rootvdevnv); +} + +/* + * Create the pool's "config" object, which contains an nvlist describing pool + * parameters and the vdev topology. It is similar but not identical to the + * nvlist stored in vdev labels. The main difference is that vdev labels do not + * describe the full vdev tree and in particular do not contain the "root" + * meta-vdev. + */ +static void +pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + dnode_phys_t *dnode; + nvlist_t *poolconfig, *vdevconfig; + void *configbuf; + uint64_t dnid; + off_t configloc, configblksz; + int error; + + dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST, + DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid); + + poolconfig = pool_config_nvcreate(zfs); + + vdevconfig = pool_root_vdev_config_nvcreate(zfs); + nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); + nvlist_destroy(vdevconfig); + + error = nvlist_export(poolconfig); + if (error != 0) + errc(1, error, "nvlist_export"); + + configblksz = nvlist_size(poolconfig); + configloc = objset_space_alloc(zfs, zfs->mos, &configblksz); + configbuf = ecalloc(1, configblksz); + nvlist_copy(poolconfig, configbuf, configblksz); + + vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc); + + dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; + *(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig); + + zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid); + + nvlist_destroy(poolconfig); + free(configbuf); +} + +/* + * Add block pointer list objects, used for deferred frees. We don't do + * anything with them, but they need to be present or OpenZFS will refuse to + * import the pool.
+ */ +static void +pool_init_objdir_bplists(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + uint64_t dnid; + + /* Both objects are allocated with a header-sized bonus buffer only. */ + (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, + BPOBJ_SIZE_V2, &dnid); + zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid); + + (void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR, + BPOBJ_SIZE_V2, &dnid); + zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid); +} + +/* + * Add required feature metadata objects. We don't know anything about ZFS + * features, so the objects are just empty ZAPs. + */ +static void +pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + dnode_phys_t *dnode; + uint64_t dnid; + + dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); + zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid); + zap_write(zfs, zap_alloc(zfs->mos, dnode)); + + dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); + zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid); + zap_write(zfs, zap_alloc(zfs->mos, dnode)); + + dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid); + zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid); + zap_write(zfs, zap_alloc(zfs->mos, dnode)); +} + +/* + * Record the root DSL directory's object ID in the object directory. + */ +static void +pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET, + dsl_dir_id(zfs->rootdsldir)); +} + +/* + * Allocate the pool properties object and register it in the object + * directory. The ZAP is only created here; it is kept in zfs->poolprops and + * is not written out by this function. + */ +static void +pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir) +{ + dnode_phys_t *dnode; + uint64_t id; + + dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id); + zap_add_uint64(objdir, DMU_POOL_PROPS, id); + + zfs->poolprops = zap_alloc(zfs->mos, dnode); +} + +/* + * Initialize the MOS object directory, the root of virtually all of the pool's + * data and metadata. 
+ * + * Each helper below allocates its objects in the MOS and adds an entry for + * them to the directory ZAP; the ZAP itself is written out last, once all + * entries have been added. + */ +static void +pool_init_objdir(zfs_opt_t *zfs) +{ + zfs_zap_t *zap; + dnode_phys_t *objdir; + + /* The directory dnode itself was allocated earlier, by pool_init(). */ + objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT); + + zap = zap_alloc(zfs->mos, objdir); + pool_init_objdir_config(zfs, zap); + pool_init_objdir_bplists(zfs, zap); + pool_init_objdir_feature_maps(zfs, zap); + pool_init_objdir_dsl(zfs, zap); + pool_init_objdir_poolprops(zfs, zap); + zap_write(zfs, zap); +} + +/* + * Initialize the meta-object set (MOS) and immediately write out several + * special objects whose contents are already finalized, including the object + * directory. + * + * Once the MOS is finalized, it'll look roughly like this: + * + * object directory (ZAP) + * |-> vdev config object (nvlist) + * |-> features for read + * |-> features for write + * |-> feature descriptions + * |-> sync bplist + * |-> free bplist + * |-> pool properties + * L-> root DSL directory + * |-> DSL child directory (ZAP) + * | |-> $MOS (DSL dir) + * | | |-> child map + * | | L-> props (ZAP) + * | |-> $FREE (DSL dir) + * | | |-> child map + * | | L-> props (ZAP) + * | |-> $ORIGIN (DSL dir) + * | | |-> child map + * | | |-> dataset + * | | | L-> deadlist + * | | |-> snapshot + * | | | |-> deadlist + * | | | L-> snapshot names + * | | |-> props (ZAP) + * | | L-> clones (ZAP) + * | |-> dataset 1 (DSL dir) + * | | |-> DSL dataset + * | | | |-> snapshot names + * | | | L-> deadlist + * | | |-> child map + * | | | L-> ... + * | | L-> props + * | |-> dataset 2 + * | | L-> ... + * | |-> ... + * | L-> dataset n + * |-> DSL root dataset + * | |-> snapshot names + * | L-> deadlist + * L-> props (ZAP) + * space map object array + * |-> space map 1 + * |-> space map 2 + * |-> ... + * L-> space map n (zfs->mscount) + * + * The space map object array is pointed to by the "msarray" property in the + * pool configuration. 
+ */ +static void +pool_init(zfs_opt_t *zfs) +{ + uint64_t dnid; + + /* + * Each GUID is assembled from two random() results. NOTE(review): + * random() yields values below 2^31, so the top bit of each 32-bit + * half is presumably always clear. + */ + zfs->poolguid = ((uint64_t)random() << 32) | random(); + zfs->vdevguid = ((uint64_t)random() << 32) | random(); + + zfs->mos = objset_alloc(zfs, DMU_OST_META); + + /* + * The object directory is expected to receive the well-known object + * ID DMU_POOL_DIRECTORY_OBJECT. + */ + (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid); + assert(dnid == DMU_POOL_DIRECTORY_OBJECT); + + /* + * The object array's ID is saved in zfs->objarrid, which + * pool_disk_vdev_config_nvcreate() requires to be non-zero. + */ + (void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid); + + dsl_init(zfs); + + pool_init_objdir(zfs); +} + +/* + * Build a vdev label holding the pool configuration (with the disk vdev as + * its vdev tree) and a set of identical uberblocks, then write a copy of it + * to each of the VDEV_LABELS label slots. + */ +static void +pool_labels_write(zfs_opt_t *zfs) +{ + uberblock_t *ub; + vdev_label_t *label; + nvlist_t *poolconfig, *vdevconfig; + int error; + + label = ecalloc(1, sizeof(*label)); + + /* + * Assemble the vdev configuration and store it in the label. + */ + poolconfig = pool_config_nvcreate(zfs); + vdevconfig = pool_disk_vdev_config_nvcreate(zfs); + nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig); + nvlist_destroy(vdevconfig); + + error = nvlist_export(poolconfig); + if (error != 0) + errc(1, error, "nvlist_export"); + nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist, + sizeof(label->vl_vdev_phys.vp_nvlist)); + nvlist_destroy(poolconfig); + + /* + * Fill out the uberblock. Just make each one the same. The embedded + * checksum is calculated in vdev_label_write(). + * + * Uberblock slots are spaced (1 << ashift) bytes apart. + */ + for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock); + uoff += (1 << zfs->ashift)) { + ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff); + ub->ub_magic = UBERBLOCK_MAGIC; + ub->ub_version = SPA_VERSION; + ub->ub_txg = TXG; + ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid; + ub->ub_timestamp = 0; + + ub->ub_software_version = SPA_VERSION; + ub->ub_mmp_magic = MMP_MAGIC; + ub->ub_mmp_delay = 0; + ub->ub_mmp_config = 0; + ub->ub_checkpoint_txg = 0; + /* Each uberblock points at the root block of the MOS. */ + objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp); + } + + /* + * Write out four copies of the label: two at the beginning of the vdev + * and two at the end. 
+ */ + for (int i = 0; i < VDEV_LABELS; i++) + vdev_label_write(zfs, i, label); + + free(label); +} + +/* + * Write out everything that was deferred while the pool was being built: the + * pool properties ZAP, the DSL state, the MOS itself and finally the vdev + * labels, whose uberblocks embed the MOS root block pointer. + */ +static void +pool_fini(zfs_opt_t *zfs) +{ + zap_write(zfs, zfs->poolprops); + dsl_write(zfs); + objset_write(zfs, zfs->mos); + pool_labels_write(zfs); +} + +/* + * Prepare to write the data blocks of "dnode", an object of "size" bytes. + * + * If "blksz" is zero a block size is chosen from the object size. The number + * of indirection levels is computed, space for all indirect blocks is reserved + * in "os", and the dnode's level count, maximum block ID and block size are + * filled in. The returned cursor is released by dnode_cursor_finish(). + */ +struct dnode_cursor * +dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode, + off_t size, off_t blksz) +{ + struct dnode_cursor *c; + uint64_t nbppindir, indlevel, ndatablks, nindblks; + + assert(dnode->dn_nblkptr == 1); + assert(blksz <= MAXBLOCKSIZE); + + if (blksz == 0) { + /* + * Must be between 1 << ashift and MAXBLOCKSIZE: round the + * object size up to a power of two and clamp it. + */ + blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift, + powerof2(size) ? size : (1ul << flsll(size)))); + } + assert(powerof2(blksz)); + + /* + * Do we need indirect blocks? Figure out how many levels are needed + * (indlevel == 1 means no indirect blocks) and how much space is needed + * (it has to be allocated up-front to break the dependency cycle + * described in objset_write()). + */ + ndatablks = size == 0 ? 0 : howmany(size, blksz); + nindblks = 0; + for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) { + nbppindir *= BLKPTR_PER_INDIR; + nindblks += howmany(ndatablks, indlevel * nbppindir); + } + assert(indlevel < INDIR_LEVELS); + + dnode->dn_nlevels = (uint8_t)indlevel; + dnode->dn_maxblkid = ndatablks > 0 ? 
ndatablks - 1 : 0; + dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; + + c = ecalloc(1, sizeof(*c)); + if (nindblks > 0) { + /* Every indirect block occupies MAXBLOCKSIZE bytes. */ + c->indspace = nindblks * MAXBLOCKSIZE; + c->indloc = objset_space_alloc(zfs, os, &c->indspace); + } + c->dnode = dnode; + c->dataoff = 0; + c->datablksz = blksz; + + return (c); +} + +/* + * Write out the in-memory indirect blocks of levels 1 through "levels" and + * clear their buffers. The block pointer for each written block is stored + * in the next level's in-memory indirect block or, for the topmost level, in + * the dnode itself. + */ +static void +_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels) +{ + blkptr_t *bp, *pbp; + void *buf; + uint64_t fill; + off_t blkid, blksz, loc; + + assert(levels > 0); + assert(levels <= c->dnode->dn_nlevels - 1); + + blksz = MAXBLOCKSIZE; + blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR; + for (int level = 1; level <= levels; level++) { + buf = c->inddir[level - 1]; + + if (level == c->dnode->dn_nlevels - 1) { + pbp = &c->dnode->dn_blkptr[0]; + } else { + uint64_t iblkid; + + iblkid = blkid & (BLKPTR_PER_INDIR - 1); + pbp = (blkptr_t *) + &c->inddir[level][iblkid * sizeof(blkptr_t)]; + } + + /* + * Space for indirect blocks is allocated up-front; see the + * comment in objset_write(). + */ + loc = c->indloc; + c->indloc += blksz; + assert(c->indspace >= blksz); + c->indspace -= blksz; + + /* The fill count is the sum over all child pointers. */ + bp = buf; + fill = 0; + for (size_t i = 0; i < BLKPTR_PER_INDIR; i++) + fill += BP_GET_FILL(&bp[i]); + + vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz, + loc, pbp); + memset(buf, 0, MAXBLOCKSIZE); + + blkid /= BLKPTR_PER_INDIR; + } +} + +/* + * Return the block pointer slot for the data block at byte offset "off", + * first flushing any indirect blocks that have been filled. Objects with a + * single level use the dnode's embedded block pointer directly. + */ +blkptr_t * +dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off) +{ + off_t blkid, l1id; + int levels; + + if (c->dnode->dn_nlevels == 1) { + assert(off < MAXBLOCKSIZE); + return (&c->dnode->dn_blkptr[0]); + } + + assert(off % c->datablksz == 0); + + /* Do we need to flush any full indirect blocks? 
*/ + if (off > 0) { + /* + * Count how many consecutive levels have just filled up: a level + * is full when the block ID is a multiple of BLKPTR_PER_INDIR. + */ + blkid = off / c->datablksz; + for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) { + if (blkid % BLKPTR_PER_INDIR != 0) + break; + blkid /= BLKPTR_PER_INDIR; + } + if (levels > 0) + _dnode_cursor_flush(zfs, c, levels); + } + + c->dataoff = off; + l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1); + return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]); +} + +/* + * Flush any remaining indirect blocks and free the cursor. All of the space + * reserved for indirect blocks must have been consumed by this point. + */ +void +dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c) +{ + int levels; + + levels = c->dnode->dn_nlevels - 1; + if (levels > 0) + _dnode_cursor_flush(zfs, c, levels); + assert(c->indspace == 0); + free(c); +} + +/* + * Entry point: create the ZFS image "image" from the directory "dir" and the + * fsnode tree rooted at "root". ZFS-specific options are taken from + * fsopts->fs_specific. Failure to open "dir" is fatal. + */ +void +zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) +{ + zfs_opt_t *zfs; + int dirfd; + + zfs = fsopts->fs_specific; + + /* + * Use a fixed seed to provide reproducible pseudo-random numbers for + * on-disk structures when needed (e.g., GUIDs, ZAP hash salts). + */ + srandom(1729); + + zfs_check_opts(fsopts); + + if (!zfs->nowarn) { + fprintf(stderr, + "ZFS support is currently considered experimental. 
" + "Do not use it for anything critical.\n"); + } + + dirfd = open(dir, O_DIRECTORY | O_RDONLY); + if (dirfd < 0) + err(1, "open(%s)", dir); + + /* + * Set up the vdev and pool, build the file systems, then write out + * the pool metadata. dirfd is handed to fs_build() and is not closed + * in this function. + */ + vdev_init(zfs, image); + pool_init(zfs); + fs_build(zfs, dirfd, root); + pool_fini(zfs); + vdev_fini(zfs); +} diff --git a/usr.sbin/makefs/zfs/Makefile.inc b/usr.sbin/makefs/zfs/Makefile.inc new file mode 100644 index 000000000000..bebe8c322035 --- /dev/null +++ b/usr.sbin/makefs/zfs/Makefile.inc @@ -0,0 +1,12 @@ +.PATH: ${SRCDIR}/zfs +.PATH: ${SRCTOP}/stand/libsa/zfs + +SRCS+= dsl.c \ + fs.c \ + objset.c \ + vdev.c \ + zap.c + +SRCS+= nvlist.c + +CFLAGS.nvlist.c+= -I${SRCTOP}/stand/libsa -Wno-cast-qual diff --git a/usr.sbin/makefs/zfs/dsl.c b/usr.sbin/makefs/zfs/dsl.c new file mode 100644 index 000000000000..5f473e557c02 --- /dev/null +++ b/usr.sbin/makefs/zfs/dsl.c @@ -0,0 +1,598 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * NOTE(review): the header names of the system #include directives below + * appear to have been lost in extraction; confirm against the original file. + */ +#include +#include + +#include + +#include "makefs.h" +#include "zfs.h" + +/* + * In-memory state for a DSL dataset. "phys" points at the on-disk + * representation that is filled in as the image is built. + */ +typedef struct zfs_dsl_dataset { + zfs_objset_t *os; /* referenced objset, may be null */ + dsl_dataset_phys_t *phys; /* on-disk representation */ + uint64_t dsid; /* DSL dataset dnode */ + + struct zfs_dsl_dir *dir; /* containing parent */ +} zfs_dsl_dataset_t; + +/* Singly-linked tail queue of DSL directories (a directory's children). */ +typedef STAILQ_HEAD(zfs_dsl_dir_list, zfs_dsl_dir) zfs_dsl_dir_list_t; + +/* + * In-memory state for a DSL directory, a node in the dataset namespace tree. + */ +typedef struct zfs_dsl_dir { + char *fullname; /* full dataset name */ + char *name; /* basename(fullname) */ + dsl_dir_phys_t *phys; /* on-disk representation */ + nvlist_t *propsnv; /* properties saved in propszap */ + + zfs_dsl_dataset_t *headds; /* principal dataset, may be null */ + + uint64_t dirid; /* DSL directory dnode */ + zfs_zap_t *propszap; /* dataset properties */ + zfs_zap_t *childzap; /* child directories */ + + /* DSL directory tree linkage. 
*/ + struct zfs_dsl_dir *parent; + zfs_dsl_dir_list_t children; + STAILQ_ENTRY(zfs_dsl_dir) next; +} zfs_dsl_dir_t; + +static zfs_dsl_dir_t *dsl_dir_alloc(zfs_opt_t *zfs, const char *name); +static zfs_dsl_dataset_t *dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir); + +static int +nvlist_find_string(nvlist_t *nvl, const char *key, char **retp) +{ + char *str; + int error, len; + + error = nvlist_find(nvl, key, DATA_TYPE_STRING, NULL, &str, &len); + if (error == 0) { + *retp = ecalloc(1, len + 1); + memcpy(*retp, str, len); + } + return (error); +} + +static int +nvlist_find_uint64(nvlist_t *nvl, const char *key, uint64_t *retp) +{ + return (nvlist_find(nvl, key, DATA_TYPE_UINT64, NULL, retp, NULL)); +} + +/* + * Return an allocated string containing the head dataset's mountpoint, + * including the root path prefix. + * + * If the dataset has a mountpoint property, it is returned. Otherwise we have + * to follow ZFS' inheritance rules. + */ +char * +dsl_dir_get_mountpoint(zfs_opt_t *zfs, zfs_dsl_dir_t *dir) +{ + zfs_dsl_dir_t *pdir; + char *mountpoint, *origmountpoint; + + if (nvlist_find_string(dir->propsnv, "mountpoint", &mountpoint) == 0) { + if (strcmp(mountpoint, "none") == 0) + return (NULL); + + /* + * nvlist_find_string() does not make a copy. + */ + mountpoint = estrdup(mountpoint); + } else { + /* + * If we don't have a mountpoint, it's inherited from one of our + * ancestors. Walk up the hierarchy until we find it, building + * up our mountpoint along the way. The mountpoint property is + * always set for the root dataset. + */ + for (pdir = dir->parent, mountpoint = estrdup(dir->name);;) { + origmountpoint = mountpoint; + + if (nvlist_find_string(pdir->propsnv, "mountpoint", + &mountpoint) == 0) { + easprintf(&mountpoint, "%s%s%s", mountpoint, + mountpoint[strlen(mountpoint) - 1] == '/' ? 
+ "" : "/", origmountpoint); + free(origmountpoint); + break; + } + + easprintf(&mountpoint, "%s/%s", pdir->name, + origmountpoint); + free(origmountpoint); + pdir = pdir->parent; + } + } + assert(mountpoint[0] == '/'); + assert(strstr(mountpoint, zfs->rootpath) == mountpoint); + + return (mountpoint); +} + +int +dsl_dir_get_canmount(zfs_dsl_dir_t *dir, uint64_t *canmountp) +{ + return (nvlist_find_uint64(dir->propsnv, "canmount", canmountp)); +} + +/* + * Handle dataset properties that we know about; stash them into an nvlist to be + * written later to the properties ZAP object. + * + * If the set of properties we handle grows too much, we should probably explore + * using libzfs to manage them. + */ +static void +dsl_dir_set_prop(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, const char *key, + const char *val) +{ + nvlist_t *nvl; + + nvl = dir->propsnv; + if (val == NULL || val[0] == '\0') + errx(1, "missing value for property `%s'", key); + if (nvpair_find(nvl, key) != NULL) + errx(1, "property `%s' already set", key); + + if (strcmp(key, "mountpoint") == 0) { + if (strcmp(val, "none") != 0) { + if (val[0] != '/') + errx(1, "mountpoint `%s' is not absolute", val); + if (strcmp(val, zfs->rootpath) != 0 && + strcmp(zfs->rootpath, "/") != 0 && + (strstr(val, zfs->rootpath) != val || + val[strlen(zfs->rootpath)] != '/')) { + errx(1, "mountpoint `%s' is not prefixed by " + "the root path `%s'", val, zfs->rootpath); + } + } + nvlist_add_string(nvl, key, val); + } else if (strcmp(key, "atime") == 0 || strcmp(key, "exec") == 0 || + strcmp(key, "setuid") == 0) { + if (strcmp(val, "on") == 0) + nvlist_add_uint64(nvl, key, 1); + else if (strcmp(val, "off") == 0) + nvlist_add_uint64(nvl, key, 0); + else + errx(1, "invalid value `%s' for %s", val, key); + } else if (strcmp(key, "canmount") == 0) { + if (strcmp(val, "noauto") == 0) + nvlist_add_uint64(nvl, key, 2); + else if (strcmp(val, "on") == 0) + nvlist_add_uint64(nvl, key, 1); + else if (strcmp(val, "off") == 0) + 
nvlist_add_uint64(nvl, key, 0); + else + errx(1, "invalid value `%s' for %s", val, key); + } else { + errx(1, "unknown property `%s'", key); + } +} + +/* + * Create one of the pool's metadata DSL directories, "<poolname>/<name>". + */ +static zfs_dsl_dir_t * +dsl_metadir_alloc(zfs_opt_t *zfs, const char *name) +{ + zfs_dsl_dir_t *dir; + char *path; + + easprintf(&path, "%s/%s", zfs->poolname, name); + dir = dsl_dir_alloc(zfs, path); + free(path); + return (dir); +} + +/* + * Create the $ORIGIN directory together with its two datasets (originds and + * snapds) and the clones ZAP. + */ +static void +dsl_origindir_init(zfs_opt_t *zfs) +{ + dnode_phys_t *clones; + uint64_t clonesid; + + zfs->origindsldir = dsl_metadir_alloc(zfs, "$ORIGIN"); + zfs->originds = dsl_dataset_alloc(zfs, zfs->origindsldir); + zfs->snapds = dsl_dataset_alloc(zfs, zfs->origindsldir); + + clones = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_CLONES, &clonesid); + zfs->cloneszap = zap_alloc(zfs->mos, clones); + zfs->origindsldir->phys->dd_clones = clonesid; +} + +/* + * Create the DSL directory tree: the root directory and dataset, the $MOS, + * $FREE and $ORIGIN metadata directories, and one directory plus head dataset + * for every user-specified dataset. + * + * Each dataset description has the form "name[;key=value[;key=value...]]". + * A description naming the pool itself only sets properties on the root + * dataset. + */ +void +dsl_init(zfs_opt_t *zfs) +{ + zfs_dsl_dir_t *dir; + struct dataset_desc *d; + const char *dspropdelim; + + dspropdelim = ";"; + + zfs->rootdsldir = dsl_dir_alloc(zfs, NULL); + + nvlist_add_uint64(zfs->rootdsldir->propsnv, "compression", + ZIO_COMPRESS_OFF); + + zfs->rootds = dsl_dataset_alloc(zfs, zfs->rootdsldir); + zfs->rootdsldir->headds = zfs->rootds; + + zfs->mosdsldir = dsl_metadir_alloc(zfs, "$MOS"); + zfs->freedsldir = dsl_metadir_alloc(zfs, "$FREE"); + dsl_origindir_init(zfs); + + /* + * Go through the list of user-specified datasets and create DSL objects + * for them. + */ + STAILQ_FOREACH(d, &zfs->datasetdescs, next) { + char *dsname, *next, *params, *param, *nextparam; + + /* + * Split off the dataset name; strsep() leaves params pointing + * at the property list, or NULL if there is none. + */ + params = d->params; + dsname = strsep(&params, dspropdelim); + + if (strcmp(dsname, zfs->poolname) == 0) { + /* + * This is the root dataset; it's already created, so + * we're just setting options. + */ + dir = zfs->rootdsldir; + } else { + /* + * This dataset must be a child of the root dataset. 
+ */ + if (strstr(dsname, zfs->poolname) != dsname || + (next = strchr(dsname, '/')) == NULL || + (size_t)(next - dsname) != strlen(zfs->poolname)) { + errx(1, "dataset `%s' must be a child of `%s'", + dsname, zfs->poolname); + } + dir = dsl_dir_alloc(zfs, dsname); + dir->headds = dsl_dataset_alloc(zfs, dir); + } + + /* Apply each "key=value" pair to the directory. */ + for (nextparam = param = params; nextparam != NULL;) { + char *key, *val; + + param = strsep(&nextparam, dspropdelim); + + key = val = param; + key = strsep(&val, "="); + dsl_dir_set_prop(zfs, dir, key, val); + } + } + + /* + * Set the root dataset's mount point if the user didn't override the + * default. + */ + if (nvpair_find(zfs->rootdsldir->propsnv, "mountpoint") == NULL) { + nvlist_add_string(zfs->rootdsldir->propsnv, "mountpoint", + zfs->rootpath); + } +} + +/* Return the object ID of the DSL directory's dnode. */ +uint64_t +dsl_dir_id(zfs_dsl_dir_t *dir) +{ + return (dir->dirid); +} + +/* + * Return the object ID of the directory's head dataset. The directory must + * have a head dataset. + */ +uint64_t +dsl_dir_dataset_id(zfs_dsl_dir_t *dir) +{ + return (dir->headds->dsid); +} + +/* + * Invoke "cb" on every directory in the tree rooted at "dsldir", visiting + * children before their parent (post-order). + */ +static void +dsl_dir_foreach_post(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, + void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg) +{ + zfs_dsl_dir_t *cdsldir; + + STAILQ_FOREACH(cdsldir, &dsldir->children, next) { + dsl_dir_foreach_post(zfs, cdsldir, cb, arg); + } + cb(zfs, dsldir, arg); +} + +/* + * Used when the caller doesn't care about the order one way or another. + */ +void +dsl_dir_foreach(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, + void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg) +{ + dsl_dir_foreach_post(zfs, dsldir, cb, arg); +} + +/* Return the directory's full dataset name; the string is not copied. */ +const char * +dsl_dir_fullname(const zfs_dsl_dir_t *dir) +{ + return (dir->fullname); +} + +/* + * Create a DSL directory, which is effectively an entry in the ZFS namespace. + * We always create a root DSL directory, whose name is the pool's name, and + * several metadata directories. + * + * Each directory has two ZAP objects, one pointing to child directories, and + * one for properties (which are inherited by children unless overridden). 
+ * Directories typically reference a DSL dataset, the "head dataset", which + * points to an object set. + */ +static zfs_dsl_dir_t * +dsl_dir_alloc(zfs_opt_t *zfs, const char *name) +{ + zfs_dsl_dir_list_t l, *lp; + zfs_dsl_dir_t *dir, *parent; + dnode_phys_t *dnode; + char *dirname, *nextdir, *origname; + uint64_t childid, propsid; + + dir = ecalloc(1, sizeof(*dir)); + + dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DIR, + DMU_OT_DSL_DIR, sizeof(dsl_dir_phys_t), &dir->dirid); + dir->phys = (dsl_dir_phys_t *)DN_BONUS(dnode); + + dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_PROPS, &propsid); + dir->propszap = zap_alloc(zfs->mos, dnode); + + dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DIR_CHILD_MAP, + &childid); + dir->childzap = zap_alloc(zfs->mos, dnode); + + dir->propsnv = nvlist_create(NV_UNIQUE_NAME); + STAILQ_INIT(&dir->children); + + dir->phys->dd_child_dir_zapobj = childid; + dir->phys->dd_props_zapobj = propsid; + + if (name == NULL) { + /* + * This is the root DSL directory. + */ + dir->name = estrdup(zfs->poolname); + dir->fullname = estrdup(zfs->poolname); + dir->parent = NULL; + dir->phys->dd_parent_obj = 0; + + assert(zfs->rootdsldir == NULL); + zfs->rootdsldir = dir; + return (dir); + } + + /* + * Insert the new directory into the hierarchy. Currently this must be + * done in order, e.g., when creating pool/a/b, pool/a must already + * exist. 
+ */ + STAILQ_INIT(&l); + STAILQ_INSERT_HEAD(&l, zfs->rootdsldir, next); + origname = dirname = nextdir = estrdup(name); + for (lp = &l;; lp = &parent->children) { + dirname = strsep(&nextdir, "/"); + if (nextdir == NULL) + break; + + STAILQ_FOREACH(parent, lp, next) { + if (strcmp(parent->name, dirname) == 0) + break; + } + if (parent == NULL) { + errx(1, "no parent at `%s' for filesystem `%s'", + dirname, name); + } + } + + dir->fullname = estrdup(name); + dir->name = estrdup(dirname); + free(origname); + STAILQ_INSERT_TAIL(lp, dir, next); + zap_add_uint64(parent->childzap, dir->name, dir->dirid); + + dir->parent = parent; + dir->phys->dd_parent_obj = parent->dirid; + return (dir); +} + +void +dsl_dir_size_set(zfs_dsl_dir_t *dir, uint64_t bytes) +{ + dir->phys->dd_used_bytes = bytes; + dir->phys->dd_compressed_bytes = bytes; + dir->phys->dd_uncompressed_bytes = bytes; +} + +/* + * Convert dataset properties into entries in the DSL directory's properties + * ZAP. + */ +static void +dsl_dir_finalize_props(zfs_dsl_dir_t *dir) +{ + for (nvp_header_t *nvh = NULL; + (nvh = nvlist_next_nvpair(dir->propsnv, nvh)) != NULL;) { + nv_string_t *nvname; + nv_pair_data_t *nvdata; + const char *name; + + nvname = (nv_string_t *)(nvh + 1); + nvdata = (nv_pair_data_t *)(&nvname->nv_data[0] + + NV_ALIGN4(nvname->nv_size)); + + name = nvstring_get(nvname); + switch (nvdata->nv_type) { + case DATA_TYPE_UINT64: { + uint64_t val; + + memcpy(&val, &nvdata->nv_data[0], sizeof(uint64_t)); + zap_add_uint64(dir->propszap, name, val); + break; + } + case DATA_TYPE_STRING: { + nv_string_t *nvstr; + + nvstr = (nv_string_t *)&nvdata->nv_data[0]; + zap_add_string(dir->propszap, name, + nvstring_get(nvstr)); + break; + } + default: + assert(0); + } + } +} + +static void +dsl_dir_finalize(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, void *arg __unused) +{ + char key[32]; + zfs_dsl_dir_t *cdir; + dnode_phys_t *snapnames; + zfs_dsl_dataset_t *headds; + zfs_objset_t *os; + uint64_t bytes, snapnamesid; + + 
dsl_dir_finalize_props(dir); + zap_write(zfs, dir->propszap); + zap_write(zfs, dir->childzap); + + /* + * Directories without a head dataset, or whose head dataset has no + * objset attached, need no further processing. + */ + headds = dir->headds; + if (headds == NULL) + return; + os = headds->os; + if (os == NULL) + return; + + /* The snapshot names ZAP is written out empty. */ + snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP, + &snapnamesid); + zap_write(zfs, zap_alloc(zfs->mos, snapnames)); + + dir->phys->dd_head_dataset_obj = headds->dsid; + dir->phys->dd_clone_parent_obj = zfs->snapds->dsid; + headds->phys->ds_prev_snap_obj = zfs->snapds->dsid; + headds->phys->ds_snapnames_zapobj = snapnamesid; + objset_root_blkptr_copy(os, &headds->phys->ds_bp); + + /* + * Register this dataset as a child of snapds in the clones ZAP; the + * key is the dataset ID in hexadecimal. + */ + zfs->snapds->phys->ds_num_children++; + snprintf(key, sizeof(key), "%jx", (uintmax_t)headds->dsid); + zap_add_uint64(zfs->cloneszap, key, headds->dsid); + + bytes = objset_space(os); + headds->phys->ds_used_bytes = bytes; + headds->phys->ds_uncompressed_bytes = bytes; + headds->phys->ds_compressed_bytes = bytes; + + /* The directory's size also includes that of its child directories. */ + STAILQ_FOREACH(cdir, &dir->children, next) + bytes += cdir->phys->dd_used_bytes; + dsl_dir_size_set(dir, bytes); +} + +/* + * Finalize the DSL layer: finalize every directory bottom-up, then link up + * the $ORIGIN directory's dataset and snapshot and write out the clones ZAP. + */ +void +dsl_write(zfs_opt_t *zfs) +{ + zfs_zap_t *snapnameszap; + dnode_phys_t *snapnames; + uint64_t snapmapid; + + /* + * Perform accounting, starting from the leaves of the DSL directory + * tree. Accounting for $MOS is done later, once we've finished + * allocating space. 
+ */ + dsl_dir_foreach_post(zfs, zfs->rootdsldir, dsl_dir_finalize, NULL); + + snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP, + &snapmapid); + snapnameszap = zap_alloc(zfs->mos, snapnames); + zap_add_uint64(snapnameszap, "$ORIGIN", zfs->snapds->dsid); + zap_write(zfs, snapnameszap); + + zfs->origindsldir->phys->dd_head_dataset_obj = zfs->originds->dsid; + zfs->originds->phys->ds_prev_snap_obj = zfs->snapds->dsid; + zfs->originds->phys->ds_snapnames_zapobj = snapmapid; + + zfs->snapds->phys->ds_next_snap_obj = zfs->originds->dsid; + /* + * At least one dataset must have been finalized above; the extra + * child counted here is presumably the $ORIGIN head dataset. + */ + assert(zfs->snapds->phys->ds_num_children > 0); + zfs->snapds->phys->ds_num_children++; + + zap_write(zfs, zfs->cloneszap); + + /* XXX-MJ dirs and datasets are leaked */ +} + +/* + * Attach the objset "os" to the directory's head dataset and write it out. + * The directory must have a head dataset. + */ +void +dsl_dir_dataset_write(zfs_opt_t *zfs, zfs_objset_t *os, zfs_dsl_dir_t *dir) +{ + dir->headds->os = os; + objset_write(zfs, os); +} + +/* Return true if the directory has a head dataset. */ +bool +dsl_dir_has_dataset(zfs_dsl_dir_t *dir) +{ + return (dir->headds != NULL); +} + +/* Return true if the directory has a head dataset with an objset attached. */ +bool +dsl_dir_dataset_has_objset(zfs_dsl_dir_t *dir) +{ + return (dsl_dir_has_dataset(dir) && dir->headds->os != NULL); +} + +/* + * Allocate a DSL dataset belonging to "dir", along with its deadlist object + * (written out immediately as an empty ZAP). The caller decides whether the + * dataset becomes the directory's head dataset. + */ +static zfs_dsl_dataset_t * +dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir) +{ + zfs_dsl_dataset_t *ds; + dnode_phys_t *dnode; + uint64_t deadlistid; + + ds = ecalloc(1, sizeof(*ds)); + + dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DATASET, + DMU_OT_DSL_DATASET, sizeof(dsl_dataset_phys_t), &ds->dsid); + ds->phys = (dsl_dataset_phys_t *)DN_BONUS(dnode); + + dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DEADLIST, + DMU_OT_DEADLIST_HDR, sizeof(dsl_deadlist_phys_t), &deadlistid); + zap_write(zfs, zap_alloc(zfs->mos, dnode)); + + ds->phys->ds_dir_obj = dir->dirid; + ds->phys->ds_deadlist_obj = deadlistid; + ds->phys->ds_creation_txg = TXG - 1; + /* + * NOTE(review): dsl_origindir_init() assigns zfs->snapds from this + * function's return value, so while snapds itself is being allocated + * this comparison looks like it is always true -- confirm whether + * snapds is meant to be excluded here. + */ + if (ds != zfs->snapds) + ds->phys->ds_prev_snap_txg = TXG - 1; + ds->phys->ds_guid = ((uint64_t)random() << 32) | random(); + ds->dir = dir; + + return (ds); +} diff --git a/usr.sbin/makefs/zfs/fs.c b/usr.sbin/makefs/zfs/fs.c new file mode 
100644 index 000000000000..15025ec5447d --- /dev/null +++ b/usr.sbin/makefs/zfs/fs.c @@ -0,0 +1,981 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * NOTE(review): the header names of the system #include directives below + * appear to have been lost in extraction; confirm against the original file. + */ +#include +#include + +#include +#include +#include +#include + +#include + +#include "makefs.h" +#include "zfs.h" + +/* + * Description of a single system attribute (SA). + */ +typedef struct { + const char *name; /* attribute name */ + unsigned int id; /* attribute ID */ + uint16_t size; /* size in bytes */ + sa_bswap_type_t bs; /* byteswap type */ +} zfs_sattr_t; + +/* + * Per-filesystem state: the backing object set plus the system attribute + * tables used when populating dnodes. + */ +typedef struct zfs_fs { + zfs_objset_t *os; + + /* Offset table for system attributes, indexed by a zpl_attr_t. 
*/ + uint16_t *saoffs; + size_t sacnt; + const zfs_sattr_t *satab; +} zfs_fs_t; + +/* + * The order of the attributes doesn't matter, this is simply the one hard-coded + * by OpenZFS, based on a zdb dump of the SA_REGISTRY table. + */ +typedef enum zpl_attr { + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_GEN, + ZPL_MODE, + ZPL_SIZE, + ZPL_PARENT, + ZPL_LINKS, + ZPL_XATTR, + ZPL_RDEV, + ZPL_FLAGS, + ZPL_UID, + ZPL_GID, + ZPL_PAD, + ZPL_ZNODE_ACL, + ZPL_DACL_COUNT, + ZPL_SYMLINK, + ZPL_SCANSTAMP, + ZPL_DACL_ACES, + ZPL_DXATTR, + ZPL_PROJID, +} zpl_attr_t; + +/* + * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t. + * + * Entries appear in zpl_attr_t order. A size of zero marks a variable-sized + * attribute. + */ +static const zfs_sattr_t zpl_attrs[] = { +#define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b } + _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY), + _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL), + _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY), + _ZPL_ATTR(ZPL_PROJID, 
sizeof(uint64_t), SA_UINT64_ARRAY), +#undef _ZPL_ATTR +}; + +/* + * This layout matches that of a filesystem created using OpenZFS on FreeBSD. + * It need not match in general, but FreeBSD's loader doesn't bother parsing the + * layout and just hard-codes attribute offsets. + */ +static const sa_attr_type_t zpl_attr_layout[] = { + ZPL_MODE, + ZPL_SIZE, + ZPL_GEN, + ZPL_UID, + ZPL_GID, + ZPL_PARENT, + ZPL_FLAGS, + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_LINKS, + ZPL_DACL_COUNT, + ZPL_DACL_ACES, + ZPL_SYMLINK, +}; + +/* + * Keys for the ZPL attribute tables in the SA layout ZAP. The first two + * indices are reserved for legacy attribute encoding. + */ +#define SA_LAYOUT_INDEX_DEFAULT 2 +#define SA_LAYOUT_INDEX_SYMLINK 3 + +/* One entry on the stack of directories currently being populated. */ +struct fs_populate_dir { + SLIST_ENTRY(fs_populate_dir) next; + int dirfd; + uint64_t objid; + zfs_zap_t *zap; +}; + +/* State threaded through the callbacks that populate a filesystem. */ +struct fs_populate_arg { + zfs_opt_t *zfs; + zfs_fs_t *fs; /* owning filesystem */ + int dirfd; /* current directory fd */ + uint64_t rootdirid; /* root directory dnode ID */ + SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */ +}; + +static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int); + +/* Return true if "cur" is a "." node, which marks the root of a tree. */ +static bool +fsnode_isroot(const fsnode *cur) +{ + return (strcmp(cur->name, ".") == 0); +} + +/* + * Visit each node in a directory hierarchy, in pre-order depth-first order. 
+ */ +static void +fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg) +{ + assert(root->type == S_IFDIR); + + for (fsnode *cur = root; cur != NULL; cur = cur->next) { + assert(cur->type == S_IFREG || cur->type == S_IFDIR || + cur->type == S_IFLNK); + + if (cb(cur, arg) == 0) + continue; + if (cur->type == S_IFDIR && cur->child != NULL) + fsnode_foreach(cur->child, cb, arg); + } +} + +static void +fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid) +{ + struct fs_populate_dir *dir; + uint64_t type; + + switch (cur->type) { + case S_IFREG: + type = DT_REG; + break; + case S_IFDIR: + type = DT_DIR; + break; + case S_IFLNK: + type = DT_LNK; + break; + default: + assert(0); + } + + dir = SLIST_FIRST(&arg->dirs); + zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid)); +} + +static void +fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind, + size_t *szp) +{ + assert(ind < fs->sacnt); + assert(fs->saoffs[ind] != 0xffff); + + memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size); + *szp += fs->satab[ind].size; +} + +static void +fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val, + size_t valsz, size_t varoff, uint16_t ind, size_t *szp) +{ + assert(ind < fs->sacnt); + assert(fs->saoffs[ind] != 0xffff); + assert(fs->satab[ind].size == 0); + + memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz); + *szp += valsz; +} + +static void +fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur, + dnode_phys_t *dnode) +{ + char target[PATH_MAX]; + zfs_fs_t *fs; + zfs_ace_hdr_t aces[3]; + struct stat *sb; + sa_hdr_phys_t *sahdr; + uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid; + char *attrbuf; + size_t bonussz, hdrsz; + int layout; + + assert(dnode->dn_bonustype == DMU_OT_SA); + assert(dnode->dn_nblkptr == 1); + + fs = arg->fs; + sb = &cur->inode->st; + + switch (cur->type) { + case S_IFREG: + layout = SA_LAYOUT_INDEX_DEFAULT; + links = 
cur->inode->nlink; + objsize = sb->st_size; + parent = SLIST_FIRST(&arg->dirs)->objid; + break; + case S_IFDIR: + layout = SA_LAYOUT_INDEX_DEFAULT; + links = 1; /* .. */ + objsize = 1; /* .. */ + + /* + * The size of a ZPL directory is the number of entries + * (including "." and ".."), and the link count is the number of + * entries which are directories (including "." and ".."). + */ + for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child; + c != NULL; c = c->next) { + if (c->type == S_IFDIR) + links++; + objsize++; + } + + /* The root directory is its own parent. */ + parent = SLIST_EMPTY(&arg->dirs) ? + arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid; + break; + case S_IFLNK: { + ssize_t n; + + if ((n = readlinkat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, + target, sizeof(target) - 1)) == -1) + err(1, "readlinkat(%s)", cur->name); + target[n] = '\0'; + + layout = SA_LAYOUT_INDEX_SYMLINK; + links = 1; + objsize = strlen(target); + parent = SLIST_FIRST(&arg->dirs)->objid; + break; + } + default: + assert(0); + } + + daclcount = nitems(aces); + flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED | + ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */ + gen = 1; + gid = sb->st_gid; + mode = sb->st_mode; + uid = sb->st_uid; + + memset(aces, 0, sizeof(aces)); + aces[0].z_flags = ACE_OWNER; + aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER | + ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL | + ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + if ((mode & S_IRUSR) != 0) + aces[0].z_access_mask |= ACE_READ_DATA; + if ((mode & S_IWUSR) != 0) + aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; + if ((mode & S_IXUSR) != 0) + aces[0].z_access_mask |= ACE_EXECUTE; + + aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP; + aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | 
ACE_SYNCHRONIZE; + if ((mode & S_IRGRP) != 0) + aces[1].z_access_mask |= ACE_READ_DATA; + if ((mode & S_IWGRP) != 0) + aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; + if ((mode & S_IXGRP) != 0) + aces[1].z_access_mask |= ACE_EXECUTE; + + aces[2].z_flags = ACE_EVERYONE; + aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; + if ((mode & S_IROTH) != 0) + aces[2].z_access_mask |= ACE_READ_DATA; + if ((mode & S_IWOTH) != 0) + aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; + if ((mode & S_IXOTH) != 0) + aces[2].z_access_mask |= ACE_EXECUTE; + + switch (layout) { + case SA_LAYOUT_INDEX_DEFAULT: + /* At most one variable-length attribute. */ + hdrsz = sizeof(uint64_t); + break; + case SA_LAYOUT_INDEX_SYMLINK: + /* At most five variable-length attributes. */ + hdrsz = sizeof(uint64_t) * 2; + break; + default: + assert(0); + } + + sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode); + sahdr->sa_magic = SA_MAGIC; + SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz); + + bonussz = SA_HDR_SIZE(sahdr); + attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr); + + fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz); + fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz); + fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz); + fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz); + fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz); + fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz); + fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz); + fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz); + fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz); + + /* + * We deliberately set atime = mtime here to ensure that images are + * reproducible. 
+ */ + assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz); + assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz); + assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz); + assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size); + fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz); + + fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0, + ZPL_DACL_ACES, &bonussz); + sahdr->sa_lengths[0] = sizeof(aces); + + if (cur->type == S_IFLNK) { + assert(layout == SA_LAYOUT_INDEX_SYMLINK); + /* Need to use a spill block pointer if the target is long. */ + assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN); + fs_populate_varszattr(fs, attrbuf, target, objsize, + sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz); + sahdr->sa_lengths[1] = (uint16_t)objsize; + } + + dnode->dn_bonuslen = bonussz; +} + +static void +fs_populate_file(fsnode *cur, struct fs_populate_arg *arg) +{ + struct dnode_cursor *c; + dnode_phys_t *dnode; + zfs_opt_t *zfs; + char *buf; + uint64_t dnid; + ssize_t n; + size_t bufsz; + off_t size, target; + int fd; + + assert(cur->type == S_IFREG); + assert((cur->inode->flags & FI_ROOT) == 0); + + zfs = arg->zfs; + + assert(cur->inode->ino != 0); + if ((cur->inode->flags & FI_ALLOCATED) != 0) { + /* + * This is a hard link of an existing file. 
+ * + * XXX-MJ need to check whether it crosses datasets, add a test + * case for that + */ + fs_populate_dirent(arg, cur, cur->inode->ino); + return; + } + + dnode = objset_dnode_bonus_alloc(arg->fs->os, + DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); + cur->inode->ino = dnid; + cur->inode->flags |= FI_ALLOCATED; + + fd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, O_RDONLY); + if (fd == -1) + err(1, "openat(%s)", cur->name); + + buf = zfs->filebuf; + bufsz = sizeof(zfs->filebuf); + size = cur->inode->st.st_size; + c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0); + for (off_t foff = 0; foff < size; foff += target) { + off_t loc, sofar; + + /* + * Fill up our buffer, handling partial reads. + * + * It might be profitable to use copy_file_range(2) here. + */ + sofar = 0; + target = MIN(size - foff, (off_t)bufsz); + do { + n = read(fd, buf + sofar, target); + if (n < 0) + err(1, "reading from '%s'", cur->name); + if (n == 0) + errx(1, "unexpected EOF reading '%s'", + cur->name); + sofar += n; + } while (sofar < target); + + if (target < (off_t)bufsz) + memset(buf + target, 0, bufsz - target); + + loc = objset_space_alloc(zfs, arg->fs->os, &target); + vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, target, loc, + dnode_cursor_next(zfs, c, foff)); + } + if (close(fd) != 0) + err(1, "close"); + dnode_cursor_finish(zfs, c); + + fs_populate_sattrs(arg, cur, dnode); + fs_populate_dirent(arg, cur, dnid); +} + +static void +fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg) +{ + dnode_phys_t *dnode; + zfs_objset_t *os; + uint64_t dnid; + int dirfd; + + assert(cur->type == S_IFDIR); + assert((cur->inode->flags & FI_ALLOCATED) == 0); + + os = arg->fs->os; + + dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_SA, 0, &dnid); + + /* + * Add an entry to the parent directory and open this directory. 
+ */ + if (!SLIST_EMPTY(&arg->dirs)) { + fs_populate_dirent(arg, cur, dnid); + dirfd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, + O_DIRECTORY); + if (dirfd < 0) + err(1, "open(%s)", cur->name); + } else { + arg->rootdirid = dnid; + dirfd = arg->dirfd; + } + + /* + * Set ZPL attributes. + */ + fs_populate_sattrs(arg, cur, dnode); + + /* + * If this is a root directory, then its children belong to a different + * dataset and this directory remains empty in the current objset. + */ + if ((cur->inode->flags & FI_ROOT) == 0) { + struct fs_populate_dir *dir; + + dir = ecalloc(1, sizeof(*dir)); + dir->dirfd = dirfd; + dir->objid = dnid; + dir->zap = zap_alloc(os, dnode); + SLIST_INSERT_HEAD(&arg->dirs, dir, next); + } else { + zap_write(arg->zfs, zap_alloc(os, dnode)); + fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd); + } +} + +static void +fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg) +{ + dnode_phys_t *dnode; + uint64_t dnid; + + assert(cur->type == S_IFLNK); + assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0); + + dnode = objset_dnode_bonus_alloc(arg->fs->os, + DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); + + fs_populate_dirent(arg, cur, dnid); + + fs_populate_sattrs(arg, cur, dnode); +} + +static int +fs_foreach_populate(fsnode *cur, void *_arg) +{ + struct fs_populate_arg *arg; + struct fs_populate_dir *dir; + int ret; + + arg = _arg; + switch (cur->type) { + case S_IFREG: + fs_populate_file(cur, arg); + break; + case S_IFDIR: + if (fsnode_isroot(cur)) + break; + fs_populate_dir(cur, arg); + break; + case S_IFLNK: + fs_populate_symlink(cur, arg); + break; + default: + assert(0); + } + + ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1; + + if (cur->next == NULL && + (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) { + /* + * We reached a terminal node in a subtree. Walk back up and + * write out directories. 
We're done once we hit the root of a + * dataset or find a level where we're not on the edge of the + * tree. + */ + do { + dir = SLIST_FIRST(&arg->dirs); + SLIST_REMOVE_HEAD(&arg->dirs, next); + zap_write(arg->zfs, dir->zap); + if (dir->dirfd != -1 && close(dir->dirfd) != 0) + err(1, "close"); + free(dir); + cur = cur->parent; + } while (cur != NULL && cur->next == NULL && + (cur->inode->flags & FI_ROOT) == 0); + } + + return (ret); +} + +static void +fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index, + const sa_attr_type_t layout[], size_t sacnt) +{ + char ti[16]; + + assert(sizeof(layout[0]) == 2); + + snprintf(ti, sizeof(ti), "%u", index); + zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt, + (const uint8_t *)layout); +} + +/* + * Initialize system attribute tables. + * + * There are two elements to this. First, we write the zpl_attrs[] and + * zpl_attr_layout[] tables to disk. Then we create a lookup table which + * allows us to set file attributes quickly. + */ +static uint64_t +fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs) +{ + zfs_zap_t *sazap, *salzap, *sarzap; + zfs_objset_t *os; + dnode_phys_t *saobj, *salobj, *sarobj; + uint64_t saobjid, salobjid, sarobjid; + uint16_t offset; + + os = fs->os; + + /* + * The on-disk tables are stored in two ZAP objects, the registry object + * and the layout object. Individual attributes are described by + * entries in the registry object; for example, the value for the + * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute. + * The attributes of a file are ordered according to one of the layouts + * defined in the layout object. The master node object is simply used + * to locate the registry and layout objects. 
+ */ + saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid); + salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid); + sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid); + + sarzap = zap_alloc(os, sarobj); + for (size_t i = 0; i < nitems(zpl_attrs); i++) { + const zfs_sattr_t *sa; + uint64_t attr; + + attr = 0; + sa = &zpl_attrs[i]; + SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs); + zap_add_uint64(sarzap, sa->name, attr); + } + zap_write(zfs, sarzap); + + /* + * Layouts are arrays of indices into the registry. We define two + * layouts for use by the ZPL, one for non-symlinks and one for + * symlinks. They are identical except that the symlink layout includes + * ZPL_SYMLINK as its final attribute. + */ + salzap = zap_alloc(os, salobj); + assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK); + fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT, + zpl_attr_layout, nitems(zpl_attr_layout) - 1); + fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK, + zpl_attr_layout, nitems(zpl_attr_layout)); + zap_write(zfs, salzap); + + sazap = zap_alloc(os, saobj); + zap_add_uint64(sazap, SA_LAYOUTS, salobjid); + zap_add_uint64(sazap, SA_REGISTRY, sarobjid); + zap_write(zfs, sazap); + + /* Sanity check. */ + for (size_t i = 0; i < nitems(zpl_attrs); i++) + assert(i == zpl_attrs[i].id); + + /* + * Build the offset table used when setting file attributes. File + * attributes are stored in the object's bonus buffer; this table + * provides the buffer offset of attributes referenced by the layout + * table. 
+ */ + fs->sacnt = nitems(zpl_attrs); + fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs)); + for (size_t i = 0; i < fs->sacnt; i++) + fs->saoffs[i] = 0xffff; + offset = 0; + for (size_t i = 0; i < nitems(zpl_attr_layout); i++) { + uint16_t size; + + assert(zpl_attr_layout[i] < fs->sacnt); + + fs->saoffs[zpl_attr_layout[i]] = offset; + size = zpl_attrs[zpl_attr_layout[i]].size; + offset += size; + } + fs->satab = zpl_attrs; + + return (saobjid); +} + +static void +fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg) +{ + char *mountpoint, *origmountpoint, *name, *next; + fsnode *cur, *root; + uint64_t canmount; + + if (!dsl_dir_has_dataset(dsldir)) + return; + + mountpoint = dsl_dir_get_mountpoint(zfs, dsldir); + if (mountpoint == NULL) + return; + if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0) + return; + + /* + * If we were asked to specify a bootfs, set it here. + */ + if (zfs->bootfs != NULL && strcmp(zfs->bootfs, + dsl_dir_fullname(dsldir)) == 0) { + zap_add_uint64(zfs->poolprops, "bootfs", + dsl_dir_dataset_id(dsldir)); + } + + origmountpoint = mountpoint; + + /* + * Figure out which fsnode corresponds to our mountpoint. + */ + root = arg; + cur = root; + if (strcmp(mountpoint, zfs->rootpath) != 0) { + mountpoint += strlen(zfs->rootpath); + + /* + * Look up the directory in the staged tree. For example, if + * the dataset's mount point is /foo/bar/baz, we'll search the + * root directory for "foo", search "foo" for "baz", and so on. + * Each intermediate name must refer to a directory; the final + * component need not exist. 
+ */ + cur = root; + for (next = name = mountpoint; next != NULL;) { + for (; *next == '/'; next++) + ; + name = strsep(&next, "/"); + + for (; cur != NULL && strcmp(cur->name, name) != 0; + cur = cur->next) + ; + if (cur == NULL) { + if (next == NULL) + break; + errx(1, "missing mountpoint directory for `%s'", + dsl_dir_fullname(dsldir)); + } + if (cur->type != S_IFDIR) { + errx(1, + "mountpoint for `%s' is not a directory", + dsl_dir_fullname(dsldir)); + } + if (next != NULL) + cur = cur->child; + } + } + + if (cur != NULL) { + assert(cur->type == S_IFDIR); + + /* + * Multiple datasets shouldn't share a mountpoint. It's + * technically allowed, but it's not clear what makefs should do + * in that case. + */ + assert((cur->inode->flags & FI_ROOT) == 0); + if (cur != root) + cur->inode->flags |= FI_ROOT; + assert(cur->inode->param == NULL); + cur->inode->param = dsldir; + } + + free(origmountpoint); +} + +static int +fs_foreach_mark(fsnode *cur, void *arg) +{ + uint64_t *countp; + + countp = arg; + if (cur->type == S_IFDIR && fsnode_isroot(cur)) + return (1); + + if (cur->inode->ino == 0) { + cur->inode->ino = ++(*countp); + cur->inode->nlink = 1; + } else { + cur->inode->nlink++; + } + + return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1); +} + +/* + * Create a filesystem dataset. More specifically: + * - create an object set for the dataset, + * - add required metadata (SA tables, property definitions, etc.) to that + * object set, + * - optionally populate the object set with file objects, using "root" as the + * root directory. + * + * "dirfd" is a directory descriptor for the directory referenced by "root". It + * is closed before returning. 
+ */ +static void +fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd) +{ + struct fs_populate_arg arg; + zfs_fs_t fs; + zfs_zap_t *masterzap; + zfs_objset_t *os; + dnode_phys_t *deleteq, *masterobj; + uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid; + bool fakedroot; + + /* + * This dataset's mountpoint doesn't exist in the staging tree, or the + * dataset doesn't have a mountpoint at all. In either case we still + * need a root directory. Fake up a root fsnode to handle this case. + */ + fakedroot = root == NULL; + if (fakedroot) { + struct stat *stp; + + assert(dirfd == -1); + + root = ecalloc(1, sizeof(*root)); + root->inode = ecalloc(1, sizeof(*root->inode)); + root->name = estrdup("."); + root->type = S_IFDIR; + + stp = &root->inode->st; + stp->st_uid = 0; + stp->st_gid = 0; + stp->st_mode = S_IFDIR | 0755; + } + assert(root->type == S_IFDIR); + assert(fsnode_isroot(root)); + + /* + * Initialize the object set for this dataset. + */ + os = objset_alloc(zfs, DMU_OST_ZFS); + masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid); + assert(moid == MASTER_NODE_OBJ); + + memset(&fs, 0, sizeof(fs)); + fs.os = os; + + /* + * Create the ZAP SA layout now since filesystem object dnodes will + * refer to those attributes. + */ + saobjid = fs_set_zpl_attrs(zfs, &fs); + + /* + * Make a pass over the staged directory to detect hard links and assign + * virtual dnode numbers. + */ + dnodecount = 1; /* root directory */ + fsnode_foreach(root, fs_foreach_mark, &dnodecount); + + /* + * Make a second pass to populate the dataset with files from the + * staged directory. Most of our runtime is spent here. + */ + arg.dirfd = dirfd; + arg.zfs = zfs; + arg.fs = &fs; + SLIST_INIT(&arg.dirs); + fs_populate_dir(root, &arg); + assert(!SLIST_EMPTY(&arg.dirs)); + fsnode_foreach(root, fs_foreach_populate, &arg); + assert(SLIST_EMPTY(&arg.dirs)); + rootdirid = arg.rootdirid; + + /* + * Create an empty delete queue. 
We don't do anything with it, but + * OpenZFS will refuse to mount filesystems that don't have one. + */ + deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid); + zap_write(zfs, zap_alloc(os, deleteq)); + + /* + * Populate and write the master node object. This is a ZAP object + * containing various dataset properties and the object IDs of the root + * directory and delete queue. + */ + masterzap = zap_alloc(os, masterobj); + zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid); + zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid); + zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid); + zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */); + zap_add_uint64(masterzap, "normalization", 0 /* off */); + zap_add_uint64(masterzap, "utf8only", 0 /* off */); + zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */); + zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */); + zap_write(zfs, masterzap); + + /* + * All finished with this object set, we may as well write it now. + * The DSL layer will sum up the bytes consumed by each dataset using + * information stored in the object set, so it can't be freed just yet. + */ + dsl_dir_dataset_write(zfs, os, dsldir); + + if (fakedroot) { + free(root->inode); + free(root->name); + free(root); + } + free(fs.saoffs); +} + +/* + * Create an object set for each DSL directory which has a dataset and doesn't + * already have an object set. + */ +static void +fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused) +{ + if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir)) + fs_build_one(zfs, dsldir, NULL, -1); +} + +/* + * Create our datasets and populate them with files. + */ +void +fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root) +{ + /* + * Run through our datasets and find the root fsnode for each one. Each + * root fsnode is flagged so that we can figure out which dataset it + * belongs to. 
+ */ + dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root); + + /* + * Did we find our boot filesystem? + */ + if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs")) + errx(1, "no mounted dataset matches bootfs property `%s'", + zfs->bootfs); + + /* + * Traverse the file hierarchy starting from the root fsnode. One + * dataset, not necessarily the root dataset, must "own" the root + * directory by having its mountpoint be equal to the root path. + * + * As roots of other datasets are encountered during the traversal, + * fs_build_one() recursively creates the corresponding object sets and + * populates them. Once this function has returned, all datasets will + * have been fully populated. + */ + fs_build_one(zfs, root->inode->param, root, dirfd); + + /* + * Now create object sets for datasets whose mountpoints weren't found + * in the staging directory, either because there is no mountpoint, or + * because the mountpoint doesn't correspond to an existing directory. + */ + dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL); +} diff --git a/usr.sbin/makefs/zfs/objset.c b/usr.sbin/makefs/zfs/objset.c new file mode 100644 index 000000000000..fdb17167a607 --- /dev/null +++ b/usr.sbin/makefs/zfs/objset.c @@ -0,0 +1,259 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include + +#include "zfs.h" + +#define DNODES_PER_CHUNK (MAXBLOCKSIZE / sizeof(dnode_phys_t)) + +struct objset_dnode_chunk { + dnode_phys_t buf[DNODES_PER_CHUNK]; + unsigned int nextfree; + STAILQ_ENTRY(objset_dnode_chunk) next; +}; + +typedef struct zfs_objset { + /* Physical object set. */ + objset_phys_t *phys; + off_t osloc; + off_t osblksz; + blkptr_t osbp; /* set in objset_write() */ + + /* Accounting. */ + off_t space; /* bytes allocated to this objset */ + + /* dnode allocator. 
*/ + uint64_t dnodecount; + STAILQ_HEAD(, objset_dnode_chunk) dnodechunks; +} zfs_objset_t; + +static void +dnode_init(dnode_phys_t *dnode, uint8_t type, uint8_t bonustype, + uint16_t bonuslen) +{ + dnode->dn_indblkshift = MAXBLOCKSHIFT; + dnode->dn_type = type; + dnode->dn_bonustype = bonustype; + dnode->dn_bonuslen = bonuslen; + dnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4; + dnode->dn_nlevels = 1; + dnode->dn_nblkptr = 1; + dnode->dn_flags = DNODE_FLAG_USED_BYTES; +} + +zfs_objset_t * +objset_alloc(zfs_opt_t *zfs, uint64_t type) +{ + struct objset_dnode_chunk *chunk; + zfs_objset_t *os; + + os = ecalloc(1, sizeof(*os)); + os->osblksz = sizeof(objset_phys_t); + os->osloc = objset_space_alloc(zfs, os, &os->osblksz); + + /* + * Object ID zero is always reserved for the meta dnode, which is + * embedded in the objset itself. + */ + STAILQ_INIT(&os->dnodechunks); + chunk = ecalloc(1, sizeof(*chunk)); + chunk->nextfree = 1; + STAILQ_INSERT_HEAD(&os->dnodechunks, chunk, next); + os->dnodecount = 1; + + os->phys = ecalloc(1, os->osblksz); + os->phys->os_type = type; + + dnode_init(&os->phys->os_meta_dnode, DMU_OT_DNODE, DMU_OT_NONE, 0); + os->phys->os_meta_dnode.dn_datablkszsec = + DNODE_BLOCK_SIZE >> MINBLOCKSHIFT; + + return (os); +} + +/* + * Write the dnode array and physical object set to disk. + */ +static void +_objset_write(zfs_opt_t *zfs, zfs_objset_t *os, struct dnode_cursor *c, + off_t loc) +{ + struct objset_dnode_chunk *chunk, *tmp; + unsigned int total; + + /* + * Write out the dnode array, i.e., the meta-dnode. For some reason its + * data blocks must be 16KB in size no matter how large the array is. 
+ */ + total = 0; + STAILQ_FOREACH_SAFE(chunk, &os->dnodechunks, next, tmp) { + unsigned int i; + + assert(chunk->nextfree <= os->dnodecount); + assert(chunk->nextfree <= DNODES_PER_CHUNK); + + for (i = 0; i < chunk->nextfree; i += DNODES_PER_BLOCK) { + blkptr_t *bp; + uint64_t fill; + + if (chunk->nextfree - i < DNODES_PER_BLOCK) + fill = DNODES_PER_BLOCK - (chunk->nextfree - i); + else + fill = 0; + bp = dnode_cursor_next(zfs, c, + (total + i) * sizeof(dnode_phys_t)); + vdev_pwrite_dnode_indir(zfs, &os->phys->os_meta_dnode, + 0, fill, chunk->buf + i, DNODE_BLOCK_SIZE, loc, bp); + loc += DNODE_BLOCK_SIZE; + } + total += i; + + free(chunk); + } + dnode_cursor_finish(zfs, c); + STAILQ_INIT(&os->dnodechunks); + + /* + * Write the object set itself. The saved block pointer will be copied + * into the referencing DSL dataset or the uberblocks. + */ + vdev_pwrite_data(zfs, DMU_OT_OBJSET, ZIO_CHECKSUM_FLETCHER_4, 0, 1, + os->phys, os->osblksz, os->osloc, &os->osbp); +} + +void +objset_write(zfs_opt_t *zfs, zfs_objset_t *os) +{ + struct dnode_cursor *c; + off_t dnodeloc, dnodesz; + uint64_t dnodecount; + + /* + * There is a chicken-and-egg problem here when writing the MOS: we + * cannot write space maps before we're finished allocating space from + * the vdev, and we can't write the MOS without having allocated space + * for indirect dnode blocks. Thus, rather than lazily allocating + * indirect blocks for the meta-dnode (which would be simpler), they are + * allocated up-front and before writing space maps. + */ + dnodecount = os->dnodecount; + if (os == zfs->mos) + dnodecount += zfs->mscount; + dnodesz = dnodecount * sizeof(dnode_phys_t); + c = dnode_cursor_init(zfs, os, &os->phys->os_meta_dnode, dnodesz, + DNODE_BLOCK_SIZE); + dnodesz = roundup2(dnodesz, DNODE_BLOCK_SIZE); + dnodeloc = objset_space_alloc(zfs, os, &dnodesz); + + if (os == zfs->mos) { + vdev_spacemap_write(zfs); + + /* + * We've finished allocating space, account for it in $MOS. 
+ */ + dsl_dir_size_set(zfs->mosdsldir, os->space); + } + _objset_write(zfs, os, c, dnodeloc); +} + +dnode_phys_t * +objset_dnode_bonus_alloc(zfs_objset_t *os, uint8_t type, uint8_t bonustype, + uint16_t bonuslen, uint64_t *idp) +{ + struct objset_dnode_chunk *chunk; + dnode_phys_t *dnode; + + assert(bonuslen <= DN_OLD_MAX_BONUSLEN); + assert(!STAILQ_EMPTY(&os->dnodechunks)); + + chunk = STAILQ_LAST(&os->dnodechunks, objset_dnode_chunk, next); + if (chunk->nextfree == DNODES_PER_CHUNK) { + chunk = ecalloc(1, sizeof(*chunk)); + STAILQ_INSERT_TAIL(&os->dnodechunks, chunk, next); + } + *idp = os->dnodecount++; + dnode = &chunk->buf[chunk->nextfree++]; + dnode_init(dnode, type, bonustype, bonuslen); + dnode->dn_datablkszsec = os->osblksz >> MINBLOCKSHIFT; + return (dnode); +} + +dnode_phys_t * +objset_dnode_alloc(zfs_objset_t *os, uint8_t type, uint64_t *idp) +{ + return (objset_dnode_bonus_alloc(os, type, DMU_OT_NONE, 0, idp)); +} + +/* + * Look up a physical dnode by ID. This is not used often so a linear search is + * fine. 
+ */ +dnode_phys_t * +objset_dnode_lookup(zfs_objset_t *os, uint64_t id) +{ + struct objset_dnode_chunk *chunk; + + assert(id > 0); + assert(id < os->dnodecount); + + STAILQ_FOREACH(chunk, &os->dnodechunks, next) { + if (id < DNODES_PER_CHUNK) + return (&chunk->buf[id]); + id -= DNODES_PER_CHUNK; + } + assert(0); + return (NULL); +} + +off_t +objset_space_alloc(zfs_opt_t *zfs, zfs_objset_t *os, off_t *lenp) +{ + off_t loc; + + loc = vdev_space_alloc(zfs, lenp); + os->space += *lenp; + return (loc); +} + +uint64_t +objset_space(const zfs_objset_t *os) +{ + return (os->space); +} + +void +objset_root_blkptr_copy(const zfs_objset_t *os, blkptr_t *bp) +{ + memcpy(bp, &os->osbp, sizeof(blkptr_t)); +} diff --git a/usr.sbin/makefs/zfs/vdev.c b/usr.sbin/makefs/zfs/vdev.c new file mode 100644 index 000000000000..1709a828b7c5 --- /dev/null +++ b/usr.sbin/makefs/zfs/vdev.c @@ -0,0 +1,435 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Mark Johnston under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include + +#include "zfs.h" + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#include "zfs/fletcher.c" +#include "zfs/sha256.c" +#pragma clang diagnostic pop + +static void +blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level, + uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum) +{ + dva_t *dva; + + assert(powerof2(size)); + + BP_ZERO(bp); + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_CHECKSUM(bp, cksumt); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + BP_SET_BIRTH(bp, TXG, TXG); + BP_SET_LEVEL(bp, level); + BP_SET_FILL(bp, fill); + BP_SET_TYPE(bp, dntype); + + dva = BP_IDENTITY(bp); + DVA_SET_VDEV(dva, 0); + DVA_SET_OFFSET(dva, off); + DVA_SET_ASIZE(dva, size); + memcpy(&bp->blk_cksum, cksum, sizeof(*cksum)); +} + +/* + * Write a block of data to the vdev. The offset is always relative to the end + * of the second leading vdev label. + * + * Consumers should generally use the helpers below, which provide block + * pointers and update dnode accounting, rather than calling this function + * directly. 
+ */
+static void
+vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
+{
+	ssize_t n;
+
+	assert(off >= 0 && off < zfs->asize);
+	assert(powerof2(len));
+	/*
+	 * The write covers the half-open range [off, off + len), so it is
+	 * allowed to end exactly at asize.  Also reject a length that would
+	 * make the end offset wrap.
+	 */
+	assert((off_t)len > 0 && off + (off_t)len > off &&
+	    off + (off_t)len <= zfs->asize);
+	if (zfs->spacemap != NULL) {
+		/*
+		 * Verify that the blocks being written were in fact allocated.
+		 *
+		 * The space map isn't available once the on-disk space map is
+		 * finalized, so this check doesn't quite catch everything.
+		 */
+		assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
+		    (off + len - 1) >> zfs->ashift, 1));
+	}
+
+	/* Translate to an offset within the image file. */
+	off += VDEV_LABEL_START_SIZE;
+	for (size_t sofar = 0; sofar < len; sofar += n) {
+		n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
+		    off + sofar);
+		if (n < 0)
+			err(1, "pwrite");
+		/*
+		 * A zero-length write would never make progress; fail instead
+		 * of looping forever when assertions are compiled out.
+		 */
+		if (n == 0)
+			errx(1, "pwrite: unexpected zero-length write");
+	}
+}
+
+/*
+ * Checksum a block, initialize its block pointer and write it to the vdev.
+ *
+ * Only the fletcher4 checksum is supported.  "loc" is the block's location as
+ * passed on to vdev_pwrite() and "sz" is its size in bytes.
+ */
+void
+vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
+    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
+    blkptr_t *bp)
+{
+	zio_cksum_t cksum;
+
+	assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);
+
+	fletcher_4_native(data, sz, NULL, &cksum);
+	blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum);
+	vdev_pwrite(zfs, data, sz, loc);
+}
+
+/*
+ * Write a block belonging to a dnode, using the dnode's type and checksum
+ * settings, and charge the block's size to the dnode's dn_used counter.
+ *
+ * The dnode must have DNODE_FLAG_USED_BYTES set.
+ */
+void
+vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
+    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
+{
+	vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill,
+	    data, sz, loc, bp);
+
+	assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);
+	dnode->dn_used += sz;
+}
+
+/*
+ * Write a level-0 block with a fill count of 1 and record it in the dnode's
+ * first block pointer.
+ */
+void
+vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
+    off_t sz, off_t loc)
+{
+	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc,
+	    &dnode->dn_blkptr[0]);
+}
+
+/*
+ * Fill in the embedded checksum (zio_eck_t) stored in the last bytes of the
+ * buffer.  The checksum field is first seeded with "off", then SHA-256 is
+ * computed over the whole buffer and the result replaces the seed.
+ */
+static void
+vdev_label_set_checksum(void *buf, off_t off, off_t size)
+{
+	zio_cksum_t cksum;
+	zio_eck_t *eck;
+
+	assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));
+
+	eck = (zio_eck_t *)((char *)buf + size) - 1;
+	eck->zec_magic = ZEC_MAGIC;
+	ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0);
+	zio_checksum_SHA256(buf, size, NULL, &cksum);
+	eck->zec_cksum = cksum;
+}
+
+/*
+ * Set embedded checksums and write the label at the specified index.
+ *
+ * Labels 0 and 1 are placed at the start of the image and the remaining ones
+ * at its end.  The caller's label is not modified; the checksums are set in a
+ * private copy.
+ */
+void
+vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
+{
+	vdev_label_t *label;
+	ssize_t n;
+	off_t blksz, loff;
+
+	assert(ind >= 0 && ind < VDEV_LABELS);
+
+	/*
+	 * Make a copy since we have to modify the label to set checksums.
+	 */
+	label = ecalloc(1, sizeof(*label));
+	memcpy(label, labelp, sizeof(*label));
+
+	if (ind < 2)
+		loff = ind * sizeof(*label);
+	else
+		loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);
+
+	/*
+	 * Set the verifier checksum for the boot block.  We don't use it, but
+	 * the FreeBSD loader reads it and will complain if the checksum isn't
+	 * valid.
+	 */
+	vdev_label_set_checksum(&label->vl_be,
+	    loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));
+
+	/*
+	 * Set the verifier checksum for the label.
+	 */
+	vdev_label_set_checksum(&label->vl_vdev_phys,
+	    loff + __offsetof(vdev_label_t, vl_vdev_phys),
+	    sizeof(label->vl_vdev_phys));
+
+	/*
+	 * Set the verifier checksum for the uberblocks.  There is one uberblock
+	 * per sector; for example, with an ashift of 12 we end up with
+	 * 128KB/4KB=32 copies of the uberblock in the ring.
+	 *
+	 * NOTE(review): OpenZFS appears to size uberblock slots as at least
+	 * 1KB regardless of ashift; confirm the ashift == 9 case.
+	 */
+	blksz = (off_t)1 << zfs->ashift;
+	assert(sizeof(label->vl_uberblock) % blksz == 0);
+	for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
+	    roff += blksz) {
+		vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
+		    loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
+		    blksz);
+	}
+
+	/*
+	 * pwrite() may write less than requested, so keep going until the
+	 * whole label is on disk, as vdev_pwrite() does for data blocks.
+	 */
+	for (size_t sofar = 0; sofar < sizeof(*label); sofar += n) {
+		n = pwrite(zfs->fd, (const char *)label + sofar,
+		    sizeof(*label) - sofar, loff + sofar);
+		if (n < 0)
+			err(1, "writing vdev label");
+		if (n == 0)
+			errx(1, "writing vdev label: zero-length write");
+	}
+
+	free(label);
+}
+
+/*
+ * Find a chunk of contiguous free space of length *lenp, according to the
+ * following rules:
+ *   1.
If the length is less than or equal to 128KB, the returned run's length
+ *      will be the smallest power of 2 equal to or larger than the length.
+ *   2. If the length is larger than 128KB, the returned run's length will be
+ *      the smallest multiple of 128KB that is larger than the length.
+ *   3. The returned run's length will be size-aligned up to 128KB.
+ *
+ * XXX-MJ the third rule isn't actually required, so this can just be a dumb
+ * bump allocator.  Maybe there's some benefit to keeping large blocks aligned,
+ * so let's keep it for now and hope we don't get too much fragmentation.
+ * Alternately we could try to allocate all blocks of a certain size from the
+ * same metaslab.
+ */
+off_t
+vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
+{
+	off_t len;
+	int align, loc, minblksz, nbits;
+
+	/* Work in units of the vdev's minimum block size. */
+	minblksz = 1 << zfs->ashift;
+	len = roundup2(*lenp, minblksz);
+
+	assert(len != 0);
+	assert(len / minblksz <= INT_MAX);
+
+	if (len >= MAXBLOCKSIZE) {
+		/* Large request: whole multiples of the maximum block size. */
+		len = roundup2(len, MAXBLOCKSIZE);
+		align = MAXBLOCKSIZE / minblksz;
+	} else {
+		/* Small request: round up to a power of 2, size-aligned. */
+		if (!powerof2(len))
+			len = (off_t)1 << flsll(len);
+		align = len / minblksz;
+	}
+	nbits = len / minblksz;
+
+	/*
+	 * Search the bitmap for a clear run of nbits bits.  If the run found
+	 * is not suitably aligned, restart the search from the next aligned
+	 * position.
+	 */
+	loc = 0;
+	for (;;) {
+		bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
+		    &loc);
+		if (loc == -1) {
+			errx(1, "failed to find %ju bytes of space",
+			    (uintmax_t)len);
+		}
+		if ((loc & (align - 1)) == 0)
+			break;
+		loc = roundup2(loc, align);
+	}
+	assert(loc + nbits > loc);
+
+	/* Mark the run as allocated and report the final length. */
+	bit_nset(zfs->spacemap, loc, loc + nbits - 1);
+	*lenp = len;
+
+	return ((off_t)loc << zfs->ashift);
+}
+
+/*
+ * Allocate the in-memory allocation bitmap, with one bit per ashift-sized
+ * block of the space covered by whole metaslabs.
+ */
+static void
+vdev_spacemap_init(zfs_opt_t *zfs)
+{
+	uint64_t bits;
+
+	assert(powerof2(zfs->mssize));
+
+	bits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift;
+	if (bits > INT_MAX) {
+		/*
+		 * With the smallest block size of 512B, the limit on the image
+		 * size is 2TB.  That should be enough for anyone.
+		 */
+		errx(1, "image size is too large");
+	}
+	zfs->spacemapbits = (int)bits;
+
+	zfs->spacemap = bit_alloc(zfs->spacemapbits);
+	if (zfs->spacemap == NULL)
+		err(1, "bitstring allocation failed");
+}
+
+/*
+ * Convert the in-memory allocation bitmap into on-disk space maps, one per
+ * metaslab, and write them along with the space map object array.
+ *
+ * The in-memory bitmap is detached from "zfs" part-way through and freed at
+ * the end, so no space can be allocated once this function has run.
+ */
+void
+vdev_spacemap_write(zfs_opt_t *zfs)
+{
+	dnode_phys_t *objarr;
+	bitstr_t *bitmap;
+	uint64_t *objarrblk;
+	off_t smblksz, objarrblksz, objarrloc;
+
+	struct {
+		dnode_phys_t *dnode;
+		uint64_t dnid;
+		off_t loc;
+	} *sma, *ms;
+
+	/* The object array holds one 64-bit object ID per metaslab. */
+	objarrblksz = sizeof(uint64_t) * zfs->mscount;
+	assert(objarrblksz <= MAXBLOCKSIZE);
+	objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz);
+	objarrblk = ecalloc(1, objarrblksz);
+
+	objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid);
+	objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;
+
+	/*
+	 * Use the smallest block size for space maps.  The space allocation
+	 * algorithm should aim to minimize the number of holes.
+	 */
+	smblksz = 1 << zfs->ashift;
+
+	/*
+	 * First allocate dnodes and space for all of our space maps.  No more
+	 * space can be allocated from the vdev after this point.
+	 */
+	sma = ecalloc(zfs->mscount, sizeof(*sma));
+	for (uint64_t i = 0; i < zfs->mscount; i++) {
+		ms = &sma[i];
+		ms->dnode = objset_dnode_bonus_alloc(zfs->mos,
+		    DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER,
+		    sizeof(space_map_phys_t), &ms->dnid);
+		ms->loc = objset_space_alloc(zfs, zfs->mos, &smblksz);
+	}
+	bitmap = zfs->spacemap;
+	zfs->spacemap = NULL;
+
+	/*
+	 * Now that the set of allocated space is finalized, populate each space
+	 * map and write it to the vdev.
+	 */
+	for (uint64_t i = 0; i < zfs->mscount; i++) {
+		space_map_phys_t *sm;
+		uint64_t alloc, length, *smblk;
+		int shift, startb, endb, srunb, erunb;
+
+		ms = &sma[i];
+
+		/*
+		 * We only allocate a single block for this space map, but
+		 * OpenZFS assumes that a space map object with sufficient bonus
+		 * space supports histograms.
+		 */
+		ms->dnode->dn_nblkptr = 3;
+		ms->dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;
+
+		smblk = ecalloc(1, smblksz);
+
+		alloc = length = 0;
+
+		/* Bit range [startb, endb) of the bitmap is this metaslab. */
+		shift = zfs->msshift - zfs->ashift;
+		startb = i * (1 << shift);
+		endb = (i + 1) * (1 << shift);
+
+		srunb = startb;
+		while (srunb < endb) {
+			uint64_t runlen, runoff;
+
+			/* Find a run of allocated space. */
+			bit_ffs_at(bitmap, srunb, zfs->spacemapbits, &srunb);
+			if (srunb == -1 || srunb >= endb)
+				break;
+
+			bit_ffc_at(bitmap, srunb, zfs->spacemapbits, &erunb);
+			if (erunb == -1 || erunb > endb)
+				erunb = endb;
+
+			/*
+			 * The space represented by [srunb, erunb) has been
+			 * allocated.  Add a record to the space map to indicate
+			 * this.  Run offsets are relative to the beginning of
+			 * the metaslab.
+			 */
+			runlen = erunb - srunb;
+			runoff = srunb - startb;
+
+			/* Each record occupies two 64-bit words. */
+			assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
+			smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
+			    SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
+			smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
+			    SM2_OFFSET_ENCODE(runoff);
+
+			alloc += runlen << zfs->ashift;
+			length += 2;
+
+			/* Continue the search after this run. */
+			srunb = erunb;
+		}
+
+		sm = DN_BONUS(ms->dnode);
+		sm->smp_length = length * sizeof(uint64_t);
+		sm->smp_alloc = alloc;
+
+		vdev_pwrite_dnode_data(zfs, ms->dnode, smblk, smblksz,
+		    ms->loc);
+		free(smblk);
+
+		/* Record this space map in the space map object array. */
+		objarrblk[i] = ms->dnid;
+	}
+
+	/*
+	 * All of the space maps are written, now write the object array.
+	 */
+	vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
+	free(objarrblk);
+
+	assert(zfs->spacemap == NULL);
+	free(bitmap);
+	free(sma);
+}
+
+/*
+ * Create (or truncate) the image file, extend it to the vdev size and set up
+ * the in-memory allocation bitmap.
+ */
+void
+vdev_init(zfs_opt_t *zfs, const char *image)
+{
+	int fd;
+
+	assert(zfs->ashift >= MINBLOCKSHIFT);
+
+	fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
+	zfs->fd = fd;
+	if (fd == -1)
+		err(1, "Can't open `%s' for writing", image);
+	if (ftruncate(fd, zfs->vdevsize) != 0)
+		err(1, "Failed to extend image file `%s'", image);
+
+	vdev_spacemap_init(zfs);
+}
+
+/*
+ * Close the image file.  The allocation bitmap must already have been
+ * released by vdev_spacemap_write().
+ */
+void
+vdev_fini(zfs_opt_t *zfs)
+{
+	assert(zfs->spacemap == NULL);
+
+	if (zfs->fd == -1)
+		return;
+	if (close(zfs->fd) != 0)
+		err(1, "close");
+	zfs->fd = -1;
+}
diff --git a/usr.sbin/makefs/zfs/zap.c b/usr.sbin/makefs/zfs/zap.c
new file mode 100644
index 000000000000..398c0fbf029c
--- /dev/null
+++ b/usr.sbin/makefs/zfs/zap.c
@@ -0,0 +1,551 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "makefs.h"
+#include "zfs.h"
+
+/*
+ * One key-value pair of an in-memory ZAP.  The value is an array of "intcnt"
+ * integers, each "intsz" bytes wide.
+ */
+typedef struct zfs_zap_entry {
+	char		*name;		/* entry key, private copy */
+	uint64_t	hash;		/* key hash */
+	union {
+		uint8_t	 *valp;
+		uint16_t *val16p;
+		uint32_t *val32p;
+		uint64_t *val64p;
+	};				/* entry value, an integer array */
+	uint64_t	val64;		/* embedded value for a common case */
+	size_t		intsz;		/* array element size; 1, 2, 4 or 8 */
+	size_t		intcnt;		/* array size */
+	STAILQ_ENTRY(zfs_zap_entry) next;
+} zfs_zap_entry_t;
+
+/*
+ * An in-memory ZAP: a list of entries plus the accounting needed to decide
+ * between the micro and fat on-disk formats when it is written out.
+ */
+struct zfs_zap {
+	STAILQ_HEAD(, zfs_zap_entry) kvps;
+	uint64_t	hashsalt;	/* key hash input */
+	unsigned long	kvpcnt;		/* number of key-value pairs */
+	unsigned long	chunks;		/* count of chunks needed for fat ZAP */
+	bool		micro;		/* can this be a micro ZAP? */
+
+	dnode_phys_t	*dnode;		/* backpointer */
+	zfs_objset_t	*os;		/* backpointer */
+};
+
+/*
+ * Number of fat ZAP leaf chunks needed by an entry: one for the entry itself
+ * plus enough array chunks to hold the NUL-terminated name and the value
+ * bytes.
+ */
+static uint16_t
+zap_entry_chunks(zfs_zap_entry_t *ent)
+{
+	return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) +
+	    howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES));
+}
+
+/*
+ * Compute the hash of a key: a table-driven CRC-64 seeded with the ZAP's salt,
+ * truncated to its ZAP_HASHBITS most significant bits.  The salt must be
+ * non-zero.
+ */
+static uint64_t
+zap_hash(uint64_t salt, const char *name)
+{
+	static uint64_t crc64_table[256];
+	const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
+	const uint8_t *cp;
+	uint64_t crc;
+	uint8_t c;
+
+	assert(salt != 0);
+	/* Build the lookup table on first use. */
+	if (crc64_table[128] == 0) {
+		for (int i = 0; i < 256; i++) {
+			uint64_t *t;
+
+			t = crc64_table + i;
+			*t = i;
+			for (int j = 8; j > 0; j--)
+				*t = (*t >> 1) ^ (-(*t & 1) & crc64_poly);
+		}
+	}
+	assert(crc64_table[128] == crc64_poly);
+
+	for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++)
+		crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF];
+
+	/*
+	 * Only use 28 bits, since we need 4 bits in the cookie for the
+	 * collision differentiator.  We MUST use the high bits, since
+	 * those are the ones that we first pay attention to when
+	 * choosing the bucket.
+	 */
+	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+	return (crc);
+}
+
+/*
+ * Allocate an empty in-memory ZAP backed by the given dnode and object set.
+ * It is released by zap_write().
+ */
+zfs_zap_t *
+zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode)
+{
+	zfs_zap_t *zap;
+
+	zap = ecalloc(1, sizeof(*zap));
+	STAILQ_INIT(&zap->kvps);
+
+	/*
+	 * zap_hash() requires a non-zero salt, so retry in the unlikely case
+	 * that both draws are zero.  The two halves are drawn in separate
+	 * statements so that their order does not depend on the compiler's
+	 * choice of operand evaluation order.
+	 */
+	do {
+		uint64_t hi, lo;
+
+		hi = (uint64_t)random();
+		lo = (uint64_t)random();
+		zap->hashsalt = (hi << 32) | lo;
+	} while (zap->hashsalt == 0);
+
+	zap->micro = true;
+	zap->kvpcnt = 0;
+	zap->chunks = 0;
+	zap->dnode = dnode;
+	zap->os = os;
+	return (zap);
+}
+
+/*
+ * Add a key-value pair to the ZAP.  The value is an array of "intcnt" integers
+ * of "intsz" bytes each, in host byte order; both the name and the value are
+ * copied.  No check for duplicate names is made here.
+ */
+void
+zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
+    const uint8_t *val)
+{
+	zfs_zap_entry_t *ent;
+
+	assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
+	assert(strlen(name) + 1 <= ZAP_MAXNAMELEN);
+	assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN);
+
+	ent = ecalloc(1, sizeof(*ent));
+	ent->name = estrdup(name);
+	ent->hash = zap_hash(zap->hashsalt, ent->name);
+	ent->intsz = intsz;
+	ent->intcnt = intcnt;
+	if (intsz == sizeof(uint64_t) && intcnt == 1) {
+		/*
+		 * Micro-optimization to elide a memory allocation in that most
+		 * common case where this is a directory entry.
+		 */
+		ent->val64p = &ent->val64;
+	} else {
+		ent->valp = ecalloc(intcnt, intsz);
+	}
+	memcpy(ent->valp, val, intcnt * intsz);
+	zap->kvpcnt++;
+	zap->chunks += zap_entry_chunks(ent);
+	STAILQ_INSERT_TAIL(&zap->kvps, ent, next);
+
+	/*
+	 * A micro ZAP can only hold single 64-bit values with short names, and
+	 * only a limited number of them.
+	 */
+	if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) ||
+	    strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX))
+		zap->micro = false;
+}
+
+/*
+ * Add an entry whose value is a single 64-bit integer.
+ */
+void
+zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
+{
+	zap_add(zap, name, sizeof(uint64_t), 1, (const uint8_t *)&val);
+}
+
+/*
+ * Add an entry whose value is a NUL-terminated string, stored as an array of
+ * 1-byte integers including the terminator.
+ */
+void
+zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
+{
+	zap_add(zap, name, 1, strlen(val) + 1, (const uint8_t *)val);
+}
+
+/*
+ * Return true if the ZAP already contains an entry with the given name.
+ */
+bool
+zap_entry_exists(zfs_zap_t *zap, const char *name)
+{
+	zfs_zap_entry_t *ent;
+
+	STAILQ_FOREACH(ent, &zap->kvps, next) {
+		if (strcmp(ent->name, name) == 0)
+			return (true);
+	}
+	return (false);
+}
+
+/*
+ * Write the ZAP in the micro format: a single block holding a header followed
+ * by one fixed-size slot per entry.  The block is built in zfs->filebuf.
+ */
+static void
+zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+	dnode_phys_t *dnode;
+	zfs_zap_entry_t *ent;
+	mzap_phys_t *mzap;
+	mzap_ent_phys_t *ment;
+	off_t bytes, loc;
+
+	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
+	mzap = (mzap_phys_t *)&zfs->filebuf[0];
+	mzap->mz_block_type = ZBT_MICRO;
+	mzap->mz_salt = zap->hashsalt;
+	mzap->mz_normflags = 0;
+
+	bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment);
+	assert(bytes <= (off_t)MZAP_MAX_BLKSZ);
+
+	ment = &mzap->mz_chunk[0];
+	STAILQ_FOREACH(ent, &zap->kvps, next) {
+		memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt);
+		ment->mze_cd = 0; /* XXX-MJ */
+		strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name));
+		ment++;
+	}
+
+	/* The allocator may round "bytes" up; write the rounded size. */
+	loc = objset_space_alloc(zfs, zap->os, &bytes);
+
+	dnode = zap->dnode;
+	dnode->dn_maxblkid = 0;
+	dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;
+	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+
+	vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
+}
+
+/*
+ * Write some data to the fat ZAP leaf chunk starting at index "li".
+ *
+ * Note that individual integers in the value may be split among consecutive
+ * leaf chunks.
+ *
+ * The data is stored in a chain of consecutive array chunks, each linked to
+ * the next one; the last chunk written terminates the chain.  Nothing is
+ * written when "sz" is zero.
+ */
+static void
+zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
+    const uint8_t *val)
+{
+	struct zap_leaf_array *la;
+
+	assert(sz <= ZAP_MAXVALUELEN);
+
+	/* Stays NULL if there is nothing to write. */
+	la = NULL;
+	for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
+		n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);
+
+		la = &ZAP_LEAF_CHUNK(l, li).l_array;
+		assert(la->la_type == ZAP_CHUNK_FREE);
+		la->la_type = ZAP_CHUNK_ARRAY;
+		memcpy(la->la_array, val, n);
+		la->la_next = li + 1;
+	}
+
+	/* Terminate the chain at the last chunk that was written, if any. */
+	if (la != NULL)
+		la->la_next = 0xffff;
+}
+
+/*
+ * Find the shortest hash prefix length which lets us distribute keys without
+ * overflowing a leaf block.  This is not (space) optimal, but is simple, and
+ * directories large enough to overflow a single 128KB leaf block are uncommon.
+ *
+ * Returns 0 if everything fits in one leaf block.  Otherwise candidate prefix
+ * lengths are tried in increasing order, counting for each one the chunks
+ * that would land in every leaf.
+ */
+static unsigned int
+zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
+{
+	zfs_zap_entry_t *ent;
+	unsigned int prefixlen;
+
+	if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) {
+		/*
+		 * All chunks will fit in a single leaf block.
+		 */
+		return (0);
+	}
+
+	for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) {
+		uint32_t *leafchunks;
+
+		/* Per-leaf chunk usage for this candidate prefix length. */
+		leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks));
+		STAILQ_FOREACH(ent, &zap->kvps, next) {
+			uint64_t li;
+			uint16_t chunks;
+
+			li = ZAP_HASH_IDX(ent->hash, prefixlen);
+
+			chunks = zap_entry_chunks(ent);
+			if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) {
+				/*
+				 * Not enough space, grow the prefix and retry.
+				 */
+				break;
+			}
+			leafchunks[li] += chunks;
+		}
+		free(leafchunks);
+
+		/* "ent" is NULL only if the walk above ran to completion. */
+		if (ent == NULL) {
+			/*
+			 * Everything fits, we're done.
+			 */
+			break;
+		}
+	}
+
+	/*
+	 * If this fails, then we need to expand the pointer table.  For now
+	 * this situation is unhandled since it is hard to trigger.
+	 */
+	assert(prefixlen < (unsigned int)l->l_bs);
+
+	return (prefixlen);
+}
+
+/*
+ * Initialize a fat ZAP leaf block.
+ */
+static void
+zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
+{
+	zap_leaf_phys_t *leaf;
+
+	leaf = l->l_phys;
+
+	leaf->l_hdr.lh_block_type = ZBT_LEAF;
+	leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+	leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+	leaf->l_hdr.lh_prefix = prefix;
+	leaf->l_hdr.lh_prefix_len = prefixlen;
+
+	/* Initialize the leaf hash table. */
+	assert(leaf->l_hdr.lh_nfree < 0xffff);
+	memset(leaf->l_hash, 0xff,
+	    ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash));
+
+	/* Initialize the leaf chunks. */
+	for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+		struct zap_leaf_free *lf;
+
+		lf = &ZAP_LEAF_CHUNK(l, i).l_free;
+		lf->lf_type = ZAP_CHUNK_FREE;
+		if (i + 1 == ZAP_LEAF_NUMCHUNKS(l))
+			lf->lf_next = 0xffff;
+		else
+			lf->lf_next = i + 1;
+	}
+}
+
+/*
+ * Write the ZAP in the fat format: a header block with an embedded pointer
+ * table, built in zfs->filebuf, followed by one or more leaf blocks.
+ *
+ * Entry values are converted to big-endian in place, so the entries must not
+ * be reused afterwards.
+ */
+static void
+zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+	struct dnode_cursor *c;
+	zap_leaf_t l;
+	zap_phys_t *zaphdr;
+	struct zap_table_phys *zt;
+	zfs_zap_entry_t *ent;
+	dnode_phys_t *dnode;
+	uint8_t *leafblks;
+	uint64_t lblkcnt, *ptrhasht;
+	off_t loc, blksz;
+	size_t blkshift;
+	unsigned int prefixlen;
+	int ptrcnt;
+
+	/*
+	 * For simplicity, always use the largest block size.  This should be ok
+	 * since most directories will be micro ZAPs, but it's space inefficient
+	 * for small ZAPs and might need to be revisited.
+	 */
+	blkshift = MAXBLOCKSHIFT;
+	blksz = (off_t)1 << blkshift;
+
+	/*
+	 * Embedded pointer tables give up to 8192 entries.  This ought to be
+	 * enough for anything except massive directories.
+	 */
+	ptrcnt = (blksz / 2) / sizeof(uint64_t);
+
+	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
+	zaphdr = (zap_phys_t *)&zfs->filebuf[0];
+	zaphdr->zap_block_type = ZBT_HEADER;
+	zaphdr->zap_magic = ZAP_MAGIC;
+	zaphdr->zap_num_entries = zap->kvpcnt;
+	zaphdr->zap_salt = zap->hashsalt;
+
+	l.l_bs = blkshift;
+	l.l_phys = NULL;
+
+	zt = &zaphdr->zap_ptrtbl;
+	zt->zt_blk = 0;
+	zt->zt_numblks = 0;
+	zt->zt_shift = flsll(ptrcnt) - 1;
+	zt->zt_nextblk = 0;
+	zt->zt_blks_copied = 0;
+
+	/*
+	 * How many leaf blocks do we need?  Initialize them and update the
+	 * header.
+	 */
+	prefixlen = zap_fat_write_prefixlen(zap, &l);
+	lblkcnt = (uint64_t)1 << prefixlen;
+	leafblks = ecalloc(lblkcnt, blksz);
+	for (unsigned int li = 0; li < lblkcnt; li++) {
+		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
+		zap_fat_write_leaf_init(&l, li, prefixlen);
+	}
+	zaphdr->zap_num_leafs = lblkcnt;
+	zaphdr->zap_freeblk = lblkcnt + 1;
+
+	/*
+	 * For each entry, figure out which leaf block it belongs to based on
+	 * the upper bits of its hash, allocate chunks from that leaf, and fill
+	 * them out.
+	 */
+	ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
+	STAILQ_FOREACH(ent, &zap->kvps, next) {
+		struct zap_leaf_entry *le;
+		uint16_t *lptr;
+		uint64_t hi, li;
+		uint16_t namelen, nchunks, nnamechunks, nvalchunks;
+
+		/* Leaf block IDs start at 1; block 0 is the header. */
+		hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
+		li = ZAP_HASH_IDX(ent->hash, prefixlen);
+		assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
+		ptrhasht[hi] = li + 1;
+		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
+
+		namelen = strlen(ent->name) + 1;
+
+		/*
+		 * How many leaf chunks do we need for this entry?
+		 *
+		 * The value is stored as a packed byte string, so count its
+		 * chunks by bytes.  This must agree with zap_entry_chunks(),
+		 * which was used to size the leaves, and with the number of
+		 * chunks zap_fat_write_array_chunk() actually fills in.
+		 */
+		nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
+		nvalchunks = howmany(ent->intsz * ent->intcnt,
+		    ZAP_LEAF_ARRAY_BYTES);
+		nchunks = 1 + nnamechunks + nvalchunks;
+		assert(nchunks == zap_entry_chunks(ent));
+
+		/*
+		 * Allocate a run of free leaf chunks for this entry,
+		 * potentially extending a hash chain.
+		 */
+		assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
+		l.l_phys->l_hdr.lh_nfree -= nchunks;
+		l.l_phys->l_hdr.lh_nentries++;
+		lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
+		while (*lptr != 0xffff) {
+			assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
+			le = ZAP_LEAF_ENTRY(&l, *lptr);
+			assert(le->le_type == ZAP_CHUNK_ENTRY);
+			le->le_cd++;
+			lptr = &le->le_next;
+		}
+		*lptr = l.l_phys->l_hdr.lh_freelist;
+		l.l_phys->l_hdr.lh_freelist += nchunks;
+		assert(l.l_phys->l_hdr.lh_freelist <=
+		    ZAP_LEAF_NUMCHUNKS(&l));
+		if (l.l_phys->l_hdr.lh_freelist ==
+		    ZAP_LEAF_NUMCHUNKS(&l))
+			l.l_phys->l_hdr.lh_freelist = 0xffff;
+
+		/*
+		 * Integer values must be stored in big-endian format.
+		 */
+		switch (ent->intsz) {
+		case 1:
+			break;
+		case 2:
+			for (uint16_t *v = ent->val16p;
+			    v - ent->val16p < (ptrdiff_t)ent->intcnt;
+			    v++)
+				*v = htobe16(*v);
+			break;
+		case 4:
+			for (uint32_t *v = ent->val32p;
+			    v - ent->val32p < (ptrdiff_t)ent->intcnt;
+			    v++)
+				*v = htobe32(*v);
+			break;
+		case 8:
+			for (uint64_t *v = ent->val64p;
+			    v - ent->val64p < (ptrdiff_t)ent->intcnt;
+			    v++)
+				*v = htobe64(*v);
+			break;
+		default:
+			assert(0);
+		}
+
+		/*
+		 * Finally, write out the leaf chunks for this entry.
+		 */
+		le = ZAP_LEAF_ENTRY(&l, *lptr);
+		assert(le->le_type == ZAP_CHUNK_FREE);
+		le->le_type = ZAP_CHUNK_ENTRY;
+		le->le_next = 0xffff;
+		le->le_name_chunk = *lptr + 1;
+		le->le_name_numints = namelen;
+		le->le_value_chunk = *lptr + 1 + nnamechunks;
+		le->le_value_intlen = ent->intsz;
+		le->le_value_numints = ent->intcnt;
+		le->le_hash = ent->hash;
+		zap_fat_write_array_chunk(&l, *lptr + 1, namelen,
+		    (const uint8_t *)ent->name);
+		zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
+		    ent->intcnt * ent->intsz, ent->valp);
+	}
+
+	/*
+	 * Initialize unused slots of the pointer table.
+	 */
+	for (int i = 0; i < ptrcnt; i++)
+		if (ptrhasht[i] == 0)
+			ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;
+
+	/*
+	 * Write the whole thing to disk.
+	 *
+	 * NOTE(review): the object holds lblkcnt + 1 blocks (IDs 0 through
+	 * lblkcnt) while dn_maxblkid is set to lblkcnt + 1; confirm that this
+	 * is intended.
+	 */
+	dnode = zap->dnode;
+	dnode->dn_nblkptr = 1;
+	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
+	dnode->dn_maxblkid = lblkcnt + 1;
+	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+
+	c = dnode_cursor_init(zfs, zap->os, zap->dnode,
+	    (lblkcnt + 1) * blksz, blksz);
+
+	/* The header block comes first, followed by the leaf blocks. */
+	loc = objset_space_alloc(zfs, zap->os, &blksz);
+	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
+	    dnode_cursor_next(zfs, c, 0));
+
+	for (uint64_t i = 0; i < lblkcnt; i++) {
+		loc = objset_space_alloc(zfs, zap->os, &blksz);
+		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
+		    blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
+	}
+
+	dnode_cursor_finish(zfs, c);
+
+	free(leafblks);
+}
+
+/*
+ * Write the ZAP to the vdev, in micro format if all of its entries allow it
+ * and in fat format otherwise, then free the ZAP and all of its entries.
+ */
+void
+zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+	zfs_zap_entry_t *ent;
+
+	if (zap->micro) {
+		zap_micro_write(zfs, zap);
+	} else {
+		assert(!STAILQ_EMPTY(&zap->kvps));
+		assert(zap->kvpcnt > 0);
+		zap_fat_write(zfs, zap);
+	}
+
+	while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) {
+		STAILQ_REMOVE_HEAD(&zap->kvps, next);
+		/* Values embedded in the entry itself were not allocated. */
+		if (ent->val64p != &ent->val64)
+			free(ent->valp);
+		free(ent->name);
+		free(ent);
+	}
+	free(zap);
+}
diff --git a/usr.sbin/makefs/zfs/zfs.h b/usr.sbin/makefs/zfs/zfs.h
new file mode 100644
index 000000000000..b92e2c035669
--- /dev/null
+++ b/usr.sbin/makefs/zfs/zfs.h
@@ -0,0 +1,167 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _MAKEFS_ZFS_H_
+#define _MAKEFS_ZFS_H_
+
+#include
+#include
+
+#include
+#include
+
+#include "makefs.h"
+
+#include "zfs/nvlist.h"
+#define ASSERT assert	/* presumably consumed by zfsimpl.h; verify */
+#include "zfs/zfsimpl.h"
+
+/*
+ * Largest and smallest block sizes used by makefs, checked at compile time
+ * against the corresponding SPA constants.
+ */
+#define MAXBLOCKSHIFT	17	/* 128KB */
+#define MAXBLOCKSIZE	((off_t)(1 << MAXBLOCKSHIFT))
+_Static_assert(MAXBLOCKSIZE == SPA_OLDMAXBLOCKSIZE, "");
+#define MINBLOCKSHIFT	9	/* 512B */
+#define MINBLOCKSIZE	((off_t)(1 << MINBLOCKSHIFT))
+_Static_assert(MINBLOCKSIZE == SPA_MINBLOCKSIZE, "");
+#define MINDEVSIZE	((off_t)SPA_MINDEVSIZE)
+
+/* All data was written in this transaction group. */
+#define TXG	4
+
+/* Opaque types; each is defined in the source file that implements it. */
+typedef struct zfs_dsl_dataset zfs_dsl_dataset_t;
+typedef struct zfs_dsl_dir zfs_dsl_dir_t;
+typedef struct zfs_objset zfs_objset_t;
+typedef struct zfs_zap zfs_zap_t;
+
+/* One entry of the zfs_opt_t "datasetdescs" list. */
+struct dataset_desc {
+	char		*params;
+	STAILQ_ENTRY(dataset_desc) next;
+};
+
+/*
+ * State for the makefs ZFS backend: the pool parameters together with the
+ * pool, MOS, DSL and vdev state used while the image is built.
+ */
+typedef struct {
+	bool		nowarn;
+
+	/* I/O buffer, just for convenience. */
+	char		filebuf[MAXBLOCKSIZE];
+
+	/* Pool parameters. */
+	const char	*poolname;
+	char		*rootpath;	/* implicit mount point prefix */
+	char		*bootfs;	/* bootable dataset, pool property */
+	int		ashift;		/* vdev block size */
+	uint64_t	mssize;		/* metaslab size */
+	STAILQ_HEAD(, dataset_desc) datasetdescs; /* non-root dataset descrs  */
+
+	/* Pool state. */
+	uint64_t	poolguid;	/* pool and root vdev GUID */
+	zfs_zap_t	*poolprops;
+
+	/* MOS state. */
+	zfs_objset_t	*mos;		/* meta object set */
+	uint64_t	objarrid;	/* space map object array */
+
+	/* DSL state. */
+	zfs_dsl_dir_t	*rootdsldir;	/* root DSL directory */
+	zfs_dsl_dataset_t *rootds;
+	zfs_dsl_dir_t	*origindsldir;	/* $ORIGIN */
+	zfs_dsl_dataset_t *originds;
+	zfs_dsl_dataset_t *snapds;
+	zfs_zap_t	*cloneszap;
+	zfs_dsl_dir_t	*freedsldir;	/* $FREE */
+	zfs_dsl_dir_t	*mosdsldir;	/* $MOS */
+
+	/* vdev state. */
+	int		fd;		/* vdev disk fd */
+	uint64_t	vdevguid;	/* disk vdev GUID */
+	off_t		vdevsize;	/* vdev size, including labels */
+	off_t		asize;		/* vdev size, excluding labels */
+	bitstr_t	*spacemap;	/* space allocation tracking */
+	int		spacemapbits;	/* one bit per ashift-sized block */
+	uint64_t	msshift;	/* log2(metaslab size) */
+	uint64_t	mscount;	/* number of metaslabs for this vdev */
+} zfs_opt_t;
+
+/* dsl.c */
+void dsl_init(zfs_opt_t *);
+const char *dsl_dir_fullname(const zfs_dsl_dir_t *);
+uint64_t dsl_dir_id(zfs_dsl_dir_t *);
+uint64_t dsl_dir_dataset_id(zfs_dsl_dir_t *);
+void dsl_dir_foreach(zfs_opt_t *, zfs_dsl_dir_t *,
+    void (*)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *);
+int dsl_dir_get_canmount(zfs_dsl_dir_t *, uint64_t *);
+char *dsl_dir_get_mountpoint(zfs_opt_t *, zfs_dsl_dir_t *);
+bool dsl_dir_has_dataset(zfs_dsl_dir_t *);
+bool dsl_dir_dataset_has_objset(zfs_dsl_dir_t *);
+void dsl_dir_dataset_write(zfs_opt_t *, zfs_objset_t *, zfs_dsl_dir_t *);
+void dsl_dir_size_set(zfs_dsl_dir_t *, uint64_t);
+void dsl_write(zfs_opt_t *);
+
+/* fs.c */
+void fs_build(zfs_opt_t *, int, fsnode *);
+
+/* objset.c */
+zfs_objset_t *objset_alloc(zfs_opt_t *zfs, uint64_t type);
+off_t objset_space_alloc(zfs_opt_t *, zfs_objset_t *, off_t *);
+dnode_phys_t *objset_dnode_alloc(zfs_objset_t *, uint8_t, uint64_t *);
+dnode_phys_t *objset_dnode_bonus_alloc(zfs_objset_t *, uint8_t, uint8_t,
+    uint16_t, uint64_t *);
+dnode_phys_t *objset_dnode_lookup(zfs_objset_t *, uint64_t);
+void objset_root_blkptr_copy(const zfs_objset_t *, blkptr_t *);
+uint64_t objset_space(const zfs_objset_t *);
+void objset_write(zfs_opt_t *zfs, zfs_objset_t *os);
+
+/* vdev.c */
+void vdev_init(zfs_opt_t *, const char *);
+off_t vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp);
+void vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
+    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
+    blkptr_t *bp);
+void vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
+    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp);
+void vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
+    off_t sz, off_t loc);
+void vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp);
+void vdev_spacemap_write(zfs_opt_t *);
+void vdev_fini(zfs_opt_t *zfs);
+
+/* zap.c */
+zfs_zap_t *zap_alloc(zfs_objset_t *, dnode_phys_t *);
+void zap_add(zfs_zap_t *, const char *, size_t, size_t, const uint8_t *);
+void zap_add_uint64(zfs_zap_t *, const char *, uint64_t);
+void zap_add_string(zfs_zap_t *, const char *, const char *);
+bool zap_entry_exists(zfs_zap_t *, const char *);
+void zap_write(zfs_opt_t *, zfs_zap_t *);
+
+/* zfs.c */
+struct dnode_cursor *dnode_cursor_init(zfs_opt_t *, zfs_objset_t *,
+    dnode_phys_t *, off_t, off_t);
+blkptr_t *dnode_cursor_next(zfs_opt_t *, struct dnode_cursor *, off_t);
+void dnode_cursor_finish(zfs_opt_t *, struct dnode_cursor *);
+
+#endif /* !_MAKEFS_ZFS_H_ */