diff --git a/usr.sbin/makefs/Makefile b/usr.sbin/makefs/Makefile
index 3fea648f9383..fe472d7e7309 100644
--- a/usr.sbin/makefs/Makefile
+++ b/usr.sbin/makefs/Makefile
@@ -1,42 +1,53 @@
 #	$FreeBSD$
 
 SRCDIR:=${.PARSEDIR:tA}
 
 .include <src.opts.mk>
 
 PROG=	makefs
 
 CFLAGS+=-I${SRCDIR}
 
 SRCS=	cd9660.c \
 	ffs.c \
 	makefs.c \
 	msdos.c \
 	mtree.c \
 	walk.c
 MAN=	makefs.8
 
 NO_WCAST_ALIGN=
 CSTD=	c11
 
 # ZFS support is optional: zfs.c, its include paths and -DHAVE_ZFS are only
 # added when MK_ZFS is not "no".
+.if ${MK_ZFS} != "no"
+SRCS+=	zfs.c
+CFLAGS+=-I${SRCDIR}/zfs \
+	-I${SRCTOP}/stand/libsa \
+	-I${SRCTOP}/sys/cddl/boot
+
+CFLAGS+=	-DHAVE_ZFS
+
+.include "${SRCDIR}/zfs/Makefile.inc"
+.endif
+
 .include "${SRCDIR}/cd9660/Makefile.inc"
 .include "${SRCDIR}/ffs/Makefile.inc"
 .include "${SRCDIR}/msdos/Makefile.inc"
 
 CFLAGS+=-DHAVE_STRUCT_STAT_ST_FLAGS=1
 
 # getid.c, misc.c and spec.c are located through .PATH in contrib/mtree.
 .PATH: ${SRCTOP}/contrib/mtree
 CFLAGS+=-I${SRCTOP}/contrib/mtree
 SRCS+=	getid.c misc.c spec.c
 
 # pack_dev.c is located through .PATH in contrib/mknod.
 .PATH: ${SRCTOP}/contrib/mknod
 CFLAGS+=-I${SRCTOP}/contrib/mknod
 SRCS+=	pack_dev.c
 
 CFLAGS+=	-I${SRCTOP}/lib/libnetbsd
 LIBADD=		netbsd util sbuf
 
 HAS_TESTS=
 SUBDIR.${MK_TESTS}+= tests
 
 .include <bsd.prog.mk>
diff --git a/usr.sbin/makefs/makefs.8 b/usr.sbin/makefs/makefs.8
index fdf8d532b69f..464583eab3a1 100644
--- a/usr.sbin/makefs/makefs.8
+++ b/usr.sbin/makefs/makefs.8
@@ -1,521 +1,612 @@
 .\"	$NetBSD: makefs.8,v 1.33 2011/05/22 21:51:39 christos Exp $
 .\"
 .\" Copyright (c) 2001-2003 Wasabi Systems, Inc.
 .\" All rights reserved.
 .\"
 .\" Written by Luke Mewburn for Wasabi Systems, Inc.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. All advertising materials mentioning features or use of this software
 .\"    must display the following acknowledgement:
 .\"      This product includes software developed for the NetBSD Project by
 .\"      Wasabi Systems, Inc.
 .\" 4. The name of Wasabi Systems, Inc. may not be used to endorse
 .\"    or promote products derived from this software without specific prior
 .\"    written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 .\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 .\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 .\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd September 17, 2020
+.Dd August 5, 2022
 .Dt MAKEFS 8
 .Os
 .Sh NAME
 .Nm makefs
 .Nd create a file system image from a directory tree or an mtree manifest
 .Sh SYNOPSIS
 .Nm
 .Op Fl DxZ
 .Op Fl B Ar endian
 .Op Fl b Ar free-blocks
 .Op Fl d Ar debug-mask
 .Op Fl F Ar mtree-specfile
 .Op Fl f Ar free-files
 .Op Fl M Ar minimum-size
 .Op Fl m Ar maximum-size
 .Op Fl N Ar userdb-dir
 .Op Fl O Ar offset
 .Op Fl o Ar fs-options
 .Op Fl R Ar roundup-size
 .Op Fl S Ar sector-size
 .Op Fl s Ar image-size
 .Op Fl T Ar timestamp
 .Op Fl t Ar fs-type
 .Ar image-file
 .Ar directory | manifest
 .Op Ar extra-directory ...
 .Sh DESCRIPTION
 The utility
 .Nm
 creates a file system image into
 .Ar image-file
 from the directory tree
 .Ar directory
 or from the mtree manifest
 .Ar manifest .
 If any optional directory trees are passed in the
 .Ar extra-directory
 arguments, then the directory tree of each argument will be merged
 into the
 .Ar directory
 or
 .Ar manifest
 first before creating
 .Ar image-file .
 No special devices or privileges are required to perform this task.
 .Pp
 The options are as follows:
 .Bl -tag -width flag
 .It Fl B Ar endian
 Set the byte order of the image to
 .Ar endian .
 Valid byte orders are
 .Ql 4321 ,
 .Ql big ,
 or
 .Ql be
 for big endian, and
 .Ql 1234 ,
 .Ql little ,
 or
 .Ql le
 for little endian.
 Some file systems may have a fixed byte order; in those cases this
 argument will be ignored.
 .It Fl b Ar free-blocks
 Ensure that a minimum of
 .Ar free-blocks
 free blocks exist in the image.
 An optional
 .Ql %
 suffix may be provided to indicate that
 .Ar free-blocks
 indicates a percentage of the calculated image size.
 .It Fl D
 Treat duplicate paths in an mtree manifest as warnings, not errors.
 .It Fl d Ar debug-mask
 Enable various levels of debugging, depending upon which bits are
 set in
 .Ar debug-mask .
 XXX: document these
 .It Fl F Ar mtree-specfile
 .Em This is almost certainly not the option you are looking for.
 To create an image from a list of files in an mtree format manifest,
 specify it as the last argument on the command line, not as the
 argument to
 .Fl F .
 .Pp
 Use
 .Ar mtree-specfile
 as an
 .Xr mtree 8
 .Sq specfile
 specification.
 This option has no effect when the image is created from an mtree manifest
 rather than a directory.
 .Pp
 If a specfile entry exists in the underlying file system, its
 permissions and modification time will be used unless specifically
 overridden by the specfile.
 An error will be raised if the type of entry in the specfile
 conflicts with that of an existing entry.
 .Pp
 In the opposite case (where a specfile entry does not have an entry
 in the underlying file system) the following occurs:
 If the specfile entry is marked
 .Sy optional ,
 the specfile entry is ignored.
 Otherwise, the entry will be created in the image, and it is
 necessary to specify at least the following parameters in the
 specfile:
 .Sy type ,
 .Sy mode ,
 .Sy gname ,
 or
 .Sy gid ,
 and
 .Sy uname
 or
 .Sy uid ,
 and
 .Sy link
 (in the case of symbolic links).
 If
 .Sy time
 is not provided, the current time will be used.
 If
 .Sy flags
 is not provided, the current file flags will be used.
 Missing regular file entries will be created as zero-length files.
 .It Fl f Ar free-files
 Ensure that a minimum of
 .Ar free-files
 free files (inodes) exist in the image.
 An optional
 .Ql %
 suffix may be provided to indicate that
 .Ar free-files
 indicates a percentage of the calculated image size.
 .It Fl M Ar minimum-size
 Set the minimum size of the file system image to
 .Ar minimum-size .
 .It Fl m Ar maximum-size
 Set the maximum size of the file system image to
 .Ar maximum-size .
 An error will be raised if the target file system needs to be larger
 than this to accommodate the provided directory tree.
 .It Fl N Ar userdb-dir
 Use the user database text file
 .Pa master.passwd
 and group database text file
 .Pa group
 from
 .Ar userdb-dir ,
 rather than using the results from the system's
 .Xr getpwnam 3
 and
 .Xr getgrnam 3
 (and related) library calls.
 .It Fl O Ar offset
 Instead of creating the filesystem at the beginning of the file, start
 at
 .Ar offset .
 Valid only for
 .Sy ffs
 and
 .Sy msdos .
 .It Fl o Ar fs-options
 Set file system specific options.
 .Ar fs-options
 is a comma separated list of options.
 Valid file system specific options are detailed below.
 .It Fl p
 Deprecated.
 See the
 .Fl Z
 flag.
 .It Fl R Ar roundup-size
 Round the image up to
 .Ar roundup-size .
 .Ar roundup-size
 should be a multiple of the file system block size.
 This option only applies to the
 .Sy ffs
 file system type.
 .It Fl S Ar sector-size
 Set the file system sector size to
 .Ar sector-size .
 .\" XXX: next line also true for cd9660?
 Defaults to 512.
 .It Fl s Ar image-size
 Set the size of the file system image to
 .Ar image-size .
 This is equivalent to setting both the minimum
 .Fl ( M )
 and the maximum
 .Fl ( m )
 sizes to the same value.
 For
 .Sy ffs
 and
 .Sy msdos
 the
 .Ar image-size
 does not include the
 .Ar offset .
 .It Fl T Ar timestamp
 Specify a timestamp to be set for all filesystem files and directories
 created so that repeatable builds are possible.
 The
 .Ar timestamp
 can be a
 .Pa pathname ,
 where the timestamps are derived from that file, or an integer
 value interpreted as the number of seconds from the Epoch.
 Note that timestamps specified in an
 .Xr mtree 5
 spec file override the default timestamp.
 .It Fl t Ar fs-type
 Create an
 .Ar fs-type
 file system image.
 The following file system types are supported:
 .Bl -tag -width cd9660 -offset indent
 .It Sy ffs
 BSD fast file system (default).
 .It Sy cd9660
 ISO 9660 file system.
 .It Sy msdos
 FAT12, FAT16, or FAT32 file system.
+.It Sy zfs
+ZFS pool containing one or more file systems.
 .El
 .It Fl x
 Exclude file system nodes not explicitly listed in the specfile.
 .It Fl Z
 Create a sparse file for
 .Sy ffs .
 This is useful for virtual machine images.
 .El
 .Pp
 Where sizes are specified, a decimal number of bytes is expected.
 Two or more numbers may be separated by an
 .Dq x
 to indicate a product.
 Each number may have one of the following optional suffixes:
 .Bl -tag -width 3n -offset indent -compact
 .It b
 Block; multiply by 512
 .It k
 Kibi; multiply by 1024 (1 KiB)
 .It m
 Mebi; multiply by 1048576 (1 MiB)
 .It g
 Gibi; multiply by 1073741824 (1 GiB)
 .It t
 Tebi; multiply by 1099511627776 (1 TiB)
 .It w
 Word; multiply by the number of bytes in an integer
 .El
 .\"
 .\"
 .Ss FFS-specific options
 .Sy ffs
 images have ffs-specific optional parameters that may be provided.
 Each of the options consists of a keyword, an equal sign
 .Pq Ql = ,
 and a value.
 The following keywords are supported:
 .Pp
 .Bl -tag -width optimization -offset indent -compact
 .It Sy avgfilesize
 Expected average file size.
 .It Sy avgfpdir
 Expected number of files per directory.
 .It Sy bsize
 Block size.
 .It Sy density
 Bytes per inode.
 If unset,
 .Nm
 allocates the minimum number of inodes needed to represent the filesystem
 when no free space has been requested (neither free blocks nor a minimum
 size is set); otherwise it uses the larger of the newfs defaults or what
 is required by the free inode parameters, if set.
 .It Sy fsize
 Fragment size.
 .It Sy label
 Label name of the image.
 .It Sy maxbpg
 Maximum blocks per file in a cylinder group.
 .It Sy minfree
 Minimum % free.
 .It Sy optimization
 Optimization preference; one of
 .Ql space
 or
 .Ql time .
 .It Sy extent
 Maximum extent size.
 .It Sy maxbpcg
 Maximum total number of blocks in a cylinder group.
 .It Sy version
 UFS version.
 1 for FFS (default), 2 for UFS2.
 .It Sy softupdates
 0 to disable (default), 1 to enable.
 .El
 .Ss CD9660-specific options
 .Sy cd9660
 images have ISO9660-specific optional parameters that may be
 provided.
 The arguments consist of a keyword and, optionally, an equal sign
 .Pq Ql = ,
 and a value.
 The following keywords are supported:
 .Pp
 .Bl -tag -width omit-trailing-period -offset indent -compact
 .It Sy allow-deep-trees
 Allow the directory structure to exceed the maximum specified in
 the spec.
 .It Sy allow-illegal-chars
 Allow illegal characters in filenames.
 This option is not implemented.
 .It Sy allow-lowercase
 Allow lowercase characters in filenames.
 This option is not implemented.
 .It Sy allow-max-name
 Allow 37 instead of 33 characters for filenames by omitting the
 version id.
 .It Sy allow-multidot
 Allow multiple dots in a filename.
 .It Sy applicationid
 Application ID of the image.
 .It Sy archimedes
 Use the
 .Ql ARCHIMEDES
 extension to encode
 .Tn RISC OS
 metadata.
 .It Sy bootimagedir
 Boot image directory.
 This option is not implemented.
 .It Sy chrp-boot
 Write an MBR partition table to the image to allow older CHRP hardware to
 boot.
 .It Sy boot-load-segment
 Set load segment for the boot image.
 .It Sy bootimage
 Filename of a boot image in the format
 .Dq sysid;filename ,
 where
 .Dq sysid
 is one of
 .Ql efi ,
 .Ql i386 ,
 .Ql mac68k ,
 .Ql macppc ,
 or
 .Ql powerpc .
 .It Sy generic-bootimage
 Load a generic boot image into the first 32K of the cd9660 image.
 .It Sy hard-disk-boot
 Boot image is a hard disk image.
 .It Sy isolevel
 An integer representing the ISO 9660 interchange level where
 .Dq level
 is either
 .Ql 1
 or
 .Ql 2 .
 .Dq level
 .Ql 3
 is not implemented.
 .It Sy keep-bad-images
 Do not discard images whose write was aborted due to an error.
 For debugging purposes.
 .It Sy label
 Label name of the image.
 .It Sy no-boot
 Boot image is not bootable.
 .It Sy no-emul-boot
 Boot image is a
 .Dq no emulation
 El Torito image.
 .It Sy no-trailing-padding
 Do not pad the image (apparently Linux needs the padding).
 .It Sy omit-trailing-period
 Omit trailing periods in filenames.
 .It Sy platformid
 Set platform ID of section header entry of the boot image.
 .It Sy preparer
 Preparer ID of the image.
 .It Sy publisher
 Publisher ID of the image.
 .It Sy rockridge
 Use RockRidge extensions (for longer filenames, etc.).
 .It Sy verbose
 Turns on verbose output.
 .It Sy volumeid
 Volume set identifier of the image.
 .El
 .Ss msdos-specific options
 .Sy msdos
 images have MS-DOS-specific optional parameters that may be
 provided.
 The arguments consist of a keyword, an equal sign
 .Pq Ql = ,
 and a value.
 The following keywords are supported (see
 .Xr newfs_msdos 8
 for more details):
 .Pp
 .Bl -tag -width omit-trailing-period -offset indent -compact
 .It Cm backup_sector
 Location of the backup boot sector.
 .It Cm block_size
 Block size.
 .It Cm bootstrap
 Bootstrap file.
 .It Cm bytes_per_sector
 Bytes per sector.
 .It Cm create_size
 Create file size.
 .It Cm directory_entries
 Directory entries.
 .It Cm drive_heads
 Drive heads.
 .It Cm fat_type
 FAT type (12, 16, or 32).
 .It Cm floppy
 Preset drive parameters for standard format floppy disks
 (160, 180, 320, 360, 640, 720, 1200, 1232, 1440, or 2880).
 .It Cm hidden_sectors
 Hidden sectors.
 .It Cm info_sector
 Location of the info sector.
 .It Cm media_descriptor
 Media descriptor.
 .It Cm num_FAT
 Number of FATs.
 .It Cm OEM_string
 OEM string.
 .It Cm offset
 Offset in device.
 This option will be ignored if
 .Fl O
 is set to a positive number.
 .It Cm reserved_sectors
 Reserved sectors.
 .It Cm sectors_per_cluster
 Sectors per cluster.
 .It Cm sectors_per_fat
 Sectors per FAT.
 .It Cm sectors_per_track
 Sectors per track.
 .It Cm size
 File System size.
 .It Cm volume_id
 Volume ID.
 .It Cm volume_label
 Volume Label.
 .El
+.Ss zfs-specific options
+Note: ZFS support is currently considered experimental.
+Do not use it for anything critical.
+.Pp
+The image created by
+.Nm
+contains a ZFS pool with a single vdev of type
+.Ql disk .
+The root dataset is always created implicitly and contains the entire input
+directory tree unless additional datasets are specified using the options
+described below.
+.Pp
+The arguments consist of a keyword, an equal sign
+.Pq Ql = ,
+and a value.
+The following keywords are supported:
+.Pp
+.Bl -tag -width omit-trailing-period -offset indent -compact
+.It ashift
+The base-2 logarithm of the minimum block size.
+Typical values are 9 (512B blocks) and 12 (4KB blocks).
+The default value is 12.
+.It bootfs
+The name of the bootable dataset for the pool.
+Specifying this option causes the
+.Ql bootfs
+property to be set in the created pool.
+.It mssize
+The size of metaslabs in the created pool.
+By default,
+.Nm
+allocates large (up to 512MB) metaslabs with the expectation that
+the image will be auto-expanded upon first use.
+This option allows the default heuristic to be overridden.
+.It poolname
+The name of the ZFS pool.
+This option must be specified.
+.It rootpath
+An implicit path prefix added to dataset mountpoints.
+By default it is
+.Pa /<poolname> .
+For creating bootable pools, the
+.Va rootpath
+should be set to
+.Pa / .
+At least one dataset must have a mountpoint equal to
+.Va rootpath .
+.It fs
+Create an additional dataset.
+This option may be specified multiple times.
+The argument value must be of the form
+.Ar <dataset>[;<prop1=v1>[;<prop2=v2>[;...]]] ,
+where
+.Ar dataset
+is the name of the dataset and must belong to the pool's namespace.
+For example, with a pool name of
+.Ql test
+all dataset names must be prefixed by
+.Ql test/ .
+A dataset must exist at each level of the pool's namespace.
+For example, to create
+.Ql test/foo/bar ,
+.Ql test/foo
+must be created as well.
+.Pp
+The dataset mountpoints determine how the datasets are populated with
+files from the staged directory tree.
+Conceptually, all datasets are mounted before any are populated with files.
+The root of the staged directory tree is mapped to
+.Va rootpath .
+.Pp
+Dataset properties, as described in
+.Xr zfsprops 8 ,
+may be specified following the dataset name.
+The following properties may be set for a dataset:
+.Pp
+.Bl -tag -compact -offset indent
+.It atime
+.It canmount
+.It exec
+.It mountpoint
+.It setuid
+.El
+.El
 .Sh SEE ALSO
 .Xr mtree 5 ,
 .Xr mtree 8 ,
-.Xr newfs 8
+.Xr newfs 8 ,
+.Xr zfsconcepts 8 ,
+.Xr zfsprops 8 ,
+.Xr zpoolprops 8
 .Sh HISTORY
 The
 .Nm
 utility appeared in
 .Nx 1.6 .
 It was ported to
 .Fx
 and first appeared in
 .Fx 8.0 .
 .Sh AUTHORS
 .An Luke Mewburn
 .Aq Mt lukem@NetBSD.org
 (original program),
 .An Daniel Watt ,
 .An Walter Deignan ,
 .An Ryan Gabrys ,
 .An Alan Perez-Rathke ,
 .An Ram Vedam
 (cd9660 support),
 .An Christos Zoulas
-(msdos support).
+(msdos support),
+.An Mark Johnston
+(zfs support).
diff --git a/usr.sbin/makefs/makefs.c b/usr.sbin/makefs/makefs.c
index 888a2b3edea7..2a50768d3152 100644
--- a/usr.sbin/makefs/makefs.c
+++ b/usr.sbin/makefs/makefs.c
@@ -1,507 +1,510 @@
 /*	$NetBSD: makefs.c,v 1.26 2006/10/22 21:11:56 christos Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2001-2003 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Luke Mewburn for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project by
  *      Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <unistd.h>
 #include <stdbool.h>
 #include <util.h>
 
 #include "makefs.h"
 #include "mtree.h"
 
 /*
  * list of supported file systems and dispatch functions
  */
 /*
  * One supported file system: its name as given to -t plus the hooks that
  * main() dispatches to.
  */
 typedef struct {
 	/* Name matched against the -t argument by get_fstype(). */
 	const char	*type;
 	/* Called once the type is selected, before any -o option is parsed. */
 	void		(*prepare_options)(fsinfo_t *);
 	/* Parses one -o element; a zero return makes main() print usage. */
 	int		(*parse_options)(const char *, fsinfo_t *);
 	/* Called on the current type when -t selects another one. */
 	void		(*cleanup_options)(fsinfo_t *);
 	/* Builds the image: (image file, subtree, root node, options). */
 	void		(*make_fs)(const char *, const char *, fsnode *,
 				fsinfo_t *);
 } fstype_t;
 
 /*
  * Dispatch table searched by get_fstype().  ENTRY(name) builds a row whose
  * type string is "name" and whose hooks are name_prep_opts,
  * name_parse_opts, name_cleanup_opts and name_makefs.  The zfs row is only
  * compiled in when HAVE_ZFS is defined.
  */
 static fstype_t fstypes[] = {
 #define ENTRY(name) { \
 	# name, name ## _prep_opts, name ## _parse_opts, \
 	name ## _cleanup_opts, name ## _makefs  \
 }
 	ENTRY(cd9660),
 	ENTRY(ffs),
 	ENTRY(msdos),
+#ifdef HAVE_ZFS
+	ENTRY(zfs),
+#endif
 	/* Sentinel: a NULL type ends the table. */
 	{ .type = NULL	},
 };
 
 /* Debug bit mask set by -d; tested against DEBUG_* flags. */
 u_int		debug;
 /* Set to 1 by -D. */
 int		dupsok;
 /* Wall-clock time sampled once at start-up in main(). */
 struct timespec	start_time;
 /* Filled in by get_tstamp() when -T is given. */
 struct stat stampst;
 
 static	fstype_t *get_fstype(const char *);
 static int get_tstamp(const char *, struct stat *);
 static	void	usage(fstype_t *, fsinfo_t *);
 
 /*
  * Remove a trailing '%' from arg, if there is one.
  *
  * Returns true when a '%' was stripped (the caller then treats the value as
  * a percentage) and false otherwise.  An empty string is handled safely and
  * is reported as "no suffix".
  */
 static bool
 strip_percent_suffix(char *arg)
 {
 	size_t len = strlen(arg);
 
 	if (len == 0 || arg[len - 1] != '%')
 		return (false);
 	arg[len - 1] = '\0';
 	return (true);
 }
 
 /*
  * Parse the command line, build the in-memory fsnode tree from a directory
  * or an mtree manifest (plus any extra directories and an optional
  * specfile), and hand it to the selected file system's make_fs hook.
  *
  * After option parsing, argv[0] is the image file, argv[1] the directory or
  * manifest ("-" means a manifest on standard input) and argv[2..] the extra
  * directories merged into the tree.
  */
 int
 main(int argc, char *argv[])
 {
 	struct stat	 sb;
 	struct timeval	 start;
 	fstype_t	*fstype;
 	fsinfo_t	 fsoptions;
 	fsnode		*root;
 	int	 	 ch, i;
 	const char	*subtree;
 	const char	*specfile;
 
 	setprogname(argv[0]);
 
 	debug = 0;
 	if ((fstype = get_fstype(DEFAULT_FSTYPE)) == NULL)
 		errx(1, "Unknown default fs type `%s'.", DEFAULT_FSTYPE);
 
 	/* set default fsoptions */
 	(void)memset(&fsoptions, 0, sizeof(fsoptions));
 	fsoptions.fd = -1;
 	fsoptions.sectorsize = -1;
 
 	if (fstype->prepare_options)
 		fstype->prepare_options(&fsoptions);
 
 	specfile = NULL;
 #ifdef CLOCK_REALTIME
 	ch = clock_gettime(CLOCK_REALTIME, &start_time);
 #else
 	ch = gettimeofday(&start, NULL);
 	start_time.tv_sec = start.tv_sec;
 	start_time.tv_nsec = start.tv_usec * 1000;
 #endif
 	if (ch == -1)
 		err(1, "Unable to get system time");
 
 
 	while ((ch = getopt(argc, argv, "B:b:Dd:f:F:M:m:N:O:o:pR:s:S:t:T:xZ")) != -1) {
 		switch (ch) {
 
 		case 'B':
 			/* Only swap when the requested order differs from the host's. */
 			if (strcmp(optarg, "be") == 0 ||
 			    strcmp(optarg, "4321") == 0 ||
 			    strcmp(optarg, "big") == 0) {
 #if BYTE_ORDER == LITTLE_ENDIAN
 				fsoptions.needswap = 1;
 #endif
 			} else if (strcmp(optarg, "le") == 0 ||
 			    strcmp(optarg, "1234") == 0 ||
 			    strcmp(optarg, "little") == 0) {
 #if BYTE_ORDER == BIG_ENDIAN
 				fsoptions.needswap = 1;
 #endif
 			} else {
 				warnx("Invalid endian `%s'.", optarg);
 				usage(fstype, &fsoptions);
 			}
 			break;
 
 		case 'b':
 			/* "N%" is a percentage, a bare "N" an absolute count. */
 			if (strip_percent_suffix(optarg)) {
 				fsoptions.freeblockpc =
 				    strsuftoll("free block percentage",
 					optarg, 0, 99);
 			} else {
 				fsoptions.freeblocks =
 				    strsuftoll("free blocks",
 					optarg, 0, LLONG_MAX);
 			}
 			break;
 
 		case 'D':
 			dupsok = 1;
 			break;
 
 		case 'd':
 			debug = strtoll(optarg, NULL, 0);
 			break;
 
 		case 'f':
 			/* Same "N%" / "N" convention as -b. */
 			if (strip_percent_suffix(optarg)) {
 				fsoptions.freefilepc =
 				    strsuftoll("free file percentage",
 					optarg, 0, 99);
 			} else {
 				fsoptions.freefiles =
 				    strsuftoll("free files",
 					optarg, 0, LLONG_MAX);
 			}
 			break;
 
 		case 'F':
 			specfile = optarg;
 			break;
 
 		case 'M':
 			fsoptions.minsize =
 			    strsuftoll("minimum size", optarg, 1LL, LLONG_MAX);
 			break;
 
 		case 'N':
 			if (! setup_getid(optarg))
 				errx(1,
 			    "Unable to use user and group databases in `%s'",
 				    optarg);
 			break;
 
 		case 'm':
 			fsoptions.maxsize =
 			    strsuftoll("maximum size", optarg, 1LL, LLONG_MAX);
 			break;
 
 		case 'O':
 			fsoptions.offset =
 			    strsuftoll("offset", optarg, 0LL, LLONG_MAX);
 			break;
 
 		case 'o':
 		{
 			char *p;
 
 			/* Each comma-separated element goes to the fs-specific parser. */
 			while ((p = strsep(&optarg, ",")) != NULL) {
 				if (*p == '\0')
 					errx(1, "Empty option");
 				if (! fstype->parse_options(p, &fsoptions))
 					usage(fstype, &fsoptions);
 			}
 			break;
 		}
 		case 'p':
 			/* Deprecated in favor of 'Z' */
 			fsoptions.sparse = 1;
 			break;
 
 		case 'R':
 			/* Round image size up to specified block size */
 			fsoptions.roundup =
 			    strsuftoll("roundup-size", optarg, 0, LLONG_MAX);
 			break;
 
 		case 's':
 			fsoptions.minsize = fsoptions.maxsize =
 			    strsuftoll("size", optarg, 1LL, LLONG_MAX);
 			break;
 
 		case 'S':
 			fsoptions.sectorsize =
 			    (int)strsuftoll("sector size", optarg,
 				1LL, INT_MAX);
 			break;
 
 		case 't':
 			/* Check current one and cleanup if necessary. */
 			if (fstype->cleanup_options)
 				fstype->cleanup_options(&fsoptions);
 			fsoptions.fs_specific = NULL;
 			if ((fstype = get_fstype(optarg)) == NULL)
 				errx(1, "Unknown fs type `%s'.", optarg);
 			/* Guard the hook the same way the start-up path does. */
 			if (fstype->prepare_options)
 				fstype->prepare_options(&fsoptions);
 			break;
 
 		case 'T':
 			if (get_tstamp(optarg, &stampst) == -1)
 				errx(1, "Cannot get timestamp from `%s'",
 				    optarg);
 			break;
 
 		case 'x':
 			fsoptions.onlyspec = 1;
 			break;
 
 		case 'Z':
 			/* Supersedes 'p' for compatibility with NetBSD makefs(8) */
 			fsoptions.sparse = 1;
 			break;
 
 		case '?':
 		default:
 			usage(fstype, &fsoptions);
 			/* NOTREACHED */
 
 		}
 	}
 	if (debug) {
 		printf("debug mask: 0x%08x\n", debug);
 		/* Zero-pad the nanoseconds so the fraction reads correctly. */
 		printf("start time: %ld.%09ld, %s",
 		    (long)start_time.tv_sec, (long)start_time.tv_nsec,
 		    ctime(&start_time.tv_sec));
 	}
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 2)
 		usage(fstype, &fsoptions);
 
 	/* -x must be accompanied by -F */
 	if (fsoptions.onlyspec != 0 && specfile == NULL)
 		errx(1, "-x requires -F mtree-specfile.");
 
 	/* Accept '-' as meaning "read from standard input". */
 	if (strcmp(argv[1], "-") == 0)
 		sb.st_mode = S_IFREG;
 	else {
 		if (stat(argv[1], &sb) == -1)
 			err(1, "Can't stat `%s'", argv[1]);
 	}
 
 	switch (sb.st_mode & S_IFMT) {
 	case S_IFDIR:		/* walk the tree */
 		subtree = argv[1];
 		TIMER_START(start);
 		root = walk_dir(subtree, ".", NULL, NULL);
 		TIMER_RESULTS(start, "walk_dir");
 		break;
 	case S_IFREG:		/* read the manifest file */
 		subtree = ".";
 		TIMER_START(start);
 		root = read_mtree(argv[1], NULL);
 		TIMER_RESULTS(start, "manifest");
 		break;
 	default:
 		errx(1, "%s: not a file or directory", argv[1]);
 		/* NOTREACHED */
 	}
 
 	/* append extra directory */
 	for (i = 2; i < argc; i++) {
 		if (stat(argv[i], &sb) == -1)
 			err(1, "Can't stat `%s'", argv[i]);
 		if (!S_ISDIR(sb.st_mode))
 			errx(1, "%s: not a directory", argv[i]);
 		TIMER_START(start);
 		root = walk_dir(argv[i], ".", NULL, root);
 		TIMER_RESULTS(start, "walk_dir2");
 	}
 
 	if (specfile) {		/* apply a specfile */
 		TIMER_START(start);
 		apply_specfile(specfile, subtree, root, fsoptions.onlyspec);
 		TIMER_RESULTS(start, "apply_specfile");
 	}
 
 	if (debug & DEBUG_DUMP_FSNODES) {
 		printf("\nparent: %s\n", subtree);
 		dump_fsnodes(root);
 		putchar('\n');
 	}
 
 	/* build the file system */
 	TIMER_START(start);
 	fstype->make_fs(argv[0], subtree, root, &fsoptions);
 	TIMER_RESULTS(start, "make_fs");
 
 	free_fsnodes(root);
 
 	exit(0);
 	/* NOTREACHED */
 }
 
 /*
  * Split "name=value" (or a bare "name") and apply it through
  * set_option_var().  Only the first '=' separates name from value; when
  * there is none the value is the empty string.  Returns whatever
  * set_option_var() returns.
  */
 int
 set_option(const option_t *options, const char *option, char *buf, size_t len)
 {
 	char *name, *value, *eq;
 	int rv;
 
 	assert(option != NULL);
 
 	/* Work on a private copy so the caller's string is left untouched. */
 	name = estrdup(option);
 	eq = strchr(name, '=');
 	if (eq != NULL) {
 		*eq = '\0';
 		value = eq + 1;
 	} else {
 		/* No '=': point the value at the terminating NUL. */
 		value = name + strlen(name);
 	}
 	rv = set_option_var(options, name, value, buf, len);
 	free(name);
 	return rv;
 }
 
 /*
  * Look up option `var' in the NULL-name-terminated table `options' and
  * store `val' into it according to the option's type.
  *
  * A one-character `var' is matched against each option's letter; anything
  * else is matched against the option's name.  For the numeric types an
  * empty `val' stores 1, otherwise the value is parsed by strsuftoll() and
  * bounded by the option's minimum and maximum.  OPT_STRBUF copies into the
  * caller-supplied `buf' of size `len'.
  *
  * Returns the index of the matched option, -1 when no option matches, and
  * 0 when the matched option has an unknown type.
  */
 int
 set_option_var(const option_t *options, const char *var, const char *val,
     char *buf, size_t len)
 {
 	char *s;
 	size_t i;
 	bool by_letter;
 
 #define NUM(type) \
 	if (!*val) { \
 		*(type *)options[i].value = 1; \
 		break; \
 	} \
 	*(type *)options[i].value = (type)strsuftoll(options[i].desc, val, \
 	    options[i].minimum, options[i].maximum); break
 
 	/*
 	 * Check var[0] before var[1]: for an empty name var[1] lies past the
 	 * terminator, and an empty name must never match a letterless option.
 	 */
 	by_letter = (var[0] != '\0' && var[1] == '\0');
 
 	for (i = 0; options[i].name != NULL; i++) {
 		if (by_letter) {
 			if (options[i].letter != var[0])
 				continue;
 		} else if (strcmp(options[i].name, var) != 0)
 			continue;
 		switch (options[i].type) {
 		case OPT_BOOL:
 			*(bool *)options[i].value = 1;
 			break;
 		case OPT_STRARRAY:
 			strlcpy((void *)options[i].value, val, (size_t)
 			    options[i].maximum);
 			break;
 		case OPT_STRPTR:
 			s = estrdup(val);
 			*(char **)options[i].value = s;
 			break;
 		case OPT_STRBUF:
 			if (buf == NULL)
 				abort();
 			strlcpy(buf, val, len);
 			break;
 		case OPT_INT64:
 			NUM(uint64_t);
 		case OPT_INT32:
 			NUM(uint32_t);
 		case OPT_INT16:
 			NUM(uint16_t);
 		case OPT_INT8:
 			NUM(uint8_t);
 		default:
 			/* Report the option's name, not the value being assigned. */
 			warnx("Unknown type %d in option %s", options[i].type,
 			    options[i].name);
 			return 0;
 		}
 		return (int)i;
 	}
 	warnx("Unknown option `%s'", var);
 	return -1;
 #undef NUM
 }
 
 
 static fstype_t *
 get_fstype(const char *type)
 {
 	int i;
 	
 	for (i = 0; fstypes[i].type != NULL; i++)
 		if (strcmp(fstypes[i].type, type) == 0)
 			return (&fstypes[i]);
 	return (NULL);
 }
 
 /*
  * Return a newly allocated copy of the option table `o', including its
  * terminating entry (the one whose name is NULL).
  */
 option_t *
 copy_opts(const option_t *o)
 {
 	option_t *copy;
 	size_t count;
 
 	/* Count the named entries, then add one for the terminator. */
 	count = 0;
 	while (o[count].name)
 		count++;
 	count++;
 
 	copy = ecalloc(count, sizeof(*o));
 	memcpy(copy, o, count * sizeof(*o));
 	return copy;
 }
 
 /*
  * Fill `st' with the timestamp described by `b'.
  *
  * If `b' can be stat(2)ed, the result of that call is left in `st'.
  * Otherwise `b' must be an integer in strtoll() base-0 syntax; it is stored
  * as the access, change and modification time (and birth time where
  * HAVE_STRUCT_STAT_BIRTHTIME is defined) and st_ino is set to 1.
  *
  * Returns 0 on success and -1 when `b' is neither.
  */
 static int
 get_tstamp(const char *b, struct stat *st)
 {
 	char *end;
 	long long secs;
 	time_t stamp;
 
 	if (stat(b, st) != -1)
 		return 0;
 
 	/* Not a path: the whole string must parse as one integer. */
 	errno = 0;
 	secs = strtoll(b, &end, 0);
 	if (end == b || *end != '\0' || errno != 0)
 		return -1;
 	stamp = (time_t)secs;
 
 	st->st_ino = 1;
 	st->st_atime = stamp;
 	st->st_ctime = stamp;
 	st->st_mtime = stamp;
 #ifdef HAVE_STRUCT_STAT_BIRTHTIME
 	st->st_birthtime = stamp;
 #endif
 	return 0;
 }
 
 /*
  * Print the command synopsis to stderr, followed by the option table of
  * the selected file system type when one is available, then exit with
  * status 1.  Never returns.
  */
 static void
 usage(fstype_t *fstype, fsinfo_t *fsoptions)
 {
 	const char *prog;
 
 	prog = getprogname();
 	/* -D is accepted by getopt in main(), so list it with the other flags. */
 	fprintf(stderr,
 "Usage: %s [-DxZ] [-B endian] [-b free-blocks] [-d debug-mask]\n"
 "\t[-F mtree-specfile] [-f free-files] [-M minimum-size] [-m maximum-size]\n"
 "\t[-N userdb-dir] [-O offset] [-o fs-options] [-R roundup-size]\n"
 "\t[-S sector-size] [-s image-size] [-T <timestamp/file>] [-t fs-type]\n"
 "\timage-file directory | manifest [extra-directory ...]\n",
 	    prog);
 
 	/* Skip the table rather than dereference a NULL fs_options. */
 	if (fstype && fsoptions->fs_options != NULL) {
 		size_t i;
 		option_t *o = fsoptions->fs_options;
 
 		fprintf(stderr, "\n%s specific options:\n", fstype->type);
 		for (i = 0; o[i].name != NULL; i++)
 			fprintf(stderr, "\t%c%c%20.20s\t%s\n",
 			    o[i].letter ? o[i].letter : ' ',
 			    o[i].letter ? ',' : ' ',
 			    o[i].name, o[i].desc);
 	}
 	exit(1);
 }
diff --git a/usr.sbin/makefs/makefs.h b/usr.sbin/makefs/makefs.h
index 68dc0362dd21..e88313e8366d 100644
--- a/usr.sbin/makefs/makefs.h
+++ b/usr.sbin/makefs/makefs.h
@@ -1,306 +1,311 @@
 /*	$NetBSD: makefs.h,v 1.20 2008/12/28 21:51:46 christos Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2001 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Luke Mewburn for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project by
  *      Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_MAKEFS_H
 #define	_MAKEFS_H
 
 #include <sys/stat.h>
 #include <err.h>
 
 /*
  * fsnode -
  *	a component of the tree; contains a filename, a pointer to
  *	fsinode, optional symlink name, and tree pointers
  *
  * fsinode - 
  *	equivalent to an inode, containing target file system inode number,
  *	refcount (nlink), and stat buffer
  *
  * A tree of fsnodes looks like this:
  *
  *	name	"."		"bin"		"netbsd"
  *	type	S_IFDIR		S_IFDIR		S_IFREG
  *	next 	  >		  >		NULL
  *	parent	NULL		NULL		NULL
  *	child	NULL		  v
  *
  *	name			"."		"ls"
  *	type			S_IFDIR		S_IFREG
  *	next			  >		NULL
  *	parent			  ^		^ (to "bin")
  *	child			NULL		NULL
  *
  * Notes:
  *	-   first always points to first entry, at current level, which
  *	    must be "." when the tree has been built; during build it may
  *	    not be if "." hasn't yet been found by readdir(2).
  */
 
 enum fi_flags {
 	FI_SIZED =	1<<0,		/* inode sized */
 	FI_ALLOCATED =	1<<1,		/* fsinode->ino allocated */
 	FI_WRITTEN =	1<<2,		/* inode written */
+	FI_ROOT =	1<<3,		/* root of a ZFS dataset */
 };
 
 typedef struct {
 	uint32_t	 ino;		/* inode number used on target fs */
 	uint32_t	 nlink;		/* number of links to this entry */
 	enum fi_flags	 flags;		/* flags used by fs specific code */
+	void		*param;		/* for use by individual fs impls */
 	struct stat	 st;		/* stat entry */
 } fsinode;
 
 typedef struct _fsnode {
 	struct _fsnode	*parent;	/* parent (NULL if root) */
 	struct _fsnode	*child;		/* child (if type == S_IFDIR) */
 	struct _fsnode	*next;		/* next */
 	struct _fsnode	*first;		/* first node of current level (".") */
 	uint32_t	 type;		/* type of entry */
 	fsinode		*inode;		/* actual inode data */
 	char		*symlink;	/* symlink target */
 	char		*contents;	/* file to provide contents */
 	const char	*root;		/* root path */
 	char		*path;		/* directory name */
 	char		*name;		/* file name */
 	int		flags;		/* misc flags */
 } fsnode;
 
 #define	FSNODE_F_HASSPEC	0x01	/* fsnode has a spec entry */
 #define	FSNODE_F_OPTIONAL	0x02	/* fsnode is optional */
 
 /*
  * option_t - contains option name, description, pointer to location to store
  * result, and range checks for the result. Used to simplify fs specific
  * option setting
  */
 typedef enum {
 	OPT_STRARRAY,
 	OPT_STRPTR,
 	OPT_STRBUF,
 	OPT_BOOL,
 	OPT_INT8,
 	OPT_INT16,
 	OPT_INT32,
 	OPT_INT64
 } opttype_t;
 
 typedef struct {
 	char		letter;		/* option letter NUL for none */
 	const char	*name;		/* option name */
 	void		*value;		/* where to stuff the value */
 	opttype_t	type;		/* type of entry */
 	long long	minimum;	/* minimum for value */
 	long long	maximum;	/* maximum for value */
 	const char	*desc;		/* option description */
 } option_t;
 
 /*
  * fsinfo_t - contains various settings and parameters pertaining to
  * the image, including current settings, global options, and fs
  * specific options
  */
 typedef struct makefs_fsinfo {
 		/* current settings */
 	off_t	size;		/* total size */
 	off_t	inodes;		/* number of inodes */
 	uint32_t curinode;	/* current inode */
 
 		/* image settings */
 	int	fd;		/* file descriptor of image */
 	void	*superblock;	/* superblock */
 	int	onlyspec;	/* only add entries in specfile */
 
 
 		/* global options */
 	off_t	minsize;	/* minimum size image should be */
 	off_t	maxsize;	/* maximum size image can be */
 	off_t	freefiles;	/* free file entries to leave */
 	off_t	freeblocks;	/* free blocks to leave */
 	off_t	offset;		/* offset from start of file */
 	off_t	roundup;	/* round image size up to this value */
 	int	freefilepc;	/* free file % */
 	int	freeblockpc;	/* free block % */
 	int	needswap;	/* non-zero if byte swapping needed */
 	int	sectorsize;	/* sector size */
 	int	sparse;		/* sparse image, don't fill it with zeros */
 
 	void	*fs_specific;	/* File system specific additions. */
 	option_t *fs_options;	/* File system specific options */
 } fsinfo_t;
 
 
 void		apply_specfile(const char *, const char *, fsnode *, int);
 void		dump_fsnodes(fsnode *);
 const char *	inode_type(mode_t);
 fsnode *	read_mtree(const char *, fsnode *);
 int		set_option(const option_t *, const char *, char *, size_t);
 int		set_option_var(const option_t *, const char *, const char *,
     char *, size_t);
 fsnode *	walk_dir(const char *, const char *, fsnode *, fsnode *);
 void		free_fsnodes(fsnode *);
 option_t *	copy_opts(const option_t *);
 
 #define DECLARE_FUN(fs)							\
 void		fs ## _prep_opts(fsinfo_t *);				\
 int		fs ## _parse_opts(const char *, fsinfo_t *);		\
 void		fs ## _cleanup_opts(fsinfo_t *);			\
 void		fs ## _makefs(const char *, const char *, fsnode *, fsinfo_t *)
 
 DECLARE_FUN(cd9660);
 DECLARE_FUN(ffs);
 DECLARE_FUN(msdos);
+#ifdef HAVE_ZFS
+DECLARE_FUN(zfs);
+#endif
 
 extern	u_int		debug;
 extern	int		dupsok;
 extern	struct timespec	start_time;
 extern	struct stat stampst;
 
 /*
  * If -x is specified, we want to exclude nodes which do not appear
  * in the spec file.
  */
 #define	FSNODE_EXCLUDE_P(opts, fsnode)	\
 	((opts)->onlyspec != 0 && ((fsnode)->flags & FSNODE_F_HASSPEC) == 0)
 
 #define	DEBUG_TIME			0x00000001
 		/* debug bits 1..3 unused at this time */
 #define	DEBUG_WALK_DIR			0x00000010
 #define	DEBUG_WALK_DIR_NODE		0x00000020
 #define	DEBUG_WALK_DIR_LINKCHECK	0x00000040
 #define	DEBUG_DUMP_FSNODES		0x00000080
 #define	DEBUG_DUMP_FSNODES_VERBOSE	0x00000100
 #define	DEBUG_FS_PARSE_OPTS		0x00000200
 #define	DEBUG_FS_MAKEFS			0x00000400
 #define	DEBUG_FS_VALIDATE		0x00000800
 #define	DEBUG_FS_CREATE_IMAGE		0x00001000
 #define	DEBUG_FS_SIZE_DIR		0x00002000
 #define	DEBUG_FS_SIZE_DIR_NODE		0x00004000
 #define	DEBUG_FS_SIZE_DIR_ADD_DIRENT	0x00008000
 #define	DEBUG_FS_POPULATE		0x00010000
 #define	DEBUG_FS_POPULATE_DIRBUF	0x00020000
 #define	DEBUG_FS_POPULATE_NODE		0x00040000
 #define	DEBUG_FS_WRITE_FILE		0x00080000
 #define	DEBUG_FS_WRITE_FILE_BLOCK	0x00100000
 #define	DEBUG_FS_MAKE_DIRBUF		0x00200000
 #define	DEBUG_FS_WRITE_INODE		0x00400000
 #define	DEBUG_BUF_BREAD			0x00800000
 #define	DEBUG_BUF_BWRITE		0x01000000
 #define	DEBUG_BUF_GETBLK		0x02000000
 #define	DEBUG_APPLY_SPECFILE		0x04000000
 #define	DEBUG_APPLY_SPECENTRY		0x08000000
 #define	DEBUG_APPLY_SPECONLY		0x10000000
 #define	DEBUG_MSDOSFS			0x20000000
 
 
 #define	TIMER_START(x)				\
 	if (debug & DEBUG_TIME)			\
 		gettimeofday(&(x), NULL)
 
 #define	TIMER_RESULTS(x,d)				\
 	if (debug & DEBUG_TIME) {			\
 		struct timeval end, td;			\
 		gettimeofday(&end, NULL);		\
 		timersub(&end, &(x), &td);		\
 		printf("%s took %lld.%06ld seconds\n",	\
 		    (d), (long long)td.tv_sec,		\
 		    (long)td.tv_usec);			\
 	}
 
 
 #ifndef	DEFAULT_FSTYPE
 #define	DEFAULT_FSTYPE	"ffs"
 #endif
 
 
 /*
  *	ffs specific settings
  *	---------------------
  */
 
 #define	FFS_EI		/* for opposite endian support in ffs headers */
 
 /*
  * Write-arounds/compat shims for endian-agnostic support.
  * These belong in the kernel if/when it's possible to mount
  * filesystems w/ either byte order.
  */
 
 /*
  * File system internal flags, also in fs_flags.
  * (Pick highest number to avoid conflicts with others)
  */
 #define        FS_SWAPPED      0x80000000      /* file system is endian swapped */
 #define        FS_INTERNAL     0x80000000      /* mask for internal flags */
 
 #define        FS_ISCLEAN      1
 
 #define        DINODE1_SIZE    (sizeof(struct ufs1_dinode))
 #define        DINODE2_SIZE    (sizeof(struct ufs2_dinode))
 
 #define UFS1_MAXSYMLINKLEN   ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t))
 #define UFS2_MAXSYMLINKLEN   ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t))
 
 #if (BYTE_ORDER == LITTLE_ENDIAN)
 #define DIRSIZ_SWAP(oldfmt, dp, needswap)      \
     (((oldfmt) && !(needswap)) ?       \
     DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen))
 #else
 #define DIRSIZ_SWAP(oldfmt, dp, needswap)      \
     (((oldfmt) && (needswap)) ?                \
     DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen))
 #endif
 
 #define        cg_chkmagic_swap(cgp, ns) \
     (ufs_rw32((cgp)->cg_magic, (ns)) == CG_MAGIC)
 #define        cg_inosused_swap(cgp, ns) \
     ((u_int8_t *)((u_int8_t *)(cgp) + ufs_rw32((cgp)->cg_iusedoff, (ns))))
 #define        cg_blksfree_swap(cgp, ns) \
     ((u_int8_t *)((u_int8_t *)(cgp) + ufs_rw32((cgp)->cg_freeoff, (ns))))
 #define        cg_clustersfree_swap(cgp, ns) \
     ((u_int8_t *)((u_int8_t *)(cgp) + ufs_rw32((cgp)->cg_clusteroff, (ns))))
 #define        cg_clustersum_swap(cgp, ns) \
     ((int32_t *)((uintptr_t)(cgp) + ufs_rw32((cgp)->cg_clustersumoff, ns)))
 
 struct fs;
 void   ffs_fragacct_swap(struct fs *, int, uint32_t [], int, int);
 
 fsinode *link_check(fsinode *);
 
 #endif	/* _MAKEFS_H */
diff --git a/usr.sbin/makefs/tests/Makefile b/usr.sbin/makefs/tests/Makefile
index 85e4b233aea7..c2c9f6bea5b6 100644
--- a/usr.sbin/makefs/tests/Makefile
+++ b/usr.sbin/makefs/tests/Makefile
@@ -1,18 +1,24 @@
 # $FreeBSD$
 
+.include <src.opts.mk>
+
 ATF_TESTS_SH+=	makefs_cd9660_tests
 ATF_TESTS_SH+=	makefs_ffs_tests
+# makefs is only built with ZFS support when MK_ZFS is enabled.
+.if ${MK_ZFS} != "no"
+ATF_TESTS_SH+=	makefs_zfs_tests
+.endif
 
 BINDIR=		${TESTSDIR}
 
 # XXX: PACKAGE support for SCRIPTS
 SCRIPTS+=	makefs_tests_common.sh
 SCRIPTSNAME_makefs_tests_common.sh=	makefs_tests_common.sh
 
 TEST_METADATA.makefs_cd9660_tests+=	required_files="/sbin/mount_cd9660"
 
 .for t in ${ATF_TESTS_SH}
 TEST_METADATA.$t+=	required_user="root"
 .endfor
 
 .include <bsd.test.mk>
diff --git a/usr.sbin/makefs/tests/makefs_zfs_tests.sh b/usr.sbin/makefs/tests/makefs_zfs_tests.sh
new file mode 100644
index 000000000000..8cd79966c49a
--- /dev/null
+++ b/usr.sbin/makefs/tests/makefs_zfs_tests.sh
@@ -0,0 +1,634 @@
+#-
+# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+#
+# Copyright (c) 2022 The FreeBSD Foundation
+#
+# This software was developed by Mark Johnston under sponsorship from
+# the FreeBSD Foundation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+MAKEFS="makefs -t zfs -o nowarn=true"
+ZFS_POOL_NAME="makefstest$$"
+TEST_ZFS_POOL_NAME="$TMPDIR/poolname"
+
+. "$(dirname "$0")/makefs_tests_common.sh"
+
+common_cleanup()
+{
+	local pool md
+
+	# Try to force a TXG, this can help catch bugs by triggering a panic.
+	sync
+
+	# Both state files are written by import_image; tolerate their absence.
+	pool=$(cat "$TEST_ZFS_POOL_NAME" 2>/dev/null)
+	if [ -n "$pool" ] && zpool list "$pool" >/dev/null 2>&1; then
+		zpool destroy "$pool"
+	fi
+	md=$(cat "$TEST_MD_DEVICE_FILE" 2>/dev/null)
+	if [ -n "$md" ] && [ -c /dev/"$md" ]; then
+		mdconfig -d -u "$md"
+	fi
+}
+
+import_image()
+{
+	atf_check -e empty -o save:$TEST_MD_DEVICE_FILE -s exit:0 \
+	    mdconfig -a -f $TEST_IMAGE
+	atf_check zpool import -R $TEST_MOUNT_DIR $ZFS_POOL_NAME
+	echo "$ZFS_POOL_NAME" > $TEST_ZFS_POOL_NAME
+}
+
+#
+# Test autoexpansion of the vdev.
+#
+# The pool is initially 10GB, so we get 10GB minus one metaslab's worth of
+# usable space for data.  Then the pool is expanded to 50GB, and the amount of
+# usable space is 50GB minus one metaslab.
+#
+atf_test_case autoexpand cleanup
+autoexpand_body()
+{
+	local mssize poolsize poolsize1 newpoolsize
+
+	create_test_inputs
+
+	mssize=$((128 * 1024 * 1024))
+	poolsize=$((10 * 1024 * 1024 * 1024))
+	atf_check $MAKEFS -s $poolsize -o mssize=$mssize -o rootpath=/ \
+	    -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	newpoolsize=$((50 * 1024 * 1024 * 1024))
+	atf_check truncate -s $newpoolsize $TEST_IMAGE
+
+	import_image
+
+	check_image_contents
+
+	poolsize1=$(zpool list -Hp -o size $ZFS_POOL_NAME)
+	atf_check [ $((poolsize1 + $mssize)) -eq $poolsize ]
+	# Expand the vdev into the space added to the image file above.
+	atf_check zpool online -e $ZFS_POOL_NAME /dev/$(cat $TEST_MD_DEVICE_FILE)
+
+	check_image_contents
+
+	poolsize1=$(zpool list -Hp -o size $ZFS_POOL_NAME)
+	atf_check [ $((poolsize1 + $mssize)) -eq $newpoolsize ]
+}
+autoexpand_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Test with some default layout defined by the common code.
+#
+atf_test_case basic cleanup
+basic_body()
+{
+	create_test_inputs
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+}
+basic_cleanup()
+{
+	common_cleanup
+}
+
+atf_test_case dataset_removal cleanup
+dataset_removal_body()
+{
+	create_test_dirs
+
+	cd $TEST_INPUTS_DIR
+	mkdir dir
+	cd -
+
+	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    -o fs=${ZFS_POOL_NAME}/dir \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+
+	atf_check zfs destroy ${ZFS_POOL_NAME}/dir
+}
+dataset_removal_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Make sure that we can create and remove an empty directory.
+#
+atf_test_case empty_dir cleanup
+empty_dir_body()
+{
+	create_test_dirs
+
+	cd $TEST_INPUTS_DIR
+	mkdir dir
+	cd -
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+
+	atf_check rmdir ${TEST_MOUNT_DIR}/dir
+}
+empty_dir_cleanup()
+{
+	common_cleanup
+}
+
+atf_test_case empty_fs cleanup
+empty_fs_body()
+{
+	create_test_dirs
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+}
+empty_fs_cleanup()
+{
+	common_cleanup
+}
+
+atf_test_case file_sizes cleanup
+file_sizes_body()
+{
+	local i
+
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	i=1
+	while [ $i -lt $((1 << 20)) ]; do
+		truncate -s $i ${i}.1
+		truncate -s $(($i - 1)) ${i}.2
+		truncate -s $(($i + 1)) ${i}.3
+		i=$(($i << 1))
+	done
+
+	cd -
+
+	# XXXMJ this creates sparse files, make sure makefs doesn't
+	#       preserve the sparseness.
+	# XXXMJ need to test with larger files (at least 128MB for L2 indirs)
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+}
+file_sizes_cleanup()
+{
+	common_cleanup
+}
+
+atf_test_case hard_links cleanup
+hard_links_body()
+{
+	local f
+
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir
+	echo "hello" > 1
+	ln 1 2
+	ln 1 dir/1
+
+	echo "goodbye" > dir/a
+	ln dir/a dir/b
+	ln dir/a a
+
+	cd -
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+
+	stat -f '%i' ${TEST_MOUNT_DIR}/1 > ./ino
+	stat -f '%l' ${TEST_MOUNT_DIR}/1 > ./nlink
+	for f in 1 2 dir/1; do
+		atf_check -o file:./nlink -e empty -s exit:0 \
+		    stat -f '%l' ${TEST_MOUNT_DIR}/${f}
+		atf_check -o file:./ino -e empty -s exit:0 \
+		    stat -f '%i' ${TEST_MOUNT_DIR}/${f}
+		atf_check cmp -s ${TEST_INPUTS_DIR}/1 ${TEST_MOUNT_DIR}/${f}
+	done
+
+	stat -f '%i' ${TEST_MOUNT_DIR}/dir/a > ./ino
+	stat -f '%l' ${TEST_MOUNT_DIR}/dir/a > ./nlink
+	for f in dir/a dir/b a; do
+		atf_check -o file:./nlink -e empty -s exit:0 \
+		    stat -f '%l' ${TEST_MOUNT_DIR}/${f}
+		atf_check -o file:./ino -e empty -s exit:0 \
+		    stat -f '%i' ${TEST_MOUNT_DIR}/${f}
+		atf_check cmp -s ${TEST_INPUTS_DIR}/dir/a ${TEST_MOUNT_DIR}/${f}
+	done
+}
+hard_links_cleanup()
+{
+	common_cleanup
+}
+
+# Allocate enough dnodes from an object set that the meta dnode needs to use
+# indirect blocks.
+atf_test_case indirect_dnode_array cleanup
+indirect_dnode_array_body()
+{
+	local count i
+
+	# How many dnodes do we need to allocate?  Well, the data block size
+	# for meta dnodes is always 16KB, so with a dnode size of 512B we get
+	# 32 dnodes per direct block.  The maximum indirect block size is 128KB
+	# and that can fit 1024 block pointers, so we need at least 32 * 1024
+	# files to force the use of two levels of indirection.
+	#
+	# Unfortunately that number of files makes the test run quite slowly,
+	# so we settle for a single indirect block for now...
+	count=$(jot -r 1 32 1024)
+
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+	for i in $(seq 1 $count); do
+		touch $i
+	done
+	cd -
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+}
+indirect_dnode_array_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Create some files with long names, so as to test fat ZAP handling.
+#
+atf_test_case long_file_name cleanup
+long_file_name_body()
+{
+	local dir i
+
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	# micro ZAP keys can be at most 50 bytes.
+	for i in $(seq 1 60); do
+		touch $(jot -s '' $i 1 1)
+	done
+	dir=$(jot -s '' 61 1 1)
+	mkdir $dir
+	for i in $(seq 1 60); do
+		touch ${dir}/$(jot -s '' $i 1 1)
+	done
+
+	cd -
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+
+	# Add a directory entry in the hope that OpenZFS might catch a bug
+	# in makefs' fat ZAP encoding.
+	touch ${TEST_MOUNT_DIR}/foo
+}
+long_file_name_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Exercise handling of multiple datasets.
+#
+atf_test_case multi_dataset_1 cleanup
+multi_dataset_1_body()
+{
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir1
+	echo a > dir1/a
+	mkdir dir2
+	echo b > dir2/b
+
+	cd -
+
+	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    -o fs=${ZFS_POOL_NAME}/dir1 -o fs=${ZFS_POOL_NAME}/dir2 \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+
+	# Make sure that we have three datasets with the expected mount points.
+	atf_check -o inline:${ZFS_POOL_NAME}\\n -e empty -s exit:0 \
+	    zfs list -H -o name ${ZFS_POOL_NAME}
+	atf_check -o inline:${TEST_MOUNT_DIR}\\n -e empty -s exit:0 \
+	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}
+
+	atf_check -o inline:${ZFS_POOL_NAME}/dir1\\n -e empty -s exit:0 \
+	    zfs list -H -o name ${ZFS_POOL_NAME}/dir1
+	atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \
+	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
+
+	atf_check -o inline:${ZFS_POOL_NAME}/dir2\\n -e empty -s exit:0 \
+	    zfs list -H -o name ${ZFS_POOL_NAME}/dir2
+	atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \
+	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2
+}
+multi_dataset_1_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Create a pool with two datasets, where the root dataset is mounted below
+# the child dataset.
+#
+atf_test_case multi_dataset_2 cleanup
+multi_dataset_2_body()
+{
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir1
+	echo a > dir1/a
+	mkdir dir2
+	echo b > dir2/b
+
+	cd -
+
+	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    -o fs=${ZFS_POOL_NAME}/dir1\;mountpoint=/ \
+	    -o fs=${ZFS_POOL_NAME}\;mountpoint=/dir1 \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+}
+multi_dataset_2_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Create a dataset with a non-existent mount point.
+#
+atf_test_case multi_dataset_3 cleanup
+multi_dataset_3_body()
+{
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir1
+	echo a > dir1/a
+
+	cd -
+
+	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    -o fs=${ZFS_POOL_NAME}/dir1 \
+	    -o fs=${ZFS_POOL_NAME}/dir2 \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \
+	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2
+
+	# Mounting dir2 should have created a directory called dir2.  Go
+	# back and create it in the staging tree before comparing.
+	atf_check mkdir ${TEST_INPUTS_DIR}/dir2
+
+	check_image_contents
+}
+multi_dataset_3_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Create an unmounted dataset.
+#
+atf_test_case multi_dataset_4 cleanup
+multi_dataset_4_body()
+{
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir1
+	echo a > dir1/a
+
+	cd -
+
+	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    -o fs=${ZFS_POOL_NAME}/dir1\;canmount=noauto\;mountpoint=none \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	atf_check -o inline:none\\n -e empty -s exit:0 \
+	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
+
+	check_image_contents
+
+	atf_check zfs set mountpoint=/dir1 ${ZFS_POOL_NAME}/dir1
+	atf_check zfs mount ${ZFS_POOL_NAME}/dir1
+	atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \
+	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
+
+	# dir1/a should be part of the root dataset, not dir1.
+	atf_check -s not-exit:0 -e not-empty stat ${TEST_MOUNT_DIR}/dir1/a
+}
+multi_dataset_4_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Rudimentary test to verify that two ZFS images created using the same
+# parameters and input hierarchy are byte-identical.  In particular, makefs(1)
+# does not preserve file access times.
+#
+atf_test_case reproducible cleanup
+reproducible_body()
+{
+	create_test_inputs
+
+	atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    ${TEST_IMAGE}.1 $TEST_INPUTS_DIR
+
+	atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    ${TEST_IMAGE}.2 $TEST_INPUTS_DIR
+
+	# XXX-MJ cmp(1) is really slow
+	atf_check cmp ${TEST_IMAGE}.1 ${TEST_IMAGE}.2
+}
+reproducible_cleanup()
+{
+}
+
+#
+# Verify that we can take a snapshot of a generated dataset.
+#
+atf_test_case snapshot cleanup
+snapshot_body()
+{
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir
+	echo "hello" > dir/hello
+	echo "goodbye" > goodbye
+
+	cd -
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	atf_check zfs snapshot ${ZFS_POOL_NAME}@1
+}
+snapshot_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Check handling of symbolic links.
+#
+atf_test_case soft_links cleanup
+soft_links_body()
+{
+	create_test_dirs
+	cd $TEST_INPUTS_DIR
+
+	mkdir dir
+	ln -s a a
+	ln -s dir/../a a2
+	ln -s dir/b b
+	echo 'c' > dir/c
+	ln -s dir/c c
+	# XXX-MJ overflows bonus buffer ln -s $(jot -s '' 320 1 1) 1
+
+	cd -
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+}
+soft_links_cleanup()
+{
+	common_cleanup
+}
+
+#
+# Verify that we can set properties on the root dataset.
+#
+atf_test_case root_props cleanup
+root_props_body()
+{
+	create_test_inputs
+
+	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
+	    -o fs=${ZFS_POOL_NAME}\;atime=off\;setuid=off \
+	    $TEST_IMAGE $TEST_INPUTS_DIR
+
+	import_image
+
+	check_image_contents
+
+	atf_check -o inline:off\\n -e empty -s exit:0 \
+	    zfs get -H -o value atime $ZFS_POOL_NAME
+	atf_check -o inline:local\\n -e empty -s exit:0 \
+	    zfs get -H -o source atime $ZFS_POOL_NAME
+	atf_check -o inline:off\\n -e empty -s exit:0 \
+	    zfs get -H -o value setuid $ZFS_POOL_NAME
+	atf_check -o inline:local\\n -e empty -s exit:0 \
+	    zfs get -H -o source setuid $ZFS_POOL_NAME
+}
+root_props_cleanup()
+{
+	common_cleanup
+}
+
+atf_init_test_cases()
+{
+	atf_add_test_case autoexpand
+	atf_add_test_case basic
+	atf_add_test_case dataset_removal
+	atf_add_test_case empty_dir
+	atf_add_test_case empty_fs
+	atf_add_test_case file_sizes
+	atf_add_test_case hard_links
+	atf_add_test_case indirect_dnode_array
+	atf_add_test_case long_file_name
+	atf_add_test_case multi_dataset_1
+	atf_add_test_case multi_dataset_2
+	atf_add_test_case multi_dataset_3
+	atf_add_test_case multi_dataset_4
+	atf_add_test_case reproducible
+	atf_add_test_case snapshot
+	atf_add_test_case soft_links
+	atf_add_test_case root_props
+
+	# XXXMJ tests:
+	# - test with different ashifts (at least, 9 and 12), different image sizes
+	# - create datasets in imported pool
+}
diff --git a/usr.sbin/makefs/zfs.c b/usr.sbin/makefs/zfs.c
new file mode 100644
index 000000000000..08689a558870
--- /dev/null
+++ b/usr.sbin/makefs/zfs.c
@@ -0,0 +1,758 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <util.h>
+
+#include "makefs.h"
+#include "zfs.h"
+
+#define	VDEV_LABEL_SPACE	\
+	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
+_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
+
+#define	MINMSSIZE		((off_t)1 << 24) /* 16MB, smallest metaslab */
+#define	DFLTMSSIZE		((off_t)1 << 29) /* 512MB, default upper bound */
+#define	MAXMSSIZE		((off_t)1 << 34) /* 16GB, largest "-o mssize" */
+
+#define	INDIR_LEVELS		6	/* size of a cursor's inddir[] array */
+/* Indirect blocks are always 128KB (MAXBLOCKSIZE). */
+#define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))
+
+struct dnode_cursor {
+	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE]; /* per-level bufs */
+	off_t		indloc;		/* where the next indirect block goes */
+	off_t		indspace;	/* preallocated indirect space left */
+	dnode_phys_t	*dnode;		/* dnode whose block tree is built */
+	off_t		dataoff;	/* last offset seen by _next() */
+	off_t		datablksz;	/* data block size, in bytes */
+};
+
+void
+zfs_prep_opts(fsinfo_t *fsopts)
+{
+	zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs));
+	/* Each entry points at the zfs_opt_t field that receives its value. */
+	const option_t zfs_options[] = {
+		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
+		  0, 0, "Bootable dataset" },
+		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
+		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
+		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
+		  0, 0, "ZFS pool name" },
+		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
+		  0, 0, "Prefix for all dataset mount points" },
+		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
+		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
+		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
+		  0, 0, "Suppress warning about experimental ZFS support" },
+		{ .name = NULL }
+	};
+	/* "-o fs=..." descriptions are queued here by zfs_parse_opts(). */
+	STAILQ_INIT(&zfs->datasetdescs);
+	/* Both allocations are released by zfs_cleanup_opts(). */
+	fsopts->fs_specific = zfs;
+	fsopts->fs_options = copy_opts(zfs_options);
+}
+
+int
+zfs_parse_opts(const char *option, fsinfo_t *fsopts)
+{
+	zfs_opt_t *zfs = fsopts->fs_specific;
+	struct dataset_desc *desc;
+	char scratch[BUFSIZ], *copy, *key, *value;
+
+	/* Split a private copy of "key[=value]" at the first '='. */
+	value = copy = estrdup(option);
+	key = strsep(&value, "=");
+	if (strcmp(key, "fs") != 0) {
+		/* Not a dataset description: use the generic option table. */
+		free(copy);
+		if (set_option(fsopts->fs_options, option, scratch,
+		    sizeof(scratch)) == -1)
+			return (0);
+		return (1);
+	}
+	if (value == NULL)
+		errx(1, "invalid filesystem parameters `%s'", option);
+
+	/*
+	 * Only record the dataset description here; it is parsed later, in
+	 * dsl_init().
+	 */
+	desc = ecalloc(1, sizeof(*desc));
+	desc->params = estrdup(value);
+	free(copy);
+	STAILQ_INSERT_TAIL(&zfs->datasetdescs, desc, next);
+	return (1);
+}
+
+static void
+zfs_size_vdev(fsinfo_t *fsopts)
+{
+	zfs_opt_t *zfs;
+	off_t asize, mssize, vdevsize, vdevsize1;
+
+	zfs = fsopts->fs_specific;
+	/* zfs_check_opts() establishes both of these before calling us. */
+	assert(fsopts->maxsize != 0);
+	assert(zfs->ashift != 0);
+
+	/*
+	 * Start from the largest ashift-aligned size not exceeding maxsize.
+	 */
+	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
+	if (vdevsize < MINDEVSIZE)
+		errx(1, "maximum image size is too small");
+	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
+		errx(1, "image size bounds must be multiples of %d",
+		    1 << zfs->ashift);
+	}
+	asize = vdevsize - VDEV_LABEL_SPACE;	/* space left after the labels */
+
+	/*
+	 * Size metaslabs according to the following heuristic:
+	 * - provide at least 8 metaslabs,
+	 * - without using a metaslab size larger than 512MB.
+	 * This approximates what OpenZFS does without being complicated.  In
+	 * practice we expect pools to be expanded upon first use, and OpenZFS
+	 * does not resize metaslabs in that case, so there is no right answer
+	 * here.  In general we want to provide large metaslabs even if the
+	 * image size is small, and 512MB is a reasonable size for pools up to
+	 * several hundred gigabytes.
+	 *
+	 * The user may override this heuristic using the "-o mssize" option.
+	 */
+	mssize = zfs->mssize;
+	if (mssize == 0) {
+		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
+		if (!powerof2(mssize))
+			mssize = 1l << (flsll(mssize) - 1);
+	}
+	if (!powerof2(mssize))
+		errx(1, "metaslab size must be a power of 2");
+
+	/*
+	 * If we have some slop left over, try to cover it by resizing the vdev,
+	 * subject to the maxsize and minsize parameters.
+	 */
+	if (asize % mssize != 0) {
+		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
+		if (vdevsize1 < fsopts->minsize)
+			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
+		if (vdevsize1 <= fsopts->maxsize)
+			vdevsize = vdevsize1;
+	}
+	asize = vdevsize - VDEV_LABEL_SPACE;
+	/* Record the final geometry; msshift is log2(mssize). */
+	zfs->asize = asize;
+	zfs->vdevsize = vdevsize;
+	zfs->mssize = mssize;
+	zfs->msshift = flsll(mssize) - 1;
+	zfs->mscount = asize / mssize;
+}
+
+/*
+ * Validate options, fill in defaults (rootpath, ashift) and size the vdev.
+ */
+static void
+zfs_check_opts(fsinfo_t *fsopts)
+{
+	zfs_opt_t *zfs;
+
+	zfs = fsopts->fs_specific;
+
+	if (fsopts->offset != 0)
+		errx(1, "unhandled offset option");
+	if (fsopts->maxsize == 0)
+		errx(1, "an image size must be specified");
+
+	if (zfs->poolname == NULL)
+		errx(1, "a pool name must be specified");
+	/* The root path defaults to "/<poolname>". */
+	if (zfs->rootpath == NULL)
+		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
+	if (zfs->rootpath[0] != '/')
+		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);
+	/* Default to an ashift of 12, i.e., 4KB blocks. */
+	if (zfs->ashift == 0)
+		zfs->ashift = 12;
+
+	zfs_size_vdev(fsopts);
+}
+
+void
+zfs_cleanup_opts(fsinfo_t *fsopts)
+{
+	zfs_opt_t *zfs = fsopts->fs_specific;
+	struct dataset_desc *desc, *nextdesc;
+
+	/* Drop queued dataset descriptions, option strings, then the tables. */
+	STAILQ_FOREACH_SAFE(desc, &zfs->datasetdescs, next, nextdesc) {
+		free(desc->params);
+		free(desc);
+	}
+	free(__DECONST(void *, zfs->poolname));
+	free(zfs->bootfs);
+	free(zfs->rootpath);
+	free(zfs);
+	free(fsopts->fs_options);
+}
+
+static size_t
+nvlist_size(const nvlist_t *nvl)
+{
+	return (nvl->nv_size + sizeof(nvl->nv_header));	/* data + header */
+}
+
+static void
+nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
+{
+	assert(sz >= nvlist_size(nvl));
+	/* The header goes first, immediately followed by the nvlist data. */
+	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
+	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
+}
+
+static nvlist_t *
+pool_config_nvcreate(zfs_opt_t *zfs)
+{
+	nvlist_t *config, *features;
+
+	/* Pool identity; used by both the labels and the MOS config object. */
+	config = nvlist_create(NV_UNIQUE_NAME);
+	nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, TXG);
+	nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, SPA_VERSION);
+	nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
+	nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
+	nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
+	nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
+	nvlist_add_uint64(config, ZPOOL_CONFIG_GUID, zfs->vdevguid);
+	nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 1);
+	/* The features-for-read list is attached empty. */
+	features = nvlist_create(NV_UNIQUE_NAME);
+	nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, features);
+	nvlist_destroy(features);
+	return (config);
+}
+
+static nvlist_t *
+pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
+{
+	nvlist_t *disk;
+
+	/* The metaslab array object must have been allocated by pool_init(). */
+	assert(zfs->objarrid != 0);
+
+	/* Describe the image as a single whole-disk leaf vdev with ID 0. */
+	disk = nvlist_create(NV_UNIQUE_NAME);
+	nvlist_add_string(disk, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
+	nvlist_add_uint64(disk, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
+	nvlist_add_uint64(disk, ZPOOL_CONFIG_ASIZE, zfs->asize);
+	nvlist_add_uint64(disk, ZPOOL_CONFIG_GUID, zfs->vdevguid);
+	nvlist_add_uint64(disk, ZPOOL_CONFIG_ID, 0);
+	nvlist_add_string(disk, ZPOOL_CONFIG_PATH, "/dev/null");
+	nvlist_add_uint64(disk, ZPOOL_CONFIG_WHOLE_DISK, 1);
+	nvlist_add_uint64(disk, ZPOOL_CONFIG_CREATE_TXG, TXG);
+	nvlist_add_uint64(disk, ZPOOL_CONFIG_METASLAB_ARRAY, zfs->objarrid);
+	nvlist_add_uint64(disk, ZPOOL_CONFIG_METASLAB_SHIFT, zfs->msshift);
+
+	return (disk);
+}
+
+static nvlist_t *
+pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
+{
+	nvlist_t *disk, *root;
+
+	disk = pool_disk_vdev_config_nvcreate(zfs);
+	root = nvlist_create(NV_UNIQUE_NAME);
+
+	/* The root meta-vdev carries the pool GUID and has a single child. */
+	nvlist_add_uint64(root, ZPOOL_CONFIG_ID, 0);
+	nvlist_add_uint64(root, ZPOOL_CONFIG_GUID, zfs->poolguid);
+	nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
+	nvlist_add_uint64(root, ZPOOL_CONFIG_CREATE_TXG, TXG);
+	nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN, &disk, 1);
+	nvlist_destroy(disk);
+
+	return (root);
+}
+
+/*
+ * Create the pool's "config" object, which contains an nvlist describing pool
+ * parameters and the vdev topology.  It is similar but not identical to the
+ * nvlist stored in vdev labels.  The main difference is that vdev labels do not
+ * describe the full vdev tree and in particular do not contain the "root"
+ * meta-vdev.
+ */
+static void
+pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+	dnode_phys_t *dnode;
+	nvlist_t *poolconfig, *vdevconfig;
+	void *configbuf;
+	uint64_t dnid;
+	off_t configloc, configblksz;
+	int error;
+
+	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
+	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);
+	/* Build the pool config with the root vdev tree attached. */
+	poolconfig = pool_config_nvcreate(zfs);
+
+	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
+	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
+	nvlist_destroy(vdevconfig);
+
+	error = nvlist_export(poolconfig);
+	if (error != 0)
+		errc(1, error, "nvlist_export");
+	/* Copy the nvlist into a zeroed buffer of the allocated block size. */
+	configblksz = nvlist_size(poolconfig);
+	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
+	configbuf = ecalloc(1, configblksz);
+	nvlist_copy(poolconfig, configbuf, configblksz);
+
+	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);
+	/* The bonus buffer holds the nvlist's own size, not the block size. */
+	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
+	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);
+
+	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);
+
+	nvlist_destroy(poolconfig);
+	free(configbuf);
+}
+
+/*
+ * Add the block pointer list objects, which are used for deferred frees.  We
+ * don't do anything with them, but they need to be present or OpenZFS will
+ * refuse to import the pool.  Both are registered in the object directory.
+ */
+static void
+pool_init_objdir_bplists(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+	uint64_t dnid;
+
+	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
+	    BPOBJ_SIZE_V2, &dnid);
+	zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);
+
+	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
+	    BPOBJ_SIZE_V2, &dnid);
+	zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
+}
+
+/*
+ * Add the required feature metadata objects.  We don't know anything about
+ * ZFS features, so each object is simply an empty ZAP.
+ */
+static void
+pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+	const char *const names[] = {
+		DMU_POOL_FEATURES_FOR_READ,
+		DMU_POOL_FEATURES_FOR_WRITE,
+		DMU_POOL_FEATURE_DESCRIPTIONS,
+	};
+	dnode_phys_t *dn;
+	uint64_t id;
+
+	/* Register each map in the object directory and write it out empty. */
+	for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+		dn = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &id);
+		zap_add_uint64(objdir, names[i], id);
+		zap_write(zfs, zap_alloc(zfs->mos, dn));
+	}
+}
+
+static void
+pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
+	    dsl_dir_id(zfs->rootdsldir));	/* root DSL directory's dnode ID */
+}
+
+static void
+pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
+{
+	dnode_phys_t *propsdnode;
+	uint64_t propsid;
+
+	/* The ZAP is kept in zfs->poolprops; pool_fini() writes it out. */
+	propsdnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &propsid);
+	zap_add_uint64(objdir, DMU_POOL_PROPS, propsid);
+	zfs->poolprops = zap_alloc(zfs->mos, propsdnode);
+}
+
+/*
+ * Populate and write out the MOS object directory, the root of virtually all
+ * of the pool's data and metadata.
+ */
+static void
+pool_init_objdir(zfs_opt_t *zfs)
+{
+	dnode_phys_t *dirdnode;
+	zfs_zap_t *dirzap;
+
+	/* pool_init() has already allocated the directory's dnode. */
+	dirdnode = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);
+	dirzap = zap_alloc(zfs->mos, dirdnode);
+
+	pool_init_objdir_config(zfs, dirzap);
+	pool_init_objdir_bplists(zfs, dirzap);
+	pool_init_objdir_feature_maps(zfs, dirzap);
+	pool_init_objdir_dsl(zfs, dirzap);
+	pool_init_objdir_poolprops(zfs, dirzap);
+	zap_write(zfs, dirzap);
+}
+
+/*
+ * Initialize the meta-object set (MOS) and immediately write out several
+ * special objects whose contents are already finalized, including the object
+ * directory.
+ *
+ * Once the MOS is finalized, it'll look roughly like this:
+ *
+ *	object directory (ZAP)
+ *	|-> vdev config object (nvlist)
+ *	|-> features for read
+ *	|-> features for write
+ *	|-> feature descriptions
+ *	|-> sync bplist
+ *	|-> free bplist
+ *	|-> pool properties
+ *	L-> root DSL directory
+ *	    |-> DSL child directory (ZAP)
+ *	    |   |-> $MOS (DSL dir)
+ *	    |   |   |-> child map
+ *	    |   |   L-> props (ZAP)
+ *	    |   |-> $FREE (DSL dir)
+ *	    |   |   |-> child map
+ *	    |   |   L-> props (ZAP)
+ *	    |   |-> $ORIGIN (DSL dir)
+ *	    |   |   |-> child map
+ *	    |   |   |-> dataset
+ *	    |   |   |   L-> deadlist
+ *	    |   |   |-> snapshot
+ *	    |   |   |   |-> deadlist
+ *	    |   |   |   L-> snapshot names
+ *	    |   |   |-> props (ZAP)
+ *	    |   |   L-> clones (ZAP)
+ *	    |   |-> dataset 1 (DSL dir)
+ *	    |   |   |-> DSL dataset
+ *	    |   |   |   |-> snapshot names
+ *	    |   |   |   L-> deadlist
+ *	    |   |   |-> child map
+ *	    |   |   |   L-> ...
+ *	    |   |   L-> props
+ *	    |   |-> dataset 2
+ *	    |   |   L-> ...
+ *	    |   |-> ...
+ *	    |   L-> dataset n
+ *	    |-> DSL root dataset
+ *	    |   |-> snapshot names
+ *	    |   L-> deadlist
+ *	    L-> props (ZAP)
+ *	space map object array
+ *	|-> space map 1
+ *	|-> space map 2
+ *	|-> ...
+ *	L-> space map n (zfs->mscount)
+ *
+ * The space map object array is pointed to by the "msarray" property in the
+ * pool configuration.
+ */
+static void
+pool_init(zfs_opt_t *zfs)
+{
+	uint64_t dnid;
+
+	zfs->poolguid = ((uint64_t)random() << 32) | random();
+	zfs->vdevguid = ((uint64_t)random() << 32) | random();
+	/* The MOS comes first; the allocations below are all made from it. */
+	zfs->mos = objset_alloc(zfs, DMU_OST_META);
+	/* The object directory must receive the well-known object ID. */
+	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
+	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);
+	/* Space map object array; the disk vdev config refers to its ID. */
+	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);
+
+	dsl_init(zfs);
+
+	pool_init_objdir(zfs);
+}
+
+static void
+pool_labels_write(zfs_opt_t *zfs)
+{
+	uberblock_t *ub;
+	vdev_label_t *label;
+	nvlist_t *poolconfig, *vdevconfig;
+	int error;
+
+	label = ecalloc(1, sizeof(*label));
+
+	/*
+	 * Labels describe only the disk vdev; they omit the root meta-vdev.
+	 */
+	poolconfig = pool_config_nvcreate(zfs);
+	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
+	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
+	nvlist_destroy(vdevconfig);
+
+	error = nvlist_export(poolconfig);
+	if (error != 0)
+		errc(1, error, "nvlist_export");
+	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
+	    sizeof(label->vl_vdev_phys.vp_nvlist));
+	nvlist_destroy(poolconfig);
+
+	/*
+	 * Fill every uberblock slot, one per (1 << ashift) bytes, with the same
+	 * contents.  The embedded checksum is calculated in vdev_label_write().
+	 */
+	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
+	    uoff += (1 << zfs->ashift)) {
+		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
+		ub->ub_magic = UBERBLOCK_MAGIC;
+		ub->ub_version = SPA_VERSION;
+		ub->ub_txg = TXG;
+		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
+		ub->ub_timestamp = 0;
+
+		ub->ub_software_version = SPA_VERSION;
+		ub->ub_mmp_magic = MMP_MAGIC;
+		ub->ub_mmp_delay = 0;
+		ub->ub_mmp_config = 0;
+		ub->ub_checkpoint_txg = 0;
+		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
+	}
+
+	/*
+	 * Write out four copies of the label: two at the beginning of the vdev
+	 * and two at the end.
+	 */
+	for (int i = 0; i < VDEV_LABELS; i++)
+		vdev_label_write(zfs, i, label);
+
+	free(label);
+}
+
+static void
+pool_fini(zfs_opt_t *zfs)
+{
+	zap_write(zfs, zfs->poolprops);	/* allocated during pool_init() */
+	dsl_write(zfs);
+	objset_write(zfs, zfs->mos);
+	pool_labels_write(zfs);	/* labels embed the MOS root blkptr */
+}
+
+/* Set up a cursor for writing "size" bytes of data through "dnode". */
+struct dnode_cursor *
+dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
+    off_t size, off_t blksz)
+{
+	struct dnode_cursor *c;
+	uint64_t nbppindir, indlevel, ndatablks, nindblks;
+
+	assert(dnode->dn_nblkptr == 1);
+	assert(blksz <= MAXBLOCKSIZE);
+
+	if (blksz == 0) {
+		/* Between 1<<ashift and 128KB; 1ull keeps the shift 64-bit. */
+		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
+		    powerof2(size) ? size : (1ull << flsll(size))));
+	}
+	assert(powerof2(blksz));
+
+	/*
+	 * Count the levels needed (indlevel == 1 means no indirect blocks) and
+	 * the indirect blocks in each: one per nbppindir data blocks, rounded
+	 * up.  Their space is allocated up-front to break the dependency cycle
+	 * described in objset_write().
+	 */
+	ndatablks = size == 0 ? 0 : howmany(size, blksz);
+	nindblks = 0;
+	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
+		nbppindir *= BLKPTR_PER_INDIR;
+		nindblks += howmany(ndatablks, nbppindir);
+	}
+	assert(indlevel < INDIR_LEVELS);
+
+	dnode->dn_nlevels = (uint8_t)indlevel;
+	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
+	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
+
+	c = ecalloc(1, sizeof(*c));
+	if (nindblks > 0) {
+		c->indspace = nindblks * MAXBLOCKSIZE;
+		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
+	}
+	c->dnode = dnode;
+	c->dataoff = 0;
+	c->datablksz = blksz;
+	return (c);
+}
+
+static void
+_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
+{
+	blkptr_t *bp, *pbp;
+	void *buf;
+	uint64_t fill;
+	off_t blkid, blksz, loc;
+	/* Write out the lowest "levels" levels of buffered indirect blocks. */
+	assert(levels > 0);
+	assert(levels <= c->dnode->dn_nlevels - 1);
+	/* blkid starts as the index of the level-1 block covering dataoff. */
+	blksz = MAXBLOCKSIZE;
+	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
+	for (int level = 1; level <= levels; level++) {
+		buf = c->inddir[level - 1];
+		/* The topmost indirect block hangs off the dnode itself. */
+		if (level == c->dnode->dn_nlevels - 1) {
+			pbp = &c->dnode->dn_blkptr[0];
+		} else {
+			uint64_t iblkid;
+
+			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
+			pbp = (blkptr_t *)
+			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
+		}
+
+		/*
+		 * Space for indirect blocks is allocated up-front; see the
+		 * comment in objset_write().
+		 */
+		loc = c->indloc;
+		c->indloc += blksz;
+		assert(c->indspace >= blksz);
+		c->indspace -= blksz;
+		/* Sum the fill counts of all block pointers in this block. */
+		bp = buf;
+		fill = 0;
+		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
+			fill += BP_GET_FILL(&bp[i]);
+
+		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
+		    loc, pbp);
+		memset(buf, 0, MAXBLOCKSIZE);
+		/* Move up: blkid becomes this block's index one level higher. */
+		blkid /= BLKPTR_PER_INDIR;
+	}
+}
+
+blkptr_t *
+dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
+{
+	off_t blkid, l1id;
+	int levels;
+	/* Return the block pointer to fill in for the data block at "off". */
+	if (c->dnode->dn_nlevels == 1) {	/* no indirect blocks */
+		assert(off < MAXBLOCKSIZE);
+		return (&c->dnode->dn_blkptr[0]);
+	}
+
+	assert(off % c->datablksz == 0);
+
+	/* Do we need to flush any full indirect blocks? */
+	if (off > 0) {
+		blkid = off / c->datablksz;
+		for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
+			if (blkid % BLKPTR_PER_INDIR != 0)
+				break;
+			blkid /= BLKPTR_PER_INDIR;
+		}
+		if (levels > 0)
+			_dnode_cursor_flush(zfs, c, levels);
+	}
+	/* Hand out the matching slot in the buffered level-1 indirect block. */
+	c->dataoff = off;
+	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
+	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
+}
+
+void
+dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
+{
+	const int nindlevels = c->dnode->dn_nlevels - 1;
+
+	/* Flush whatever indirect blocks are still buffered, then clean up. */
+	if (nindlevels > 0)
+		_dnode_cursor_flush(zfs, c, nindlevels);
+	assert(c->indspace == 0);
+	free(c);
+}
+
+void
+zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
+{
+	zfs_opt_t *zfs;
+	int dirfd;
+
+	zfs = fsopts->fs_specific;
+
+	/*
+	 * Use a fixed seed to provide reproducible pseudo-random numbers for
+	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
+	 */
+	srandom(1729);
+	/* Validates the options and sizes the vdev; exits on any error. */
+	zfs_check_opts(fsopts);
+
+	if (!zfs->nowarn) {
+		fprintf(stderr,
+		    "ZFS support is currently considered experimental. "
+		    "Do not use it for anything critical.\n");
+	}
+	/* Descriptor for the input directory, handed to fs_build(). */
+	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
+	if (dirfd < 0)
+		err(1, "open(%s)", dir);
+	/* Set up the image and pool, add the files, then finalize both. */
+	vdev_init(zfs, image);
+	pool_init(zfs);
+	fs_build(zfs, dirfd, root);
+	pool_fini(zfs);
+	vdev_fini(zfs);
+}
diff --git a/usr.sbin/makefs/zfs/Makefile.inc b/usr.sbin/makefs/zfs/Makefile.inc
new file mode 100644
index 000000000000..bebe8c322035
--- /dev/null
+++ b/usr.sbin/makefs/zfs/Makefile.inc
@@ -0,0 +1,12 @@
+.PATH:	${SRCDIR}/zfs
+.PATH:	${SRCTOP}/stand/libsa/zfs
+
+SRCS+=	dsl.c \
+	fs.c \
+	objset.c \
+	vdev.c \
+	zap.c
+# Presumably located via the stand/libsa/zfs .PATH entry above.
+SRCS+=	nvlist.c
+
+CFLAGS.nvlist.c+= -I${SRCTOP}/stand/libsa -Wno-cast-qual
diff --git a/usr.sbin/makefs/zfs/dsl.c b/usr.sbin/makefs/zfs/dsl.c
new file mode 100644
index 000000000000..5f473e557c02
--- /dev/null
+++ b/usr.sbin/makefs/zfs/dsl.c
@@ -0,0 +1,598 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include <util.h>
+
+#include "makefs.h"
+#include "zfs.h"
+
+typedef struct zfs_dsl_dataset {
+	zfs_objset_t	*os;		/* referenced objset, may be null */
+	dsl_dataset_phys_t *phys;	/* on-disk representation */
+	uint64_t	dsid;		/* dnode ID of the DSL dataset */
+
+	struct zfs_dsl_dir *dir;	/* DSL directory containing us */
+} zfs_dsl_dataset_t;
+
+typedef STAILQ_HEAD(zfs_dsl_dir_list, zfs_dsl_dir) zfs_dsl_dir_list_t;
+
+typedef struct zfs_dsl_dir {
+	char		*fullname;	/* full dataset name */
+	char		*name;		/* basename(fullname) */
+	dsl_dir_phys_t	*phys;		/* on-disk representation */
+	nvlist_t	*propsnv;	/* properties saved in propszap */
+
+	zfs_dsl_dataset_t *headds;	/* principal dataset, may be null */
+
+	uint64_t	dirid;		/* DSL directory dnode */
+	zfs_zap_t	*propszap;	/* dataset properties */
+	zfs_zap_t	*childzap;	/* child directories */
+
+	/* DSL directory tree linkage. */
+	struct zfs_dsl_dir *parent;	/* NULL for the root directory */
+	zfs_dsl_dir_list_t children;	/* child directories */
+	STAILQ_ENTRY(zfs_dsl_dir) next;	/* entry in parent's children list */
+} zfs_dsl_dir_t;
+
+static zfs_dsl_dir_t *dsl_dir_alloc(zfs_opt_t *zfs, const char *name);
+static zfs_dsl_dataset_t *dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir);
+
+static int
+nvlist_find_string(nvlist_t *nvl, const char *key, char **retp)
+{
+	char *val;
+	int error, vallen;
+
+	error = nvlist_find(nvl, key, DATA_TYPE_STRING, NULL, &val, &vallen);
+	if (error != 0)
+		return (error);
+	/* Hand back a freshly allocated, NUL-terminated copy of the value. */
+	*retp = memcpy(ecalloc(1, vallen + 1), val, vallen);
+	return (0);
+}
+
+static int
+nvlist_find_uint64(nvlist_t *nvl, const char *key, uint64_t *retp)
+{
+	return (nvlist_find(nvl, key, DATA_TYPE_UINT64, NULL, retp, NULL));
+}
+
+/*
+ * Return an allocated string containing the head dataset's mountpoint,
+ * including the root path prefix, or NULL if the mountpoint is "none".
+ *
+ * If the dataset has a mountpoint property, it is returned.  Otherwise we have
+ * to follow ZFS' inheritance rules.
+ */
+char *
+dsl_dir_get_mountpoint(zfs_opt_t *zfs, zfs_dsl_dir_t *dir)
+{
+	zfs_dsl_dir_t *pdir;
+	char *mountpoint, *origmountpoint, *parentmp;
+
+	if (nvlist_find_string(dir->propsnv, "mountpoint", &mountpoint) == 0) {
+		/*
+		 * nvlist_find_string() returns a copy that we now own.
+		 */
+		if (strcmp(mountpoint, "none") == 0) {
+			free(mountpoint);
+			return (NULL);
+		}
+	} else {
+		/*
+		 * If we don't have a mountpoint, it's inherited from one of our
+		 * ancestors.  Walk up the hierarchy until we find it, building
+		 * up our mountpoint along the way.  The mountpoint property is
+		 * always set for the root dataset.
+		 */
+		for (pdir = dir->parent, mountpoint = estrdup(dir->name);;) {
+			origmountpoint = mountpoint;
+
+			if (nvlist_find_string(pdir->propsnv, "mountpoint",
+			    &parentmp) == 0) {
+				easprintf(&mountpoint, "%s%s%s", parentmp,
+				    parentmp[strlen(parentmp) - 1] == '/' ?
+				    "" : "/", origmountpoint);
+				free(parentmp);
+				free(origmountpoint);
+				break;
+			}
+			easprintf(&mountpoint, "%s/%s", pdir->name,
+			    origmountpoint);
+			free(origmountpoint);
+			pdir = pdir->parent;
+		}
+	}
+	assert(mountpoint[0] == '/');
+	assert(strstr(mountpoint, zfs->rootpath) == mountpoint);
+
+	return (mountpoint);
+}
+
+int
+dsl_dir_get_canmount(zfs_dsl_dir_t *dir, uint64_t *canmountp)
+{
+	nvlist_t *props = dir->propsnv;	/* properties set on dir itself */
+	return (nvlist_find_uint64(props, "canmount", canmountp));
+}
+
+/*
+ * Handle dataset properties that we know about (mountpoint, atime, exec,
+ * setuid, canmount); stash them into an nvlist to be written later to the
+ * properties ZAP object.  Unknown, repeated or invalid settings are fatal.
+ * If the set of properties we handle grows too much, we should probably explore
+ * using libzfs to manage them.
+ */
+static void
+dsl_dir_set_prop(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, const char *key,
+    const char *val)
+{
+	nvlist_t *nvl;
+
+	nvl = dir->propsnv;
+	if (val == NULL || val[0] == '\0')
+		errx(1, "missing value for property `%s'", key);
+	if (nvpair_find(nvl, key) != NULL)
+		errx(1, "property `%s' already set", key);
+	/* mountpoint is stored as a string, everything else as a uint64. */
+	if (strcmp(key, "mountpoint") == 0) {
+		if (strcmp(val, "none") != 0) {
+			if (val[0] != '/')
+				errx(1, "mountpoint `%s' is not absolute", val);
+			if (strcmp(val, zfs->rootpath) != 0 &&
+			    strcmp(zfs->rootpath, "/") != 0 &&
+			    (strstr(val, zfs->rootpath) != val ||
+			     val[strlen(zfs->rootpath)] != '/')) {
+				errx(1, "mountpoint `%s' is not prefixed by "
+				    "the root path `%s'", val, zfs->rootpath);
+			}
+		}
+		nvlist_add_string(nvl, key, val);
+	} else if (strcmp(key, "atime") == 0 || strcmp(key, "exec") == 0 ||
+	    strcmp(key, "setuid") == 0) {
+		if (strcmp(val, "on") == 0)
+			nvlist_add_uint64(nvl, key, 1);
+		else if (strcmp(val, "off") == 0)
+			nvlist_add_uint64(nvl, key, 0);
+		else
+			errx(1, "invalid value `%s' for %s", val, key);
+	} else if (strcmp(key, "canmount") == 0) {
+		if (strcmp(val, "noauto") == 0)
+			nvlist_add_uint64(nvl, key, 2);
+		else if (strcmp(val, "on") == 0)
+			nvlist_add_uint64(nvl, key, 1);
+		else if (strcmp(val, "off") == 0)
+			nvlist_add_uint64(nvl, key, 0);
+		else
+			errx(1, "invalid value `%s' for %s", val, key);
+	} else {
+		errx(1, "unknown property `%s'", key);
+	}
+}
+
+static zfs_dsl_dir_t *
+dsl_metadir_alloc(zfs_opt_t *zfs, const char *name)
+{
+	zfs_dsl_dir_t *metadir;
+	char *fullname;
+
+	easprintf(&fullname, "%s/%s", zfs->poolname, name);
+	metadir = dsl_dir_alloc(zfs, fullname);	/* child of the root dir */
+	free(fullname);
+	return (metadir);
+}
+
+static void
+dsl_origindir_init(zfs_opt_t *zfs)
+{
+	dnode_phys_t *dn;
+	uint64_t zapid;
+
+	zfs->origindsldir = dsl_metadir_alloc(zfs, "$ORIGIN");
+	zfs->originds = dsl_dataset_alloc(zfs, zfs->origindsldir);
+	zfs->snapds = dsl_dataset_alloc(zfs, zfs->origindsldir);
+	/* $ORIGIN additionally gets a clones ZAP, linked via dd_clones. */
+	dn = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_CLONES, &zapid);
+	zfs->cloneszap = zap_alloc(zfs->mos, dn);
+	zfs->origindsldir->phys->dd_clones = zapid;
+}
+
+void
+dsl_init(zfs_opt_t *zfs)
+{
+	zfs_dsl_dir_t *dir;
+	struct dataset_desc *d;
+	const char *dspropdelim;
+
+	dspropdelim = ";";
+	/* Create the root directory, with compression explicitly turned off. */
+	zfs->rootdsldir = dsl_dir_alloc(zfs, NULL);
+
+	nvlist_add_uint64(zfs->rootdsldir->propsnv, "compression",
+	    ZIO_COMPRESS_OFF);
+
+	zfs->rootds = dsl_dataset_alloc(zfs, zfs->rootdsldir);
+	zfs->rootdsldir->headds = zfs->rootds;
+	/* The metadata directories are named "<poolname>/$...". */
+	zfs->mosdsldir = dsl_metadir_alloc(zfs, "$MOS");
+	zfs->freedsldir = dsl_metadir_alloc(zfs, "$FREE");
+	dsl_origindir_init(zfs);
+
+	/*
+	 * Go through the list of user-specified datasets and create DSL objects
+	 * for them.
+	 */
+	STAILQ_FOREACH(d, &zfs->datasetdescs, next) {
+		char *dsname, *next, *params, *param, *nextparam;
+		/* Each description is "<dataset>[;<prop>=<value>]...". */
+		params = d->params;
+		dsname = strsep(&params, dspropdelim);
+
+		if (strcmp(dsname, zfs->poolname) == 0) {
+			/*
+			 * This is the root dataset; it's already created, so
+			 * we're just setting options.
+			 */
+			dir = zfs->rootdsldir;
+		} else {
+			/*
+			 * This dataset must be a child of the root dataset.
+			 */
+			if (strstr(dsname, zfs->poolname) != dsname ||
+			    (next = strchr(dsname, '/')) == NULL ||
+			    (size_t)(next - dsname) != strlen(zfs->poolname)) {
+				errx(1, "dataset `%s' must be a child of `%s'",
+				    dsname, zfs->poolname);
+			}
+			dir = dsl_dir_alloc(zfs, dsname);
+			dir->headds = dsl_dataset_alloc(zfs, dir);
+		}
+		/* Apply each "key=value" property to the directory. */
+		for (nextparam = param = params; nextparam != NULL;) {
+			char *key, *val;
+
+			param = strsep(&nextparam, dspropdelim);
+
+			key = val = param;
+			key = strsep(&val, "=");
+			dsl_dir_set_prop(zfs, dir, key, val);
+		}
+	}
+
+	/*
+	 * Set the root dataset's mount point if the user didn't override the
+	 * default.
+	 */
+	if (nvpair_find(zfs->rootdsldir->propsnv, "mountpoint") == NULL) {
+		nvlist_add_string(zfs->rootdsldir->propsnv, "mountpoint",
+		    zfs->rootpath);
+	}
+}
+
+uint64_t
+dsl_dir_id(zfs_dsl_dir_t *dir)
+{
+	return (dir->dirid);	/* dnode ID of the DSL directory object */
+}
+
+uint64_t
+dsl_dir_dataset_id(zfs_dsl_dir_t *dir)
+{
+	return (dir->headds->dsid);	/* dir must have a head dataset */
+}
+
+static void
+dsl_dir_foreach_post(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
+    void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
+{
+	zfs_dsl_dir_t *child;
+
+	/* Post-order: visit every descendant before the directory itself. */
+	STAILQ_FOREACH(child, &dsldir->children, next)
+		dsl_dir_foreach_post(zfs, child, cb, arg);
+	cb(zfs, dsldir, arg);
+}
+
+/*
+ * Visit dsldir and every descendant when the visiting order doesn't matter.
+ */
+void
+dsl_dir_foreach(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
+    void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
+{
+	dsl_dir_foreach_post(zfs, dsldir, cb, arg);
+}
+
+const char *
+dsl_dir_fullname(const zfs_dsl_dir_t *dir)
+{
+	return (dir->fullname);	/* full dataset name, e.g., pool/a/b */
+}
+
+/*
+ * Create a DSL directory, which is effectively an entry in the ZFS namespace.
+ * We always create a root DSL directory, whose name is the pool's name, and
+ * several metadata directories.  A NULL "name" creates the root directory;
+ * otherwise "name" is a full path (pool/a/b) whose parent must already exist.
+ * Each directory has two ZAP objects, one pointing to child directories, and
+ * one for properties (which are inherited by children unless overridden).
+ * Directories typically reference a DSL dataset, the "head dataset", which
+ * points to an object set.
+ */
+static zfs_dsl_dir_t *
+dsl_dir_alloc(zfs_opt_t *zfs, const char *name)
+{
+	zfs_dsl_dir_list_t l, *lp;
+	zfs_dsl_dir_t *dir, *parent;
+	dnode_phys_t *dnode;
+	char *dirname, *nextdir, *origname;
+	uint64_t childid, propsid;
+
+	dir = ecalloc(1, sizeof(*dir));
+
+	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DIR,
+	    DMU_OT_DSL_DIR, sizeof(dsl_dir_phys_t), &dir->dirid);
+	dir->phys = (dsl_dir_phys_t *)DN_BONUS(dnode);
+
+	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_PROPS, &propsid);
+	dir->propszap = zap_alloc(zfs->mos, dnode);
+
+	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DIR_CHILD_MAP,
+	    &childid);
+	dir->childzap = zap_alloc(zfs->mos, dnode);
+
+	dir->propsnv = nvlist_create(NV_UNIQUE_NAME);
+	STAILQ_INIT(&dir->children);
+
+	dir->phys->dd_child_dir_zapobj = childid;
+	dir->phys->dd_props_zapobj = propsid;
+
+	if (name == NULL) {
+		/*
+		 * This is the root DSL directory.
+		 */
+		dir->name = estrdup(zfs->poolname);
+		dir->fullname = estrdup(zfs->poolname);
+		dir->parent = NULL;
+		dir->phys->dd_parent_obj = 0;
+
+		assert(zfs->rootdsldir == NULL);
+		zfs->rootdsldir = dir;
+		return (dir);
+	}
+
+	/*
+	 * Insert the new directory into the hierarchy by walking "name" one
+	 * component at a time, starting from a temporary list that holds only
+	 * the root.  Parents must exist already, e.g., pool/a before pool/a/b.
+	 */
+	STAILQ_INIT(&l);
+	STAILQ_INSERT_HEAD(&l, zfs->rootdsldir, next);
+	origname = dirname = nextdir = estrdup(name);
+	for (lp = &l;; lp = &parent->children) {
+		dirname = strsep(&nextdir, "/");
+		if (nextdir == NULL)
+			break;	/* last component: the new directory's name */
+
+		STAILQ_FOREACH(parent, lp, next) {
+			if (strcmp(parent->name, dirname) == 0)
+				break;
+		}
+		if (parent == NULL) {
+			errx(1, "no parent at `%s' for filesystem `%s'",
+			    dirname, name);
+		}
+	}
+
+	dir->fullname = estrdup(name);
+	dir->name = estrdup(dirname);
+	free(origname);		/* dirname pointed into this copy */
+	STAILQ_INSERT_TAIL(lp, dir, next);
+	zap_add_uint64(parent->childzap, dir->name, dir->dirid);
+
+	dir->parent = parent;
+	dir->phys->dd_parent_obj = parent->dirid;
+	return (dir);
+}
+
+void
+dsl_dir_size_set(zfs_dsl_dir_t *dir, uint64_t bytes)
+{
+	/* The used, compressed and uncompressed counters all get "bytes". */
+	dir->phys->dd_used_bytes = dir->phys->dd_compressed_bytes =
+	    dir->phys->dd_uncompressed_bytes = bytes;
+}
+
+/*
+ * Convert dataset properties into entries in the DSL directory's properties
+ * ZAP.  Only uint64 and string values are handled; others trip an assertion.
+ */
+static void
+dsl_dir_finalize_props(zfs_dsl_dir_t *dir)
+{
+	for (nvp_header_t *nvh = NULL;
+	    (nvh = nvlist_next_nvpair(dir->propsnv, nvh)) != NULL;) {
+		nv_string_t *nvname;
+		nv_pair_data_t *nvdata;
+		const char *name;
+
+		nvname = (nv_string_t *)(nvh + 1);	/* after header */
+		nvdata = (nv_pair_data_t *)(&nvname->nv_data[0] +
+		    NV_ALIGN4(nvname->nv_size));	/* then the value */
+
+		name = nvstring_get(nvname);
+		switch (nvdata->nv_type) {
+		case DATA_TYPE_UINT64: {
+			uint64_t val;
+
+			memcpy(&val, &nvdata->nv_data[0], sizeof(uint64_t));
+			zap_add_uint64(dir->propszap, name, val);
+			break;
+		}
+		case DATA_TYPE_STRING: {
+			nv_string_t *nvstr;
+
+			nvstr = (nv_string_t *)&nvdata->nv_data[0];
+			zap_add_string(dir->propszap, name,
+			    nvstring_get(nvstr));
+			break;
+		}
+		default:
+			assert(0);	/* unsupported property type */
+		}
+	}
+}
+
+static void
+dsl_dir_finalize(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, void *arg __unused)
+{
+	char key[32];	/* hex dataset ID, used as the clones ZAP key */
+	zfs_dsl_dir_t *cdir;
+	dnode_phys_t *snapnames;
+	zfs_dsl_dataset_t *headds;
+	zfs_objset_t *os;
+	uint64_t bytes, snapnamesid;
+
+	dsl_dir_finalize_props(dir);
+	zap_write(zfs, dir->propszap);
+	zap_write(zfs, dir->childzap);
+
+	headds = dir->headds;
+	if (headds == NULL)
+		return;	/* NOTE(review): dir sizes stay unset -- intended? */
+	os = headds->os;
+	if (os == NULL)
+		return;	/* the dataset was never given an object set */
+
+	snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP,
+	    &snapnamesid);
+	zap_write(zfs, zap_alloc(zfs->mos, snapnames));	/* left empty */
+
+	dir->phys->dd_head_dataset_obj = headds->dsid;
+	dir->phys->dd_clone_parent_obj = zfs->snapds->dsid;
+	headds->phys->ds_prev_snap_obj = zfs->snapds->dsid;
+	headds->phys->ds_snapnames_zapobj = snapnamesid;
+	objset_root_blkptr_copy(os, &headds->phys->ds_bp);
+
+	zfs->snapds->phys->ds_num_children++;	/* head is a clone of it */
+	snprintf(key, sizeof(key), "%jx", (uintmax_t)headds->dsid);
+	zap_add_uint64(zfs->cloneszap, key, headds->dsid);
+
+	bytes = objset_space(os);
+	headds->phys->ds_used_bytes = bytes;
+	headds->phys->ds_uncompressed_bytes = bytes;
+	headds->phys->ds_compressed_bytes = bytes;
+
+	STAILQ_FOREACH(cdir, &dir->children, next)
+		bytes += cdir->phys->dd_used_bytes;	/* add subtrees */
+	dsl_dir_size_set(dir, bytes);
+}
+
+void
+dsl_write(zfs_opt_t *zfs)
+{
+	zfs_zap_t *snapnameszap;
+	dnode_phys_t *snapnames;
+	uint64_t snapmapid;
+
+	/*
+	 * Finalize the DSL layer.  First perform accounting, starting from
+	 * the leaves of the DSL directory tree.  Accounting for $MOS is done
+	 * later, once we've finished allocating space.
+	 */
+	dsl_dir_foreach_post(zfs, zfs->rootdsldir, dsl_dir_finalize, NULL);
+
+	snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP,
+	    &snapmapid);
+	snapnameszap = zap_alloc(zfs->mos, snapnames);
+	zap_add_uint64(snapnameszap, "$ORIGIN", zfs->snapds->dsid);
+	zap_write(zfs, snapnameszap);
+
+	zfs->origindsldir->phys->dd_head_dataset_obj = zfs->originds->dsid;
+	zfs->originds->phys->ds_prev_snap_obj = zfs->snapds->dsid;
+	zfs->originds->phys->ds_snapnames_zapobj = snapmapid;
+
+	zfs->snapds->phys->ds_next_snap_obj = zfs->originds->dsid;
+	assert(zfs->snapds->phys->ds_num_children > 0);	/* from finalize */
+	zfs->snapds->phys->ds_num_children++;	/* count originds as well */
+
+	zap_write(zfs, zfs->cloneszap);
+
+	/* XXX-MJ dirs and datasets are leaked */
+}
+
+void
+dsl_dir_dataset_write(zfs_opt_t *zfs, zfs_objset_t *os, zfs_dsl_dir_t *dir)
+{
+	dir->headds->os = os;	/* kept for accounting in dsl_write() */
+	objset_write(zfs, os);
+}
+
+bool
+dsl_dir_has_dataset(zfs_dsl_dir_t *dir)
+{
+	return (dir->headds != NULL);	/* head dataset attached? */
+}
+
+bool
+dsl_dir_dataset_has_objset(zfs_dsl_dir_t *dir)
+{
+	return (dir->headds != NULL && dir->headds->os != NULL);
+}
+
+static zfs_dsl_dataset_t *
+dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir)
+{
+	zfs_dsl_dataset_t *ds;
+	dnode_phys_t *dnode;
+	uint64_t deadlistid;
+
+	ds = ecalloc(1, sizeof(*ds));
+
+	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DATASET,
+	    DMU_OT_DSL_DATASET, sizeof(dsl_dataset_phys_t), &ds->dsid);
+	ds->phys = (dsl_dataset_phys_t *)DN_BONUS(dnode);
+
+	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DEADLIST,
+	    DMU_OT_DEADLIST_HDR, sizeof(dsl_deadlist_phys_t), &deadlistid);
+	zap_write(zfs, zap_alloc(zfs->mos, dnode));	/* empty deadlist */
+
+	ds->phys->ds_dir_obj = dir->dirid;
+	ds->phys->ds_deadlist_obj = deadlistid;
+	ds->phys->ds_creation_txg = TXG - 1;
+	if (ds != zfs->snapds)	/* NOTE(review): ds is new; always true? */
+		ds->phys->ds_prev_snap_txg = TXG - 1;
+	ds->phys->ds_guid = ((uint64_t)random() << 32) | random();
+	ds->dir = dir;
+
+	return (ds);	/* caller stores it, e.g., in dir->headds */
+}
diff --git a/usr.sbin/makefs/zfs/fs.c b/usr.sbin/makefs/zfs/fs.c
new file mode 100644
index 000000000000..15025ec5447d
--- /dev/null
+++ b/usr.sbin/makefs/zfs/fs.c
@@ -0,0 +1,981 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/dirent.h>
+#include <sys/stat.h>
+
+#include <assert.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <util.h>
+
+#include "makefs.h"
+#include "zfs.h"
+
+typedef struct {
+	const char	*name;	/* key in the SA registry ZAP */
+	unsigned int	id;	/* zpl_attr_t value; equals the table index */
+	uint16_t	size;	/* size in bytes; 0 means variable-length */
+	sa_bswap_type_t	bs;	/* byteswap type passed to SA_ATTR_ENCODE */
+} zfs_sattr_t;
+
+typedef struct zfs_fs {
+	zfs_objset_t	*os;	/* object set holding this filesystem */
+
+	/* Offset table for system attributes, indexed by a zpl_attr_t. */
+	uint16_t	*saoffs;	/* 0xffff: attribute not in the layout */
+	size_t		sacnt;		/* entries in saoffs and satab */
+	const zfs_sattr_t *satab;	/* attribute table (zpl_attrs) */
+} zfs_fs_t;
+
+/*
+ * The order of the attributes doesn't matter; this is simply the one hard-coded
+ * by OpenZFS, based on a zdb dump of the SA_REGISTRY table.
+ */
+typedef enum zpl_attr {
+	ZPL_ATIME,	/* values double as indices into zpl_attrs[] */
+	ZPL_MTIME,
+	ZPL_CTIME,
+	ZPL_CRTIME,
+	ZPL_GEN,
+	ZPL_MODE,
+	ZPL_SIZE,
+	ZPL_PARENT,
+	ZPL_LINKS,
+	ZPL_XATTR,
+	ZPL_RDEV,
+	ZPL_FLAGS,
+	ZPL_UID,
+	ZPL_GID,
+	ZPL_PAD,
+	ZPL_ZNODE_ACL,
+	ZPL_DACL_COUNT,
+	ZPL_SYMLINK,
+	ZPL_SCANSTAMP,
+	ZPL_DACL_ACES,
+	ZPL_DXATTR,
+	ZPL_PROJID,
+} zpl_attr_t;
+
+/*
+ * Must be kept in sync with zpl_attr_layout[] and zpl_attr_t; indexed by ID.
+ */
+static const zfs_sattr_t zpl_attrs[] = {
+#define	_ZPL_ATTR(n, s, b)	{ .name = #n, .id = n, .size = s, .bs = b }
+	_ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
+	_ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
+	_ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
+	_ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
+	_ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
+	_ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
+#undef _ZPL_ATTR
+};
+
+/*
+ * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
+ * It need not match in general, but FreeBSD's loader doesn't bother parsing the
+ * layout and just hard-codes attribute offsets.
+ */
+static const sa_attr_type_t zpl_attr_layout[] = {
+	ZPL_MODE,
+	ZPL_SIZE,
+	ZPL_GEN,
+	ZPL_UID,
+	ZPL_GID,
+	ZPL_PARENT,
+	ZPL_FLAGS,
+	ZPL_ATIME,
+	ZPL_MTIME,
+	ZPL_CTIME,
+	ZPL_CRTIME,
+	ZPL_LINKS,
+	ZPL_DACL_COUNT,
+	ZPL_DACL_ACES,	/* variable-length (size 0 in zpl_attrs[]) */
+	ZPL_SYMLINK,	/* must stay last: omitted from the default layout */
+};
+
+/*
+ * Keys for the ZPL attribute tables in the SA layout ZAP.  The first two
+ * indices are reserved for legacy attribute encoding.
+ */
+#define	SA_LAYOUT_INDEX_DEFAULT	2	/* regular files and directories */
+#define	SA_LAYOUT_INDEX_SYMLINK	3	/* symbolic links */
+
+struct fs_populate_dir {
+	SLIST_ENTRY(fs_populate_dir) next;	/* stack linkage */
+	int			dirfd;	/* open fd, or -1 if none */
+	uint64_t		objid;	/* dnode ID of the directory */
+	zfs_zap_t		*zap;	/* entries; written when popped */
+};
+
+struct fs_populate_arg {
+	zfs_opt_t	*zfs;			/* pool-wide makefs state */
+	zfs_fs_t	*fs;			/* owning filesystem */
+	int		dirfd;			/* current directory fd */
+	uint64_t	rootdirid;		/* root directory dnode ID */
+	SLIST_HEAD(, fs_populate_dir) dirs;	/* stack of directories */
+};
+
+static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
+
+static bool
+fsnode_isroot(const fsnode *cur)
+{
+	return (cur->name[0] == '.' && cur->name[1] == '\0');
+}
+
+/*
+ * Walk a directory hierarchy depth-first, calling "cb" on each node before its
+ * children.  A zero return from "cb" prunes the subtree below that node.
+ */
+static void
+fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
+{
+	fsnode *node;
+
+	assert(root->type == S_IFDIR);
+	for (node = root; node != NULL; node = node->next) {
+		assert(node->type == S_IFREG || node->type == S_IFDIR ||
+		    node->type == S_IFLNK);
+		if (cb(node, arg) != 0 && node->type == S_IFDIR &&
+		    node->child != NULL)
+			fsnode_foreach(node->child, cb, arg);
+	}
+}
+
+/*
+ * Add an entry for "cur", whose dnode ID is "dnid", to the ZAP of the
+ * directory at the top of the populate stack.
+ */
+static void
+fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
+{
+	struct fs_populate_dir *parent;
+	uint64_t dtype;
+
+	/* Map the staged file type to a dirent type. */
+	if (cur->type == S_IFREG)
+		dtype = DT_REG;
+	else if (cur->type == S_IFDIR)
+		dtype = DT_DIR;
+	else if (cur->type == S_IFLNK)
+		dtype = DT_LNK;
+	else
+		assert(0);
+
+	parent = SLIST_FIRST(&arg->dirs);
+	zap_add_uint64(parent->zap, cur->name, ZFS_DIRENT_MAKE(dtype, dnid));
+}
+
+static void
+fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
+    size_t *szp)
+{
+	assert(ind < fs->sacnt);
+	assert(fs->saoffs[ind] != 0xffff);	/* must be in the layout */
+	const uint16_t len = fs->satab[ind].size;
+	memcpy(&attrbuf[fs->saoffs[ind]], val, len);
+	*szp += len;
+}
+
+static void
+fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
+    size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
+{
+	assert(ind < fs->sacnt);
+	assert(fs->saoffs[ind] != 0xffff);
+	assert(fs->satab[ind].size == 0);	/* variable-size only */
+	char *const dst = &attrbuf[fs->saoffs[ind] + varoff];
+	memcpy(dst, val, valsz);
+	*szp += valsz;
+}
+
+/* Fill in the system attributes stored in the bonus buffer of "dnode". */
+static void
+fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
+    dnode_phys_t *dnode)
+{
+	char target[PATH_MAX];
+	zfs_fs_t *fs;
+	zfs_ace_hdr_t aces[3];
+	struct stat *sb;
+	sa_hdr_phys_t *sahdr;
+	uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
+	char *attrbuf;
+	size_t bonussz, hdrsz;
+	int layout;
+
+	assert(dnode->dn_bonustype == DMU_OT_SA);
+	assert(dnode->dn_nblkptr == 1);
+
+	fs = arg->fs;
+	sb = &cur->inode->st;
+
+	switch (cur->type) {
+	case S_IFREG:
+		layout = SA_LAYOUT_INDEX_DEFAULT;
+		links = cur->inode->nlink;
+		objsize = sb->st_size;
+		parent = SLIST_FIRST(&arg->dirs)->objid;
+		break;
+	case S_IFDIR:
+		layout = SA_LAYOUT_INDEX_DEFAULT;
+		links = objsize = 1; /* .. */
+
+		/*
+		 * The size of a ZPL directory is the number of entries
+		 * (including "." and ".."), and the link count is the number of
+		 * entries which are directories (including "." and "..").
+		 */
+		for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
+		    c != NULL; c = c->next) {
+			if (c->type == S_IFDIR)
+				links++;
+			objsize++;
+		}
+
+		/* The root directory is its own parent. */
+		parent = SLIST_EMPTY(&arg->dirs) ?
+		    arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
+		break;
+	case S_IFLNK: {
+		ssize_t n;
+
+		if ((n = readlinkat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
+		    target, sizeof(target) - 1)) == -1)
+			err(1, "readlinkat(%s)", cur->name);
+		target[n] = '\0';
+		layout = SA_LAYOUT_INDEX_SYMLINK;
+		links = 1;
+		objsize = strlen(target);
+		parent = SLIST_FIRST(&arg->dirs)->objid;
+		break;
+		}
+	default:
+		assert(0);
+	}
+
+	daclcount = nitems(aces);
+	flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED |
+	    ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */
+	gen = 1;
+	gid = sb->st_gid;
+	mode = sb->st_mode;
+	uid = sb->st_uid;
+
+	memset(aces, 0, sizeof(aces));
+	aces[0].z_flags = ACE_OWNER;
+	aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+	aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER |
+	    ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL |
+	    ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+	if ((mode & S_IRUSR) != 0)
+		aces[0].z_access_mask |= ACE_READ_DATA;
+	if ((mode & S_IWUSR) != 0)
+		aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
+	if ((mode & S_IXUSR) != 0)
+		aces[0].z_access_mask |= ACE_EXECUTE;
+
+	aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
+	aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+	aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
+	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+	if ((mode & S_IRGRP) != 0)
+		aces[1].z_access_mask |= ACE_READ_DATA;
+	if ((mode & S_IWGRP) != 0)
+		aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
+	if ((mode & S_IXGRP) != 0)
+		aces[1].z_access_mask |= ACE_EXECUTE;
+
+	aces[2].z_flags = ACE_EVERYONE;
+	aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+	aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
+	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
+	if ((mode & S_IROTH) != 0)
+		aces[2].z_access_mask |= ACE_READ_DATA;
+	if ((mode & S_IWOTH) != 0)
+		aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
+	if ((mode & S_IXOTH) != 0)
+		aces[2].z_access_mask |= ACE_EXECUTE;
+
+	switch (layout) {
+	case SA_LAYOUT_INDEX_DEFAULT:
+		/* At most one variable-length attribute. */
+		hdrsz = sizeof(uint64_t);
+		break;
+	case SA_LAYOUT_INDEX_SYMLINK:
+		/* At most five variable-length attributes. */
+		hdrsz = sizeof(uint64_t) * 2;
+		break;
+	default:
+		assert(0);
+	}
+
+	sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
+	sahdr->sa_magic = SA_MAGIC;
+	SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);
+
+	bonussz = SA_HDR_SIZE(sahdr);
+	attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);
+
+	fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
+	fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
+	fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
+	fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
+	fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
+	fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
+	fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
+	fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
+	fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);
+
+	/*
+	 * We deliberately set atime = mtime here to ensure that images are
+	 * reproducible.
+	 */
+	assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size);
+	fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
+	assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size);
+	fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
+	assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size);
+	fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
+	assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size);
+	fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);
+
+	fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
+	    ZPL_DACL_ACES, &bonussz);
+	sahdr->sa_lengths[0] = sizeof(aces);
+
+	if (cur->type == S_IFLNK) {
+		assert(layout == SA_LAYOUT_INDEX_SYMLINK);
+		/* Long targets need a spill block, which is not implemented. */
+		if (bonussz + objsize > DN_OLD_MAX_BONUSLEN)
+			errx(1, "symlink `%s': target too long", cur->name);
+		fs_populate_varszattr(fs, attrbuf, target, objsize,
+		    sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
+		sahdr->sa_lengths[1] = (uint16_t)objsize;
+	}
+
+	dnode->dn_bonuslen = bonussz;
+}
+
+static void
+fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
+{
+	struct dnode_cursor *c;
+	dnode_phys_t *dnode;
+	zfs_opt_t *zfs;
+	char *buf;
+	uint64_t dnid;
+	ssize_t n;
+	size_t bufsz;
+	off_t size, target;
+	int fd;
+
+	assert(cur->type == S_IFREG);
+	assert((cur->inode->flags & FI_ROOT) == 0);
+
+	zfs = arg->zfs;
+
+	assert(cur->inode->ino != 0);
+	if ((cur->inode->flags & FI_ALLOCATED) != 0) {
+		/*
+		 * This is a hard link of an existing file.
+		 *
+		 * XXX-MJ need to check whether it crosses datasets, add a test
+		 * case for that
+		 */
+		fs_populate_dirent(arg, cur, cur->inode->ino);
+		return;
+	}
+
+	dnode = objset_dnode_bonus_alloc(arg->fs->os,
+	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
+	cur->inode->ino = dnid;
+	cur->inode->flags |= FI_ALLOCATED;
+
+	fd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, O_RDONLY);
+	if (fd == -1)
+		err(1, "openat(%s)", cur->name);
+
+	buf = zfs->filebuf;
+	bufsz = sizeof(zfs->filebuf);
+	size = cur->inode->st.st_size;
+	c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
+	for (off_t foff = 0; foff < size; foff += target) {
+		off_t loc, sofar;
+
+		/*
+		 * Fill up our buffer, handling partial reads: each read(2) only
+		 * asks for the bytes still missing from this chunk.  It might
+		 * be profitable to use copy_file_range(2) here.
+		 */
+		sofar = 0;
+		target = MIN(size - foff, (off_t)bufsz);
+		do {
+			n = read(fd, buf + sofar, target - sofar);
+			if (n < 0)
+				err(1, "reading from '%s'", cur->name);
+			if (n == 0)
+				errx(1, "unexpected EOF reading '%s'",
+				    cur->name);
+			sofar += n;
+		} while (sofar < target);
+
+		if (target < (off_t)bufsz)
+			memset(buf + target, 0, bufsz - target);
+
+		loc = objset_space_alloc(zfs, arg->fs->os, &target);
+		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, target, loc,
+		    dnode_cursor_next(zfs, c, foff));
+	}
+	if (close(fd) != 0)
+		err(1, "close");
+	dnode_cursor_finish(zfs, c);
+
+	fs_populate_sattrs(arg, cur, dnode);
+	fs_populate_dirent(arg, cur, dnid);
+}
+
+/* Allocate and fill in the dnode for directory "cur" in the current objset. */
+static void
+fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
+{
+	dnode_phys_t *dnode;
+	zfs_objset_t *os;
+	uint64_t dnid;
+	int dirfd;
+
+	assert(cur->type == S_IFDIR);
+	assert((cur->inode->flags & FI_ALLOCATED) == 0);
+
+	os = arg->fs->os;
+
+	dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
+	    DMU_OT_SA, 0, &dnid);
+
+	/*
+	 * Add an entry to the parent directory and open this directory.  With
+	 * an empty stack this is the dataset's root, which reuses arg->dirfd.
+	 */
+	if (!SLIST_EMPTY(&arg->dirs)) {
+		fs_populate_dirent(arg, cur, dnid);
+		dirfd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
+		    O_DIRECTORY);
+		if (dirfd < 0)
+			err(1, "open(%s)", cur->name);
+	} else {
+		arg->rootdirid = dnid;
+		dirfd = arg->dirfd;
+	}
+
+	/* Set ZPL attributes. */
+	fs_populate_sattrs(arg, cur, dnode);
+
+	/*
+	 * If this is a root directory, then its children belong to a different
+	 * dataset and this directory remains empty in the current objset.
+	 */
+	if ((cur->inode->flags & FI_ROOT) == 0) {
+		struct fs_populate_dir *dir;
+
+		dir = ecalloc(1, sizeof(*dir));
+		dir->dirfd = dirfd;
+		dir->objid = dnid;
+		dir->zap = zap_alloc(os, dnode);
+		SLIST_INSERT_HEAD(&arg->dirs, dir, next);
+	} else {
+		zap_write(arg->zfs, zap_alloc(os, dnode));
+		fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
+	}
+}
+
+static void
+fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
+{
+	dnode_phys_t *dnode;
+	uint64_t dnid;
+
+	assert(cur->type == S_IFLNK);
+	assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);
+
+	dnode = objset_dnode_bonus_alloc(arg->fs->os,
+	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
+
+	fs_populate_dirent(arg, cur, dnid);
+	/* The target is stored as a system attribute, not as file data. */
+	fs_populate_sattrs(arg, cur, dnode);
+}
+
+static int
+fs_foreach_populate(fsnode *cur, void *_arg)
+{
+	struct fs_populate_arg *arg;
+	struct fs_populate_dir *dir;
+	int ret;
+
+	arg = _arg;
+	switch (cur->type) {
+	case S_IFREG:
+		fs_populate_file(cur, arg);
+		break;
+	case S_IFDIR:
+		if (fsnode_isroot(cur))
+			break;	/* "." nodes get no dnode of their own here */
+		fs_populate_dir(cur, arg);
+		break;
+	case S_IFLNK:
+		fs_populate_symlink(cur, arg);
+		break;
+	default:
+		assert(0);
+	}
+	/* A mount point's children belong to another dataset: don't descend. */
+	ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;
+
+	if (cur->next == NULL &&
+	    (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
+		/*
+		 * We reached a terminal node in a subtree.  Walk back up and
+		 * write out directories.  We're done once we hit the root of a
+		 * dataset or find a level where we're not on the edge of the
+		 * tree.
+		 */
+		do {
+			dir = SLIST_FIRST(&arg->dirs);
+			SLIST_REMOVE_HEAD(&arg->dirs, next);
+			zap_write(arg->zfs, dir->zap);
+			if (dir->dirfd != -1 && close(dir->dirfd) != 0)
+				err(1, "close");
+			free(dir);
+			cur = cur->parent;
+		} while (cur != NULL && cur->next == NULL &&
+		    (cur->inode->flags & FI_ROOT) == 0);
+	}
+
+	return (ret);
+}
+
+static void
+fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
+    const sa_attr_type_t layout[], size_t sacnt)
+{
+	char key[16];	/* the layout index in decimal is the ZAP key */
+
+	assert(sizeof(layout[0]) == 2);
+
+	snprintf(key, sizeof(key), "%u", index);
+	zap_add(zap, key, sizeof(*layout), sacnt,
+	    (const uint8_t *)&layout[0]);
+}
+
+/*
+ * Initialize system attribute tables and return the ID of the SA master node.
+ *
+ * There are two elements to this.  First, we write the zpl_attrs[] and
+ * zpl_attr_layout[] tables to disk.  Then we create a lookup table which
+ * allows us to set file attributes quickly.
+ */
+static uint64_t
+fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
+{
+	zfs_zap_t *sazap, *salzap, *sarzap;
+	zfs_objset_t *os;
+	dnode_phys_t *saobj, *salobj, *sarobj;
+	uint64_t saobjid, salobjid, sarobjid;
+	uint16_t offset;
+
+	os = fs->os;
+
+	/*
+	 * The on-disk tables are stored in two ZAP objects, the registry object
+	 * and the layout object.  Individual attributes are described by
+	 * entries in the registry object; for example, the value for the
+	 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
+	 * The attributes of a file are ordered according to one of the layouts
+	 * defined in the layout object.  The master node object is simply used
+	 * to locate the registry and layout objects.
+	 */
+	saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
+	salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
+	sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);
+
+	sarzap = zap_alloc(os, sarobj);
+	for (size_t i = 0; i < nitems(zpl_attrs); i++) {
+		const zfs_sattr_t *sa;
+		uint64_t attr;
+
+		attr = 0;
+		sa = &zpl_attrs[i];
+		SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
+		zap_add_uint64(sarzap, sa->name, attr);
+	}
+	zap_write(zfs, sarzap);
+
+	/*
+	 * Layouts are arrays of indices into the registry.  We define two
+	 * layouts for use by the ZPL, one for non-symlinks and one for
+	 * symlinks.  They are identical except that the symlink layout includes
+	 * ZPL_SYMLINK as its final attribute.
+	 */
+	salzap = zap_alloc(os, salobj);
+	assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
+	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT,
+	    zpl_attr_layout, nitems(zpl_attr_layout) - 1);
+	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK,
+	    zpl_attr_layout, nitems(zpl_attr_layout));
+	zap_write(zfs, salzap);
+
+	sazap = zap_alloc(os, saobj);
+	zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
+	zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
+	zap_write(zfs, sazap);
+
+	/* Sanity check: entry i of zpl_attrs[] must describe attribute i. */
+	for (size_t i = 0; i < nitems(zpl_attrs); i++)
+		assert(i == zpl_attrs[i].id);
+
+	/*
+	 * Build the offset table used when setting file attributes.  File
+	 * attributes are stored in the object's bonus buffer; this table
+	 * provides the buffer offset of attributes referenced by the layout
+	 * table.
+	 */
+	fs->sacnt = nitems(zpl_attrs);
+	fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
+	for (size_t i = 0; i < fs->sacnt; i++)
+		fs->saoffs[i] = 0xffff;	/* "not part of the layout" */
+	offset = 0;
+	for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
+		uint16_t size;
+
+		assert(zpl_attr_layout[i] < fs->sacnt);
+
+		fs->saoffs[zpl_attr_layout[i]] = offset;
+		size = zpl_attrs[zpl_attr_layout[i]].size;
+		offset += size;	/* 0 for variable-size attributes */
+	}
+	fs->satab = zpl_attrs;
+
+	return (saobjid);
+}
+
+/* Tag the staged directory at which "dsldir" is mounted with that dataset. */
+static void
+fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
+{
+	char *mountpoint, *origmountpoint, *name, *next;
+	fsnode *cur, *root;
+	uint64_t canmount;
+
+	if (!dsl_dir_has_dataset(dsldir))
+		return;
+
+	mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
+	if (mountpoint == NULL)
+		return;
+	if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0) {
+		free(mountpoint);
+		return;
+	}
+
+	/* If we were asked to specify a bootfs, set it here. */
+	if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
+	    dsl_dir_fullname(dsldir)) == 0) {
+		zap_add_uint64(zfs->poolprops, "bootfs",
+		    dsl_dir_dataset_id(dsldir));
+	}
+
+	origmountpoint = mountpoint;
+
+	/*
+	 * Figure out which fsnode corresponds to our mountpoint.
+	 */
+	root = arg;
+	cur = root;
+	if (strcmp(mountpoint, zfs->rootpath) != 0) {
+		mountpoint += strlen(zfs->rootpath);
+
+		/*
+		 * Look up the directory in the staged tree.  For example, if
+		 * the dataset's mount point is /foo/bar/baz, we'll search the
+		 * root directory for "foo", search "foo" for "bar", and so on.
+		 * Each intermediate name must refer to a directory; the final
+		 * component need not exist.
+		 */
+		for (next = name = mountpoint; next != NULL;) {
+			for (; *next == '/'; next++)
+				;
+			name = strsep(&next, "/");
+
+			for (; cur != NULL && strcmp(cur->name, name) != 0;
+			    cur = cur->next)
+				;
+			if (cur == NULL) {
+				if (next == NULL)
+					break;
+				errx(1, "missing mountpoint directory for `%s'",
+				    dsl_dir_fullname(dsldir));
+			}
+			if (cur->type != S_IFDIR) {
+				errx(1,
+				    "mountpoint for `%s' is not a directory",
+				    dsl_dir_fullname(dsldir));
+			}
+			if (next != NULL)
+				cur = cur->child;
+		}
+	}
+
+	if (cur != NULL) {
+		assert(cur->type == S_IFDIR);
+
+		/*
+		 * Multiple datasets shouldn't share a mountpoint.  It's
+		 * technically allowed, but it's not clear what makefs should do
+		 * in that case.
+		 */
+		assert((cur->inode->flags & FI_ROOT) == 0);
+		if (cur != root)
+			cur->inode->flags |= FI_ROOT;
+		assert(cur->inode->param == NULL);
+		cur->inode->param = dsldir;
+	}
+
+	free(origmountpoint);
+}
+
+static int
+fs_foreach_mark(fsnode *cur, void *arg)
+{
+	uint64_t *nextino = arg;
+
+	/* Skip "." nodes, but keep walking. */
+	if (cur->type == S_IFDIR && fsnode_isroot(cur))
+		return (1);
+
+	/* First sighting gets a fresh number; repeats are hard links. */
+	if (cur->inode->ino != 0) {
+		cur->inode->nlink++;
+	} else {
+		cur->inode->ino = ++(*nextino);
+		cur->inode->nlink = 1;
+	}
+	return ((cur->inode->flags & FI_ROOT) == 0);
+}
+
+/*
+ * Create a filesystem dataset.  More specifically:
+ * - create an object set for the dataset,
+ * - add required metadata (SA tables, property definitions, etc.) to that
+ *   object set,
+ * - optionally populate the object set with file objects, using "root" as the
+ *   root directory.
+ *
+ * "zfs" is the pool-wide state, and "dsldir" is the DSL directory that the
+ * finished object set is attached to via dsl_dir_dataset_write().
+ *
+ * "root" may be NULL, in which case an empty root directory is synthesized and
+ * "dirfd" must be -1.
+ *
+ * "dirfd" is a directory descriptor for the directory referenced by "root".  It
+ * is closed before returning.
+ */
+static void
+fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
+{
+	struct fs_populate_arg arg;
+	zfs_fs_t fs;
+	zfs_zap_t *masterzap;
+	zfs_objset_t *os;
+	dnode_phys_t *deleteq, *masterobj;
+	uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
+	bool fakedroot;
+
+	/*
+	 * This dataset's mountpoint doesn't exist in the staging tree, or the
+	 * dataset doesn't have a mountpoint at all.  In either case we still
+	 * need a root directory.  Fake up a root fsnode to handle this case.
+	 */
+	fakedroot = root == NULL;
+	if (fakedroot) {
+		struct stat *stp;
+
+		assert(dirfd == -1);
+
+		root = ecalloc(1, sizeof(*root));
+		root->inode = ecalloc(1, sizeof(*root->inode));
+		root->name = estrdup(".");
+		root->type = S_IFDIR;
+
+		/* The fake root is owned by uid/gid 0 with mode 0755. */
+		stp = &root->inode->st;
+		stp->st_uid = 0;
+		stp->st_gid = 0;
+		stp->st_mode = S_IFDIR | 0755;
+	}
+	assert(root->type == S_IFDIR);
+	assert(fsnode_isroot(root));
+
+	/*
+	 * Initialize the object set for this dataset.  The master node must be
+	 * the first object allocated so that it receives ID MASTER_NODE_OBJ.
+	 */
+	os = objset_alloc(zfs, DMU_OST_ZFS);
+	masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
+	assert(moid == MASTER_NODE_OBJ);
+
+	memset(&fs, 0, sizeof(fs));
+	fs.os = os;
+
+	/*
+	 * Create the ZAP SA layout now since filesystem object dnodes will
+	 * refer to those attributes.
+	 */
+	saobjid = fs_set_zpl_attrs(zfs, &fs);
+
+	/*
+	 * Make a pass over the staged directory to detect hard links and assign
+	 * virtual dnode numbers.
+	 */
+	dnodecount = 1; /* root directory */
+	fsnode_foreach(root, fs_foreach_mark, &dnodecount);
+
+	/*
+	 * Make a second pass to populate the dataset with files from the
+	 * staged directory.  Most of our runtime is spent here.
+	 *
+	 * The root directory is handled explicitly first; the assertions check
+	 * that doing so leaves an entry on the directory list and that the
+	 * traversal drains the list again.
+	 */
+	arg.dirfd = dirfd;
+	arg.zfs = zfs;
+	arg.fs = &fs;
+	SLIST_INIT(&arg.dirs);
+	fs_populate_dir(root, &arg);
+	assert(!SLIST_EMPTY(&arg.dirs));
+	fsnode_foreach(root, fs_foreach_populate, &arg);
+	assert(SLIST_EMPTY(&arg.dirs));
+	rootdirid = arg.rootdirid;
+
+	/*
+	 * Create an empty delete queue.  We don't do anything with it, but
+	 * OpenZFS will refuse to mount filesystems that don't have one.
+	 */
+	deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
+	zap_write(zfs, zap_alloc(os, deleteq));
+
+	/*
+	 * Populate and write the master node object.  This is a ZAP object
+	 * containing various dataset properties and the object IDs of the root
+	 * directory and delete queue.
+	 */
+	masterzap = zap_alloc(os, masterobj);
+	zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
+	zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
+	zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
+	zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
+	zap_add_uint64(masterzap, "normalization", 0 /* off */);
+	zap_add_uint64(masterzap, "utf8only", 0 /* off */);
+	zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
+	zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
+	zap_write(zfs, masterzap);
+
+	/*
+	 * All finished with this object set, we may as well write it now.
+	 * The DSL layer will sum up the bytes consumed by each dataset using
+	 * information stored in the object set, so it can't be freed just yet.
+	 */
+	dsl_dir_dataset_write(zfs, os, dsldir);
+
+	/* Only a synthesized root is owned (and therefore freed) here. */
+	if (fakedroot) {
+		free(root->inode);
+		free(root->name);
+		free(root);
+	}
+	free(fs.saoffs);
+}
+
+/*
+ * dsl_dir_foreach() callback: give an empty filesystem to every DSL directory
+ * that has a dataset but whose dataset has not been assigned an object set
+ * yet.  The dataset is built with a synthesized root directory and no staging
+ * directory descriptor.
+ */
+static void
+fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
+{
+	if (!dsl_dir_has_dataset(dsldir))
+		return;
+	if (dsl_dir_dataset_has_objset(dsldir))
+		return;
+
+	fs_build_one(zfs, dsldir, NULL, -1);
+}
+
+/*
+ * Create our datasets and populate them with files.
+ *
+ * "root" is the root fsnode of the staged file hierarchy and "dirfd" is the
+ * directory descriptor that is handed to fs_build_one() together with it.
+ */
+void
+fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
+{
+	/*
+	 * Step 1: walk the DSL directories and locate the root fsnode of each
+	 * dataset.  Root fsnodes are flagged so that the owning dataset can be
+	 * identified later.
+	 */
+	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);
+
+	/*
+	 * If a boot filesystem was requested, the layout pass must have
+	 * resulted in a "bootfs" pool property; otherwise no mounted dataset
+	 * matched.
+	 */
+	if (zfs->bootfs != NULL) {
+		if (!zap_entry_exists(zfs->poolprops, "bootfs")) {
+			errx(1,
+			    "no mounted dataset matches bootfs property `%s'",
+			    zfs->bootfs);
+		}
+	}
+
+	/*
+	 * Step 2: build the dataset that owns the root of the staged tree.
+	 * Exactly one dataset, which need not be the root dataset, must have
+	 * its mountpoint equal to the root path.
+	 *
+	 * Roots of other datasets encountered along the way are built
+	 * recursively by fs_build_one(), so when this call returns every
+	 * dataset found in the staged tree has been fully populated.
+	 */
+	fs_build_one(zfs, root->inode->param, root, dirfd);
+
+	/*
+	 * Step 3: datasets without a mountpoint, or whose mountpoint is not a
+	 * directory in the staging tree, still need an object set.
+	 */
+	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
+}
diff --git a/usr.sbin/makefs/zfs/objset.c b/usr.sbin/makefs/zfs/objset.c
new file mode 100644
index 000000000000..fdb17167a607
--- /dev/null
+++ b/usr.sbin/makefs/zfs/objset.c
@@ -0,0 +1,259 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include <util.h>
+
+#include "zfs.h"
+
+/* Number of dnodes that fit in one chunk of MAXBLOCKSIZE bytes. */
+#define	DNODES_PER_CHUNK	(MAXBLOCKSIZE / sizeof(dnode_phys_t))
+
+/*
+ * A fixed-size batch of in-memory dnodes.  An object set keeps a tail queue of
+ * these chunks; dnodes are handed out sequentially from the last chunk and a
+ * new chunk is appended once it is full.
+ */
+struct objset_dnode_chunk {
+	dnode_phys_t	buf[DNODES_PER_CHUNK];	/* dnode storage */
+	unsigned int	nextfree;	/* index of the next unused slot */
+	STAILQ_ENTRY(objset_dnode_chunk) next;	/* linkage in dnodechunks */
+};
+
+/*
+ * In-memory state for an object set that is being constructed.  The physical
+ * object set and its dnode array are accumulated here and written to the vdev
+ * by objset_write().
+ */
+typedef struct zfs_objset {
+	/* Physical object set. */
+	objset_phys_t	*phys;		/* buffer of osblksz bytes */
+	off_t		osloc;		/* vdev location reserved for phys */
+	off_t		osblksz;	/* allocated size of phys */
+	blkptr_t	osbp;		/* set in objset_write() */
+
+	/* Accounting. */
+	off_t		space;		/* bytes allocated to this objset */
+
+	/* dnode allocator. */
+	uint64_t	dnodecount;	/* dnodes allocated, incl. ID 0 */
+	STAILQ_HEAD(, objset_dnode_chunk) dnodechunks;
+} zfs_objset_t;
+
+/*
+ * Fill in the fields common to every dnode created by this tool: the object
+ * and bonus buffer types, a single-level block tree with one block pointer,
+ * Fletcher-4 checksums, and space accounting in bytes.  The caller is expected
+ * to set the data block size afterwards.
+ */
+static void
+dnode_init(dnode_phys_t *dnode, uint8_t type, uint8_t bonustype,
+    uint16_t bonuslen)
+{
+	/* Object identity. */
+	dnode->dn_type = type;
+	dnode->dn_bonustype = bonustype;
+	dnode->dn_bonuslen = bonuslen;
+
+	/* Block tree shape. */
+	dnode->dn_nlevels = 1;
+	dnode->dn_nblkptr = 1;
+	dnode->dn_indblkshift = MAXBLOCKSHIFT;
+
+	/* Checksumming and accounting. */
+	dnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4;
+	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+}
+
+/*
+ * Allocate and initialize an in-memory object set of the given type.
+ *
+ * Space for the physical object set is reserved on the vdev right away; the
+ * size actually reserved is recorded in osblksz and used for the in-memory
+ * buffer as well.  The returned object set already contains the meta dnode,
+ * so the first dnode handed out to a caller has object ID 1.
+ */
+zfs_objset_t *
+objset_alloc(zfs_opt_t *zfs, uint64_t type)
+{
+	struct objset_dnode_chunk *first;
+	dnode_phys_t *metadn;
+	zfs_objset_t *os;
+
+	os = ecalloc(1, sizeof(*os));
+	os->osblksz = sizeof(objset_phys_t);
+	os->osloc = objset_space_alloc(zfs, os, &os->osblksz);
+
+	/* The physical object set, with its embedded meta dnode. */
+	os->phys = ecalloc(1, os->osblksz);
+	os->phys->os_type = type;
+	metadn = &os->phys->os_meta_dnode;
+	dnode_init(metadn, DMU_OT_DNODE, DMU_OT_NONE, 0);
+	metadn->dn_datablkszsec = DNODE_BLOCK_SIZE >> MINBLOCKSHIFT;
+
+	/*
+	 * Object ID zero is always reserved for the meta dnode, which is
+	 * embedded in the objset itself, so slot zero of the first chunk is
+	 * skipped.
+	 */
+	first = ecalloc(1, sizeof(*first));
+	first->nextfree = 1;
+	STAILQ_INIT(&os->dnodechunks);
+	STAILQ_INSERT_HEAD(&os->dnodechunks, first, next);
+	os->dnodecount = 1;
+
+	return (os);
+}
+
+/*
+ * Write the dnode array and physical object set to disk.
+ *
+ * "c" is a dnode cursor for the meta dnode and "loc" is the vdev location of
+ * the space reserved for the dnode array; both are prepared by objset_write().
+ * All dnode chunks are freed, so no dnodes can be allocated from or looked up
+ * in "os" afterwards.
+ */
+static void
+_objset_write(zfs_opt_t *zfs, zfs_objset_t *os, struct dnode_cursor *c,
+    off_t loc)
+{
+	struct objset_dnode_chunk *chunk, *tmp;
+	unsigned int total;
+
+	/*
+	 * Write out the dnode array, i.e., the meta-dnode.  For some reason its
+	 * data blocks must be 16KB in size no matter how large the array is.
+	 */
+	total = 0;
+	/* The _SAFE variant is needed because each chunk is freed in the loop. */
+	STAILQ_FOREACH_SAFE(chunk, &os->dnodechunks, next, tmp) {
+		unsigned int i;
+
+		assert(chunk->nextfree <= os->dnodecount);
+		assert(chunk->nextfree <= DNODES_PER_CHUNK);
+
+		/* Emit the used part of the chunk one dnode block at a time. */
+		for (i = 0; i < chunk->nextfree; i += DNODES_PER_BLOCK) {
+			blkptr_t *bp;
+			uint64_t fill;
+
+			/*
+			 * NOTE(review): for a partially used block this yields
+			 * the number of unused dnode slots, and zero for a
+			 * completely used block.  Confirm that this is the
+			 * fill count consumers of the block pointer expect.
+			 */
+			if (chunk->nextfree - i < DNODES_PER_BLOCK)
+				fill = DNODES_PER_BLOCK - (chunk->nextfree - i);
+			else
+				fill = 0;
+			bp = dnode_cursor_next(zfs, c,
+			    (total + i) * sizeof(dnode_phys_t));
+			vdev_pwrite_dnode_indir(zfs, &os->phys->os_meta_dnode,
+			    0, fill, chunk->buf + i, DNODE_BLOCK_SIZE, loc, bp);
+			loc += DNODE_BLOCK_SIZE;
+		}
+		/* "i" has been rounded up to a whole number of dnode blocks. */
+		total += i;
+
+		free(chunk);
+	}
+	dnode_cursor_finish(zfs, c);
+	/* Every chunk was freed above; reset the list head to empty. */
+	STAILQ_INIT(&os->dnodechunks);
+
+	/*
+	 * Write the object set itself.  The saved block pointer will be copied
+	 * into the referencing DSL dataset or the uberblocks.
+	 */
+	vdev_pwrite_data(zfs, DMU_OT_OBJSET, ZIO_CHECKSUM_FLETCHER_4, 0, 1,
+	    os->phys, os->osblksz, os->osloc, &os->osbp);
+}
+
+/*
+ * Finalize an object set and write it, together with its dnode array, to the
+ * vdev.
+ *
+ * Writing the MOS involves a chicken-and-egg problem: space maps cannot be
+ * written until all space has been allocated from the vdev, yet the MOS cannot
+ * be written without space for the meta-dnode's indirect blocks.  So instead
+ * of lazily allocating indirect blocks (which would be simpler), all space for
+ * the dnode array is reserved up front, before the space maps are written.
+ * For the MOS the array is sized to include one extra dnode per metaslab,
+ * since the space map dnodes are only allocated while writing the space maps.
+ */
+void
+objset_write(zfs_opt_t *zfs, zfs_objset_t *os)
+{
+	struct dnode_cursor *cursor;
+	off_t arrayloc, arraysz;
+	uint64_t ndnodes;
+	const int ismos = (os == zfs->mos);
+
+	ndnodes = os->dnodecount;
+	if (ismos)
+		ndnodes += zfs->mscount;
+
+	/* Reserve indirect blocks first, then the dnode array itself. */
+	arraysz = ndnodes * sizeof(dnode_phys_t);
+	cursor = dnode_cursor_init(zfs, os, &os->phys->os_meta_dnode, arraysz,
+	    DNODE_BLOCK_SIZE);
+	arraysz = roundup2(arraysz, DNODE_BLOCK_SIZE);
+	arrayloc = objset_space_alloc(zfs, os, &arraysz);
+
+	if (ismos) {
+		vdev_spacemap_write(zfs);
+
+		/*
+		 * We've finished allocating space, account for it in $MOS.
+		 */
+		dsl_dir_size_set(zfs->mosdsldir, os->space);
+	}
+
+	_objset_write(zfs, os, cursor, arrayloc);
+}
+
+/*
+ * Allocate the next dnode of an object set, with a bonus buffer of the given
+ * type and length.  The new object's ID is stored in "*idp" and a pointer to
+ * the initialized in-memory dnode is returned.  Object IDs are assigned
+ * sequentially.
+ */
+dnode_phys_t *
+objset_dnode_bonus_alloc(zfs_objset_t *os, uint8_t type, uint8_t bonustype,
+    uint16_t bonuslen, uint64_t *idp)
+{
+	struct objset_dnode_chunk *tail;
+	dnode_phys_t *dn;
+
+	assert(bonuslen <= DN_OLD_MAX_BONUSLEN);
+	assert(!STAILQ_EMPTY(&os->dnodechunks));
+
+	/* Dnodes always come from the last chunk; grow the list when full. */
+	tail = STAILQ_LAST(&os->dnodechunks, objset_dnode_chunk, next);
+	if (tail->nextfree == DNODES_PER_CHUNK) {
+		tail = ecalloc(1, sizeof(*tail));
+		STAILQ_INSERT_TAIL(&os->dnodechunks, tail, next);
+	}
+
+	dn = &tail->buf[tail->nextfree];
+	tail->nextfree++;
+	*idp = os->dnodecount;
+	os->dnodecount++;
+
+	dnode_init(dn, type, bonustype, bonuslen);
+	dn->dn_datablkszsec = os->osblksz >> MINBLOCKSHIFT;
+	return (dn);
+}
+
+/*
+ * Allocate a dnode that has no bonus buffer.  See objset_dnode_bonus_alloc().
+ */
+dnode_phys_t *
+objset_dnode_alloc(zfs_objset_t *os, uint8_t type, uint64_t *idp)
+{
+	dnode_phys_t *dn;
+
+	dn = objset_dnode_bonus_alloc(os, type, DMU_OT_NONE, 0, idp);
+	return (dn);
+}
+
+/*
+ * Return the in-memory dnode with the given object ID.  The ID must be
+ * non-zero and must have been allocated already.  Chunks are walked one by
+ * one; lookups are rare, so the linear search is acceptable.
+ */
+dnode_phys_t *
+objset_dnode_lookup(zfs_objset_t *os, uint64_t id)
+{
+	struct objset_dnode_chunk *chunk;
+	uint64_t idx;
+
+	assert(id > 0);
+	assert(id < os->dnodecount);
+
+	/* "idx" is the ID relative to the start of the current chunk. */
+	idx = id;
+	chunk = STAILQ_FIRST(&os->dnodechunks);
+	while (chunk != NULL) {
+		if (idx < DNODES_PER_CHUNK)
+			return (&chunk->buf[idx]);
+		idx -= DNODES_PER_CHUNK;
+		chunk = STAILQ_NEXT(chunk, next);
+	}
+
+	/* Not reached as long as the assertions above hold. */
+	assert(0);
+	return (NULL);
+}
+
+/*
+ * Allocate space from the vdev on behalf of an object set.  "*lenp" is updated
+ * by vdev_space_alloc() to the length actually allocated, and that length is
+ * charged to the object set.  Returns the location of the allocation.
+ */
+off_t
+objset_space_alloc(zfs_opt_t *zfs, zfs_objset_t *os, off_t *lenp)
+{
+	const off_t start = vdev_space_alloc(zfs, lenp);
+
+	os->space += *lenp;
+	return (start);
+}
+
+/*
+ * Return the number of bytes allocated to the object set so far.
+ */
+uint64_t
+objset_space(const zfs_objset_t *os)
+{
+	uint64_t used;
+
+	used = os->space;
+	return (used);
+}
+
+/*
+ * Copy the block pointer that refers to the object set into "*bp".  The
+ * pointer is only meaningful once the object set has been written.
+ */
+void
+objset_root_blkptr_copy(const zfs_objset_t *os, blkptr_t *bp)
+{
+	const blkptr_t *src = &os->osbp;
+
+	memcpy(bp, src, sizeof(*bp));
+}
diff --git a/usr.sbin/makefs/zfs/vdev.c b/usr.sbin/makefs/zfs/vdev.c
new file mode 100644
index 000000000000..1709a828b7c5
--- /dev/null
+++ b/usr.sbin/makefs/zfs/vdev.c
@@ -0,0 +1,435 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <util.h>
+
+#include "zfs.h"
+
+/*
+ * The checksum implementations are pulled in as source files.  Warnings about
+ * unused static functions are suppressed for them, presumably because only
+ * some of their helpers are used here.  The "GCC" pragma spelling is used
+ * because it is honoured by both GCC and clang, whereas "#pragma clang" is
+ * unknown to GCC and would leave the warnings enabled there.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#include "zfs/fletcher.c"
+#include "zfs/sha256.c"
+#pragma GCC diagnostic pop
+
+/*
+ * Initialize a block pointer describing an uncompressed block of "size" bytes
+ * at offset "off" on vdev 0.  Logical, physical and allocated sizes are all
+ * equal to "size", which must be a power of two.  "dntype", "level" and "fill"
+ * are stored as given, and "cksum" is the already-computed checksum of type
+ * "cksumt" for the block's contents.
+ */
+static void
+blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level,
+    uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum)
+{
+	dva_t *where;
+
+	assert(powerof2(size));
+
+	/* Start from a clean slate, then describe the block's properties. */
+	BP_ZERO(bp);
+	BP_SET_LSIZE(bp, size);
+	BP_SET_PSIZE(bp, size);
+	BP_SET_CHECKSUM(bp, cksumt);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+	BP_SET_BIRTH(bp, TXG, TXG);
+	BP_SET_LEVEL(bp, level);
+	BP_SET_FILL(bp, fill);
+	BP_SET_TYPE(bp, dntype);
+
+	/* There is a single copy of the block, always on vdev 0. */
+	where = BP_IDENTITY(bp);
+	DVA_SET_VDEV(where, 0);
+	DVA_SET_OFFSET(where, off);
+	DVA_SET_ASIZE(where, size);
+
+	memcpy(&bp->blk_cksum, cksum, sizeof(*cksum));
+}
+
+/*
+ * Write a block of data to the vdev.  The offset is always relative to the end
+ * of the second leading vdev label.
+ *
+ * "len" must be a non-zero power of two and the range [off, off + len) must
+ * lie entirely within the allocatable area of "asize" bytes; a write that ends
+ * exactly at "asize" is valid.  Short writes are retried until the whole
+ * buffer has been written; a write error is fatal.
+ *
+ * Consumers should generally use the helpers below, which provide block
+ * pointers and update dnode accounting, rather than calling this function
+ * directly.
+ */
+static void
+vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
+{
+	ssize_t n;
+
+	assert(off >= 0 && off < zfs->asize);
+	assert(powerof2(len));
+	/* The end of the range is exclusive, so it may equal asize. */
+	assert((off_t)len > 0 && off + (off_t)len > off &&
+	    off + (off_t)len <= zfs->asize);
+	if (zfs->spacemap != NULL) {
+		/*
+		 * Verify that the blocks being written were in fact allocated.
+		 *
+		 * The space map isn't available once the on-disk space map is
+		 * finalized, so this check doesn't quite catch everything.
+		 */
+		assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
+		    (off + len - 1) >> zfs->ashift, 1));
+	}
+
+	/* Convert to an offset within the image file. */
+	off += VDEV_LABEL_START_SIZE;
+	for (size_t sofar = 0; sofar < len; sofar += n) {
+		n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
+		    off + sofar);
+		if (n < 0)
+			err(1, "pwrite");
+		assert(n > 0);
+	}
+}
+
+/*
+ * Checksum a block, fill in the block pointer "bp" that describes it, and
+ * write the block to the vdev at "loc".  Only Fletcher-4 checksums are
+ * supported.  "datatype", "level" and "fill" are recorded in the block
+ * pointer.
+ */
+void
+vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
+    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
+    blkptr_t *bp)
+{
+	zio_cksum_t sum;
+
+	assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);
+
+	fletcher_4_native(data, sz, NULL, &sum);
+	blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &sum);
+
+	vdev_pwrite(zfs, data, sz, loc);
+}
+
+/*
+ * Write a block that belongs to "dnode" at the given indirection level, using
+ * the dnode's own object type and checksum algorithm, and charge the block's
+ * size to the dnode.  The dnode must account for used space in bytes.
+ */
+void
+vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
+    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
+{
+	const uint8_t objtype = dnode->dn_type;
+	const uint8_t cksumtype = dnode->dn_checksum;
+
+	vdev_pwrite_data(zfs, objtype, cksumtype, level, fill, data, sz, loc,
+	    bp);
+
+	assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);
+	dnode->dn_used = dnode->dn_used + sz;
+}
+
+/*
+ * Write the single level-0 data block of "dnode".  The block is referenced
+ * from the dnode's first block pointer and has a fill count of one.
+ */
+void
+vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
+    off_t sz, off_t loc)
+{
+	blkptr_t *bp = &dnode->dn_blkptr[0];
+
+	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc, bp);
+}
+
+/*
+ * Compute the embedded checksum of a label region.
+ *
+ * The region is "size" bytes at "buf" and ends with a zio_eck_t trailer.  The
+ * trailer's checksum field is first seeded with "off" and the SHA-256 digest
+ * of the whole region is then computed and stored in its place.
+ */
+static void
+vdev_label_set_checksum(void *buf, off_t off, off_t size)
+{
+	zio_eck_t *trailer;
+	zio_cksum_t digest;
+
+	assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));
+
+	/* The trailer occupies the last bytes of the region. */
+	trailer = (zio_eck_t *)((char *)buf + size - sizeof(zio_eck_t));
+	trailer->zec_magic = ZEC_MAGIC;
+	ZIO_SET_CHECKSUM(&trailer->zec_cksum, off, 0, 0, 0);
+
+	zio_checksum_SHA256(buf, size, NULL, &digest);
+	trailer->zec_cksum = digest;
+}
+
+/*
+ * Set embedded checksums and write the label at the specified index.
+ *
+ * "ind" selects one of the VDEV_LABELS label slots: indices 0 and 1 are placed
+ * at the start of the vdev and the remaining ones at its end.  "labelp" is not
+ * modified; checksums are set in a private copy.  Short writes are retried, as
+ * in vdev_pwrite(), and a write error is fatal.
+ */
+void
+vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
+{
+	vdev_label_t *label;
+	ssize_t n;
+	off_t blksz, loff;
+
+	assert(ind >= 0 && ind < VDEV_LABELS);
+
+	/*
+	 * Make a copy since we have to modify the label to set checksums.
+	 */
+	label = ecalloc(1, sizeof(*label));
+	memcpy(label, labelp, sizeof(*label));
+
+	if (ind < 2)
+		loff = ind * sizeof(*label);
+	else
+		loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);
+
+	/*
+	 * Set the verifier checksum for the boot block.  We don't use it, but
+	 * the FreeBSD loader reads it and will complain if the checksum isn't
+	 * valid.
+	 */
+	vdev_label_set_checksum(&label->vl_be,
+	    loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));
+
+	/*
+	 * Set the verifier checksum for the label.
+	 */
+	vdev_label_set_checksum(&label->vl_vdev_phys,
+	    loff + __offsetof(vdev_label_t, vl_vdev_phys),
+	    sizeof(label->vl_vdev_phys));
+
+	/*
+	 * Set the verifier checksum for the uberblocks.  There is one uberblock
+	 * per sector; for example, with an ashift of 12 we end up with
+	 * 128KB/4KB=32 copies of the uberblock in the ring.
+	 */
+	blksz = 1 << zfs->ashift;
+	assert(sizeof(label->vl_uberblock) % blksz == 0);
+	for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
+	    roff += blksz) {
+		vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
+		    loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
+		    blksz);
+	}
+
+	/*
+	 * Write the whole label, retrying after a short write rather than
+	 * relying on an assertion to notice it.
+	 */
+	for (size_t sofar = 0; sofar < sizeof(*label); sofar += n) {
+		n = pwrite(zfs->fd, (const char *)label + sofar,
+		    sizeof(*label) - sofar, loff + sofar);
+		if (n < 0)
+			err(1, "writing vdev label");
+		if (n == 0)
+			errx(1, "short write of vdev label");
+	}
+
+	free(label);
+}
+
+/*
+ * Find and reserve a run of contiguous free space for a request of *lenp
+ * bytes, according to the following rules:
+ * 1. The length is first rounded up to a multiple of the minimum block size.
+ * 2. If the result is smaller than MAXBLOCKSIZE (128KB), the run's length is
+ *    the smallest power of 2 equal to or larger than it.
+ * 3. Otherwise the run's length is the smallest multiple of MAXBLOCKSIZE equal
+ *    to or larger than it.
+ * 4. The run's starting offset is aligned to the run's length, or to
+ *    MAXBLOCKSIZE for runs longer than that.
+ *
+ * On return *lenp holds the length actually reserved, and the return value is
+ * the byte offset of the run.  Running out of space is fatal.
+ *
+ * XXX-MJ the alignment rule isn't actually required, so this can just be a
+ * dumb bump allocator.  Maybe there's some benefit to keeping large blocks
+ * aligned, so let's keep it for now and hope we don't get too much
+ * fragmentation.  Alternately we could try to allocate all blocks of a certain
+ * size from the same metaslab.
+ */
+off_t
+vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
+{
+	off_t len;
+	int align, loc, minblksz, nbits;
+
+	minblksz = 1 << zfs->ashift;
+	len = roundup2(*lenp, minblksz);
+
+	assert(len != 0);
+	assert(len / minblksz <= INT_MAX);
+
+	/* Settle the final length and the alignment, both in blocks. */
+	if (len >= MAXBLOCKSIZE) {
+		len = roundup2(len, MAXBLOCKSIZE);
+		align = MAXBLOCKSIZE / minblksz;
+	} else {
+		if (!powerof2(len))
+			len = (off_t)1 << flsll(len);
+		align = len / minblksz;
+	}
+	nbits = len / minblksz;
+
+	/*
+	 * Search for a free run of "nbits" blocks.  When a candidate is not
+	 * suitably aligned, resume the search from the next aligned position.
+	 */
+	loc = 0;
+	for (;;) {
+		bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
+		    &loc);
+		if (loc == -1) {
+			errx(1, "failed to find %ju bytes of space",
+			    (uintmax_t)len);
+		}
+		if ((loc & (align - 1)) == 0)
+			break;
+		loc = roundup2(loc, align);
+	}
+
+	/* Mark the run as allocated and report what was handed out. */
+	assert(loc + nbits > loc);
+	bit_nset(zfs->spacemap, loc, loc + nbits - 1);
+	*lenp = len;
+
+	return ((off_t)loc << zfs->ashift);
+}
+
+/*
+ * Allocate the in-memory allocation bitmap, with one bit per minimum-sized
+ * block.  Only whole metaslabs are covered: the allocatable size is rounded
+ * down to a multiple of the metaslab size, which must be a power of two.
+ */
+static void
+vdev_spacemap_init(zfs_opt_t *zfs)
+{
+	uint64_t nblocks;
+
+	assert(powerof2(zfs->mssize));
+
+	/*
+	 * The bitmap is indexed with ints, so the block count must fit in one.
+	 * With 512-byte blocks and a 32-bit int this caps the allocatable area
+	 * at about 1TB.  That should be enough for anyone.
+	 */
+	nblocks = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift;
+	if (nblocks > INT_MAX)
+		errx(1, "image size is too large");
+
+	zfs->spacemapbits = (int)nblocks;
+	zfs->spacemap = bit_alloc(zfs->spacemapbits);
+	if (zfs->spacemap == NULL)
+		err(1, "bitstring allocation failed");
+}
+
+/*
+ * Write the on-disk space maps, one per metaslab, and the object array that
+ * lists them.
+ *
+ * All dnodes and blocks needed for the space maps are allocated first.  The
+ * in-memory allocation bitmap is then detached from "zfs" and converted, one
+ * metaslab at a time, into space map records.  The bitmap is freed before
+ * returning, so no further space can be allocated from the vdev.
+ */
+void
+vdev_spacemap_write(zfs_opt_t *zfs)
+{
+	dnode_phys_t *objarr;
+	bitstr_t *spacemap;
+	uint64_t *objarrblk;
+	off_t smblksz, objarrblksz, objarrloc;
+
+	/* Per-metaslab bookkeeping for the space map objects. */
+	struct {
+		dnode_phys_t	*dnode;
+		uint64_t	dnid;
+		off_t		loc;
+	} *sma;
+
+	/* The object array holds one 64-bit object ID per metaslab. */
+	objarrblksz = sizeof(uint64_t) * zfs->mscount;
+	assert(objarrblksz <= MAXBLOCKSIZE);
+	objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz);
+	objarrblk = ecalloc(1, objarrblksz);
+
+	objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid);
+	objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;
+
+	/*
+	 * Use the smallest block size for space maps.  The space allocation
+	 * algorithm should aim to minimize the number of holes.
+	 */
+	smblksz = 1 << zfs->ashift;
+
+	/*
+	 * First allocate dnodes and space for all of our space maps.  No more
+	 * space can be allocated from the vdev after this point.
+	 */
+	sma = ecalloc(zfs->mscount, sizeof(*sma));
+	for (uint64_t i = 0; i < zfs->mscount; i++) {
+		sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos,
+		    DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER,
+		    sizeof(space_map_phys_t), &sma[i].dnid);
+		sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz);
+	}
+	/*
+	 * Detach the bitmap.  With zfs->spacemap set to NULL, vdev_pwrite()
+	 * skips its check that written blocks were allocated.
+	 */
+	spacemap = zfs->spacemap;
+	zfs->spacemap = NULL;
+
+	/*
+	 * Now that the set of allocated space is finalized, populate each space
+	 * map and write it to the vdev.
+	 */
+	for (uint64_t i = 0; i < zfs->mscount; i++) {
+		space_map_phys_t *sm;
+		uint64_t alloc, length, *smblk;
+		int shift, startb, endb, srunb, erunb;
+
+		/*
+		 * We only allocate a single block for this space map, but
+		 * OpenZFS assumes that a space map object with sufficient bonus
+		 * space supports histograms.
+		 */
+		sma[i].dnode->dn_nblkptr = 3;
+		sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;
+
+		smblk = ecalloc(1, smblksz);
+
+		/*
+		 * Scan the bits [startb, endb) that belong to metaslab "i".
+		 * "length" counts 64-bit words emitted so far and "alloc"
+		 * counts allocated bytes.
+		 */
+		alloc = length = 0;
+		shift = zfs->msshift - zfs->ashift;
+		for (srunb = startb = i * (1 << shift),
+		    endb = (i + 1) * (1 << shift);
+		    srunb < endb; srunb = erunb) {
+			uint64_t runlen, runoff;
+
+			/* Find a run of allocated space. */
+			bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb);
+			if (srunb == -1 || srunb >= endb)
+				break;
+
+			/* Find its end, clamped to the end of the metaslab. */
+			bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb);
+			if (erunb == -1 || erunb > endb)
+				erunb = endb;
+
+			/*
+			 * The space represented by [srunb, erunb) has been
+			 * allocated.  Add a record to the space map to indicate
+			 * this.  Run offsets are relative to the beginning of
+			 * the metaslab.
+			 */
+			runlen = erunb - srunb;
+			runoff = srunb - startb;
+
+			/* Each record occupies two consecutive 64-bit words. */
+			assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
+			smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
+			    SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
+			smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
+			    SM2_OFFSET_ENCODE(runoff);
+
+			alloc += runlen << zfs->ashift;
+			length += 2;
+		}
+
+		/* The space map header lives in the dnode's bonus buffer. */
+		sm = DN_BONUS(sma[i].dnode);
+		sm->smp_length = length * sizeof(uint64_t);
+		sm->smp_alloc = alloc;
+
+		vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
+		    sma[i].loc);
+		free(smblk);
+
+		/* Record this space map in the space map object array. */
+		objarrblk[i] = sma[i].dnid;
+	}
+
+	/*
+	 * All of the space maps are written, now write the object array.
+	 */
+	vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
+	free(objarrblk);
+
+	assert(zfs->spacemap == NULL);
+	free(spacemap);
+	free(sma);
+}
+
+/*
+ * Create (or truncate) the image file, extend it to the vdev size, and set up
+ * the in-memory allocation bitmap.  Failure to open or extend the file is
+ * fatal.
+ */
+void
+vdev_init(zfs_opt_t *zfs, const char *image)
+{
+	int fd;
+
+	assert(zfs->ashift >= MINBLOCKSHIFT);
+
+	fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
+	zfs->fd = fd;
+	if (fd == -1)
+		err(1, "Can't open `%s' for writing", image);
+
+	if (ftruncate(fd, zfs->vdevsize) != 0)
+		err(1, "Failed to extend image file `%s'", image);
+
+	vdev_spacemap_init(zfs);
+}
+
+/*
+ * Close the image file, if it is open.  The allocation bitmap must already
+ * have been released, which happens when the space maps are written.
+ */
+void
+vdev_fini(zfs_opt_t *zfs)
+{
+	assert(zfs->spacemap == NULL);
+
+	if (zfs->fd == -1)
+		return;
+
+	if (close(zfs->fd) != 0)
+		err(1, "close");
+	zfs->fd = -1;
+}
diff --git a/usr.sbin/makefs/zfs/zap.c b/usr.sbin/makefs/zfs/zap.c
new file mode 100644
index 000000000000..398c0fbf029c
--- /dev/null
+++ b/usr.sbin/makefs/zfs/zap.c
@@ -0,0 +1,551 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/endian.h>
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+
+#include <util.h>
+
+#include "makefs.h"
+#include "zfs.h"
+
+/*
+ * One key-value pair of an in-memory ZAP.  The value is an array of "intcnt"
+ * integers, each "intsz" bytes wide.  The anonymous union gives differently
+ * typed views of the same value pointer; for a single 64-bit value it points
+ * at the embedded "val64" instead of a separate allocation.
+ */
+typedef struct zfs_zap_entry {
+	char		*name;		/* entry key, private copy */
+	uint64_t	hash;		/* key hash */
+	union {
+		uint8_t	 *valp;
+		uint16_t *val16p;
+		uint32_t *val32p;
+		uint64_t *val64p;
+	};				/* entry value, an integer array */
+	uint64_t	val64;		/* embedded value for a common case */
+	size_t		intsz;		/* array element size; 1, 2, 4 or 8 */
+	size_t		intcnt;		/* array size */
+	STAILQ_ENTRY(zfs_zap_entry) next;	/* linkage in zfs_zap.kvps */
+} zfs_zap_entry_t;
+
+/*
+ * An in-memory ZAP under construction.  Entries are kept in insertion order.
+ * "micro" starts out true and is cleared as soon as an entry is added that
+ * does not fit the micro ZAP format.
+ */
+struct zfs_zap {
+	STAILQ_HEAD(, zfs_zap_entry) kvps;	/* key-value pairs */
+	uint64_t	hashsalt;	/* key hash input */
+	unsigned long	kvpcnt;		/* number of key-value pairs */
+	unsigned long	chunks;		/* count of chunks needed for fat ZAP */
+	bool		micro;		/* can this be a micro ZAP? */
+
+	dnode_phys_t	*dnode;		/* backpointer */
+	zfs_objset_t	*os;		/* backpointer */
+};
+
+/*
+ * Return the number of fat ZAP leaf chunks an entry occupies: one for the
+ * entry itself, plus as many array chunks as are needed to hold the
+ * NUL-terminated name and the value.
+ */
+static uint16_t
+zap_entry_chunks(zfs_zap_entry_t *ent)
+{
+	size_t namechunks, valchunks;
+
+	namechunks = howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES);
+	valchunks = howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES);
+
+	return (1 + namechunks + valchunks);
+}
+
+/*
+ * Compute the hash of a ZAP key: a CRC-64 of the name, seeded with the ZAP's
+ * salt, of which only the most significant ZAP_HASHBITS bits are kept.  The
+ * salt must be non-zero.
+ */
+static uint64_t
+zap_hash(uint64_t salt, const char *name)
+{
+	static uint64_t crc64_table[256];
+	const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
+	uint64_t crc;
+
+	assert(salt != 0);
+
+	/*
+	 * Build the lookup table on first use.  Entry 128 equals the
+	 * polynomial once the table is populated, so it doubles as the
+	 * "initialized" marker.
+	 */
+	if (crc64_table[128] == 0) {
+		for (int i = 0; i < 256; i++) {
+			uint64_t v;
+
+			v = i;
+			for (int bit = 0; bit < 8; bit++)
+				v = (v >> 1) ^ (-(v & 1) & crc64_poly);
+			crc64_table[i] = v;
+		}
+	}
+	assert(crc64_table[128] == crc64_poly);
+
+	crc = salt;
+	for (const uint8_t *cp = (const uint8_t *)name; *cp != '\0'; cp++)
+		crc = (crc >> 8) ^ crc64_table[(crc ^ *cp) & 0xFF];
+
+	/*
+	 * Keep only the high ZAP_HASHBITS bits; the remaining low bits are
+	 * needed in the cookie for the collision differentiator.  It MUST be
+	 * the high bits that are kept, since those are examined first when a
+	 * bucket is chosen.
+	 */
+	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+	return (crc);
+}
+
+/*
+ * Create an empty in-memory ZAP for the object described by "dnode", which
+ * belongs to object set "os".  The ZAP gets a random hash salt and is assumed
+ * to be representable as a micro ZAP until an entry proves otherwise.
+ */
+zfs_zap_t *
+zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode)
+{
+	zfs_zap_t *zap;
+
+	zap = ecalloc(1, sizeof(*zap));
+
+	/* Back pointers. */
+	zap->os = os;
+	zap->dnode = dnode;
+
+	/* No entries yet. */
+	STAILQ_INIT(&zap->kvps);
+	zap->kvpcnt = 0;
+	zap->chunks = 0;
+	zap->micro = true;
+
+	zap->hashsalt = ((uint64_t)random() << 32) | random();
+
+	return (zap);
+}
+
+/*
+ * Append a key-value pair to a ZAP.
+ *
+ * "val" points to an array of "intcnt" integers of "intsz" bytes each; both
+ * the name and the value are copied.  "intsz" must be 1, 2, 4 or 8, and the
+ * name and value must respect ZAP_MAXNAMELEN and ZAP_MAXVALUELEN.  No check
+ * for duplicate keys is made here.
+ */
+void
+zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
+    const uint8_t *val)
+{
+	zfs_zap_entry_t *ent;
+	const size_t namesz = strlen(name) + 1;
+	const size_t valsz = intcnt * intsz;
+	const bool single64 = (intsz == sizeof(uint64_t) && intcnt == 1);
+
+	assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
+	assert(namesz <= ZAP_MAXNAMELEN);
+	assert(intcnt <= ZAP_MAXVALUELEN && valsz <= ZAP_MAXVALUELEN);
+
+	ent = ecalloc(1, sizeof(*ent));
+	ent->name = estrdup(name);
+	ent->hash = zap_hash(zap->hashsalt, ent->name);
+	ent->intsz = intsz;
+	ent->intcnt = intcnt;
+
+	/*
+	 * A single 64-bit value, as used for directory entries, is by far the
+	 * most common case; store it inside the entry to save an allocation.
+	 */
+	if (single64)
+		ent->val64p = &ent->val64;
+	else
+		ent->valp = ecalloc(intcnt, intsz);
+	memcpy(ent->valp, val, valsz);
+
+	STAILQ_INSERT_TAIL(&zap->kvps, ent, next);
+	zap->kvpcnt++;
+	zap->chunks += zap_entry_chunks(ent);
+
+	/*
+	 * A micro ZAP can only hold single 64-bit values with short names, and
+	 * only a limited number of them.
+	 */
+	if (!zap->micro)
+		return;
+	if (!single64 || namesz > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX)
+		zap->micro = false;
+}
+
+/*
+ * Add an entry whose value is a single 64-bit integer.
+ */
+void
+zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
+{
+	const uint8_t *raw = (const uint8_t *)&val;
+
+	zap_add(zap, name, sizeof(val), 1, raw);
+}
+
+/*
+ * Add an entry whose value is a string.  The value is stored as an array of
+ * one-byte integers that includes the terminating NUL.
+ */
+void
+zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
+{
+	/*
+	 * zap_add() takes the value as raw bytes; convert explicitly instead
+	 * of relying on an implicit conversion between incompatible pointer
+	 * types.
+	 */
+	zap_add(zap, name, 1, strlen(val) + 1, (const uint8_t *)val);
+}
+
+/*
+ * Report whether the ZAP has an entry with the given name.  Names are compared
+ * exactly, using a linear scan of the entries.
+ */
+bool
+zap_entry_exists(zfs_zap_t *zap, const char *name)
+{
+	zfs_zap_entry_t *ent;
+	bool found;
+
+	found = false;
+	STAILQ_FOREACH(ent, &zap->kvps, next) {
+		if (strcmp(name, ent->name) == 0) {
+			found = true;
+			break;
+		}
+	}
+	return (found);
+}
+
+/*
+ * Write a ZAP in the micro ZAP format: a header followed by one fixed-size
+ * slot per entry, all in a single block.  The block is assembled in the shared
+ * file buffer, space for it is allocated from the ZAP's object set, and it
+ * becomes the only data block of the ZAP's dnode.
+ */
+static void
+zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+	dnode_phys_t *dnode;
+	zfs_zap_entry_t *ent;
+	mzap_phys_t *mzap;
+	mzap_ent_phys_t *slot;
+	off_t bytes, loc;
+
+	/* Header. */
+	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
+	mzap = (mzap_phys_t *)&zfs->filebuf[0];
+	mzap->mz_block_type = ZBT_MICRO;
+	mzap->mz_salt = zap->hashsalt;
+	mzap->mz_normflags = 0;
+
+	/*
+	 * The header already has room for the first slot.
+	 *
+	 * NOTE(review): kvpcnt is unsigned, so for an empty ZAP "kvpcnt - 1"
+	 * wraps around and the sum comes out one slot smaller than the header.
+	 * Presumably harmless because the allocation is rounded up -- confirm.
+	 */
+	bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*slot);
+	assert(bytes <= (off_t)MZAP_MAX_BLKSZ);
+
+	/* One slot per entry, in insertion order. */
+	slot = &mzap->mz_chunk[0];
+	STAILQ_FOREACH(ent, &zap->kvps, next) {
+		memcpy(&slot->mze_value, ent->valp, ent->intsz * ent->intcnt);
+		slot->mze_cd = 0; /* XXX-MJ */
+		strlcpy(slot->mze_name, ent->name, sizeof(slot->mze_name));
+		slot++;
+	}
+
+	/* The allocator may round "bytes" up; use the final size below. */
+	loc = objset_space_alloc(zfs, zap->os, &bytes);
+
+	dnode = zap->dnode;
+	dnode->dn_maxblkid = 0;
+	dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;
+	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+
+	vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
+}
+
+/*
+ * Write some data to the fat ZAP leaf chunk starting at index "li".
+ *
+ * "sz" bytes from "val" are spread over as many consecutive array chunks as
+ * needed.  Each chunk is linked to the one that follows it and the last chunk
+ * is marked as the end of the chain.  Every chunk used must currently be
+ * free.  If "sz" is zero, no chunk is touched.
+ *
+ * Note that individual integers in the value may be split among consecutive
+ * chunks.
+ */
+static void
+zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
+    const uint8_t *val)
+{
+	struct zap_leaf_array *la;
+
+	assert(sz <= ZAP_MAXVALUELEN);
+
+	/* Stays NULL when there is nothing to write. */
+	la = NULL;
+	for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
+		n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);
+
+		la = &ZAP_LEAF_CHUNK(l, li).l_array;
+		assert(la->la_type == ZAP_CHUNK_FREE);
+		la->la_type = ZAP_CHUNK_ARRAY;
+		memcpy(la->la_array, val, n);
+		la->la_next = li + 1;
+	}
+
+	/*
+	 * Terminate the chain at the last chunk written.  Without the guard an
+	 * uninitialized pointer would be dereferenced for an empty array.
+	 */
+	if (la != NULL)
+		la->la_next = 0xffff;
+}
+
+/*
+ * Find the shortest hash prefix length which lets us distribute keys without
+ * overflowing a leaf block.  This is not (space) optimal, but is simple, and
+ * directories large enough to overflow a single 128KB leaf block are uncommon.
+ */
+static unsigned int
+zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
+{
+	const unsigned int limit = (unsigned int)l->l_bs;
+	unsigned int bits;
+	bool fits;
+
+	/* All chunks will fit in a single leaf block: no prefix needed. */
+	if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l))
+		return (0);
+
+	/*
+	 * Try successively longer prefixes.  For each one, tally the chunks
+	 * that would land in every leaf and stop at the first prefix length
+	 * for which no leaf overflows.
+	 */
+	fits = false;
+	for (bits = 1; bits < limit; bits++) {
+		zfs_zap_entry_t *kvp;
+		uint32_t *used;
+
+		used = ecalloc(1u << bits, sizeof(*used));
+		fits = true;
+		STAILQ_FOREACH(kvp, &zap->kvps, next) {
+			uint64_t leaf;
+			uint16_t need;
+
+			leaf = ZAP_HASH_IDX(kvp->hash, bits);
+			need = zap_entry_chunks(kvp);
+			if (ZAP_LEAF_NUMCHUNKS(l) - used[leaf] < need) {
+				/* Not enough space, grow the prefix. */
+				fits = false;
+				break;
+			}
+			used[leaf] += need;
+		}
+		free(used);
+
+		if (fits)
+			break;
+	}
+
+	/*
+	 * If this fails, then we need to expand the pointer table.  For now
+	 * this situation is unhandled since it is hard to trigger.
+	 */
+	assert(bits < limit);
+
+	return (bits);
+}
+
+/*
+ * Initialize a fat ZAP leaf block.
+ */
+static void
+zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
+{
+	zap_leaf_phys_t *phys = l->l_phys;
+
+	/* Header: identify the block and record the prefix it covers. */
+	phys->l_hdr.lh_block_type = ZBT_LEAF;
+	phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+	phys->l_hdr.lh_prefix = prefix;
+	phys->l_hdr.lh_prefix_len = prefixlen;
+
+	/* Every chunk starts out free; 0xffff itself is used as a marker. */
+	phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+	assert(phys->l_hdr.lh_nfree < 0xffff);
+
+	/* Hash table: set every byte to 0xff so all buckets start empty. */
+	memset(phys->l_hash, 0xff,
+	    ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*phys->l_hash));
+
+	/*
+	 * Chain the chunks into a free list; the last one terminates it.
+	 */
+	for (uint16_t idx = 0; idx < ZAP_LEAF_NUMCHUNKS(l); idx++) {
+		struct zap_leaf_free *chunk = &ZAP_LEAF_CHUNK(l, idx).l_free;
+		bool last = (idx + 1 == ZAP_LEAF_NUMCHUNKS(l));
+
+		chunk->lf_type = ZAP_CHUNK_FREE;
+		chunk->lf_next = last ? 0xffff : idx + 1;
+	}
+}
+
+static void
+zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+	struct dnode_cursor *c;
+	zap_leaf_t l;		/* handle re-pointed at each leaf block in turn */
+	zap_phys_t *zaphdr;
+	struct zap_table_phys *zt;
+	zfs_zap_entry_t *ent;
+	dnode_phys_t *dnode;
+	uint8_t *leafblks;	/* lblkcnt contiguous leaf blocks */
+	uint64_t lblkcnt, *ptrhasht;
+	off_t loc, blksz;
+	size_t blkshift;
+	unsigned int prefixlen;
+	int ptrcnt;
+
+	/*
+	 * Build the fat ZAP image in memory, then write it out.  For
+	 * simplicity, always use the largest block size: fine since most
+	 * directories are micro ZAPs, but wasteful for small fat ZAPs.
+	 */
+	blkshift = MAXBLOCKSHIFT;
+	blksz = (off_t)1 << blkshift;
+
+	/*
+	 * The pointer table lives in the second half of the header block,
+	 * giving 8192 entries; enough except for massive directories.
+	 */
+	ptrcnt = (blksz / 2) / sizeof(uint64_t);
+
+	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
+	zaphdr = (zap_phys_t *)&zfs->filebuf[0];
+	zaphdr->zap_block_type = ZBT_HEADER;
+	zaphdr->zap_magic = ZAP_MAGIC;
+	zaphdr->zap_num_entries = zap->kvpcnt;
+	zaphdr->zap_salt = zap->hashsalt;
+
+	l.l_bs = blkshift;
+	l.l_phys = NULL;
+
+	zt = &zaphdr->zap_ptrtbl;
+	zt->zt_blk = 0;
+	zt->zt_numblks = 0;
+	zt->zt_shift = flsll(ptrcnt) - 1;	/* log2(ptrcnt) */
+	zt->zt_nextblk = 0;
+	zt->zt_blks_copied = 0;
+
+	/*
+	 * How many leaf blocks do we need?  Initialize them and update the
+	 * header.
+	 */
+	prefixlen = zap_fat_write_prefixlen(zap, &l);
+	lblkcnt = 1 << prefixlen;
+	leafblks = ecalloc(lblkcnt, blksz);
+	for (unsigned int li = 0; li < lblkcnt; li++) {
+		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
+		zap_fat_write_leaf_init(&l, li, prefixlen);
+	}
+	zaphdr->zap_num_leafs = lblkcnt;
+	zaphdr->zap_freeblk = lblkcnt + 1;	/* header is block 0, leaves 1..lblkcnt */
+
+	/*
+	 * For each entry, figure out which leaf block it belongs to based on
+	 * the upper bits of its hash, allocate chunks from that leaf, and fill
+	 * them out.
+	 */
+	ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
+	STAILQ_FOREACH(ent, &zap->kvps, next) {
+		struct zap_leaf_entry *le;
+		uint16_t *lptr;
+		uint64_t hi, li;
+		uint16_t namelen, nchunks, nnamechunks, nvalchunks;
+
+		hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
+		li = ZAP_HASH_IDX(ent->hash, prefixlen);
+		assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
+		ptrhasht[hi] = li + 1;	/* leaf li is stored in block li + 1 */
+		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
+
+		namelen = strlen(ent->name) + 1;	/* includes the NUL */
+
+		/*
+		 * Chunks needed: one entry chunk plus name and value arrays.
+		 */
+		nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
+		nvalchunks = howmany(ent->intcnt,
+		    ZAP_LEAF_ARRAY_BYTES / ent->intsz);
+		nchunks = 1 + nnamechunks + nvalchunks;
+
+		/*
+		 * Allocate a run of free leaf chunks for this entry,
+		 * potentially extending a hash chain.
+		 */
+		assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
+		l.l_phys->l_hdr.lh_nfree -= nchunks;
+		l.l_phys->l_hdr.lh_nentries++;
+		lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
+		while (*lptr != 0xffff) {
+			assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
+			le = ZAP_LEAF_ENTRY(&l, *lptr);
+			assert(le->le_type == ZAP_CHUNK_ENTRY);
+			le->le_cd++;	/* bump le_cd of each entry already chained */
+			lptr = &le->le_next;
+		}
+		*lptr = l.l_phys->l_hdr.lh_freelist;	/* append at the chain's tail */
+		l.l_phys->l_hdr.lh_freelist += nchunks;
+		assert(l.l_phys->l_hdr.lh_freelist <=
+		    ZAP_LEAF_NUMCHUNKS(&l));
+		if (l.l_phys->l_hdr.lh_freelist ==
+		    ZAP_LEAF_NUMCHUNKS(&l))
+			l.l_phys->l_hdr.lh_freelist = 0xffff;
+
+		/*
+		 * Integer values must be stored big-endian; convert in place.
+		 */
+		switch (ent->intsz) {
+		case 1:
+			break;
+		case 2:
+			for (uint16_t *v = ent->val16p;
+			    v - ent->val16p < (ptrdiff_t)ent->intcnt;
+			    v++)
+				*v = htobe16(*v);
+			break;
+		case 4:
+			for (uint32_t *v = ent->val32p;
+			    v - ent->val32p < (ptrdiff_t)ent->intcnt;
+			    v++)
+				*v = htobe32(*v);
+			break;
+		case 8:
+			for (uint64_t *v = ent->val64p;
+			    v - ent->val64p < (ptrdiff_t)ent->intcnt;
+			    v++)
+				*v = htobe64(*v);
+			break;
+		default:
+			assert(0);
+		}
+
+		/*
+		 * Finally, write out the leaf chunks for this entry.
+		 */
+		le = ZAP_LEAF_ENTRY(&l, *lptr);
+		assert(le->le_type == ZAP_CHUNK_FREE);
+		le->le_type = ZAP_CHUNK_ENTRY;
+		le->le_next = 0xffff;
+		le->le_name_chunk = *lptr + 1;	/* name follows the entry chunk */
+		le->le_name_numints = namelen;
+		le->le_value_chunk = *lptr + 1 + nnamechunks;	/* then the value */
+		le->le_value_intlen = ent->intsz;
+		le->le_value_numints = ent->intcnt;
+		le->le_hash = ent->hash;
+		zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name);
+		zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
+		    ent->intcnt * ent->intsz, ent->valp);
+	}
+
+	/*
+	 * Point unused slots at the leaf owning their top prefixlen bits.
+	 */
+	for (int i = 0; i < ptrcnt; i++)	/* NOTE(review): assumes prefixlen <= zt_shift; confirm */
+		if (ptrhasht[i] == 0)
+			ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;
+
+	/*
+	 * Write the whole thing to disk.
+	 */
+	dnode = zap->dnode;
+	dnode->dn_nblkptr = 1;
+	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
+	dnode->dn_maxblkid = lblkcnt + 1;	/* NOTE(review): blocks 0..lblkcnt are written; confirm the + 1 */
+	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
+
+	c = dnode_cursor_init(zfs, zap->os, zap->dnode,
+	    (lblkcnt + 1) * blksz, blksz);
+
+	loc = objset_space_alloc(zfs, zap->os, &blksz);
+	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
+	    dnode_cursor_next(zfs, c, 0));	/* header block at offset 0 */
+
+	for (uint64_t i = 0; i < lblkcnt; i++) {
+		loc = objset_space_alloc(zfs, zap->os, &blksz);
+		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
+		    blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
+	}
+
+	dnode_cursor_finish(zfs, c);
+
+	free(leafblks);
+}
+
+void
+zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
+{
+	zfs_zap_entry_t *kvp;
+
+	if (!zap->micro) {
+		assert(!STAILQ_EMPTY(&zap->kvps));
+		assert(zap->kvpcnt > 0);
+		zap_fat_write(zfs, zap);
+	} else {
+		zap_micro_write(zfs, zap);
+	}
+
+	/* The ZAP is consumed: release each entry, then the ZAP itself. */
+	for (; (kvp = STAILQ_FIRST(&zap->kvps)) != NULL; free(kvp)) {
+		STAILQ_REMOVE_HEAD(&zap->kvps, next);
+		if (kvp->val64p != &kvp->val64)
+			free(kvp->valp);	/* value is not the inline val64 */
+		free(kvp->name);
+	}
+	free(zap);
+}
diff --git a/usr.sbin/makefs/zfs/zfs.h b/usr.sbin/makefs/zfs/zfs.h
new file mode 100644
index 000000000000..b92e2c035669
--- /dev/null
+++ b/usr.sbin/makefs/zfs/zfs.h
@@ -0,0 +1,167 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _MAKEFS_ZFS_H_
+#define	_MAKEFS_ZFS_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+
+#include <bitstring.h>
+#include <stdbool.h>
+
+#include "makefs.h"
+
+#include "zfs/nvlist.h"
+#define	ASSERT		assert
+#include "zfs/zfsimpl.h"
+
+#define	MAXBLOCKSHIFT		17	/* 128KB */
+#define	MAXBLOCKSIZE		((off_t)(1 << MAXBLOCKSHIFT))	/* in bytes */
+_Static_assert(MAXBLOCKSIZE == SPA_OLDMAXBLOCKSIZE, "");	/* must match the SPA constant */
+#define	MINBLOCKSHIFT		9	/* 512B */
+#define	MINBLOCKSIZE		((off_t)(1 << MINBLOCKSHIFT))	/* in bytes */
+_Static_assert(MINBLOCKSIZE == SPA_MINBLOCKSIZE, "");	/* must match the SPA constant */
+#define	MINDEVSIZE		((off_t)SPA_MINDEVSIZE)	/* SPA_MINDEVSIZE as an off_t */
+
+/* All data was written in this transaction group. */
+#define	TXG			4
+
+typedef struct zfs_dsl_dataset zfs_dsl_dataset_t;
+typedef struct zfs_dsl_dir zfs_dsl_dir_t;
+typedef struct zfs_objset zfs_objset_t;
+typedef struct zfs_zap zfs_zap_t;
+
+struct dataset_desc {	/* one element of a STAILQ of dataset descriptions */
+	char		*params;	/* presumably the dataset's parameter string -- confirm */
+	STAILQ_ENTRY(dataset_desc) next;	/* list linkage */
+};
+
+typedef struct {
+	bool		nowarn;
+
+	/* Scratch I/O buffer sized for one maximum-sized block. */
+	char		filebuf[MAXBLOCKSIZE];
+
+	/* Pool parameters. */
+	const char	*poolname;
+	char		*rootpath;	/* implicit mount point prefix */
+	char		*bootfs;	/* bootable dataset, pool property */
+	int		ashift;		/* vdev block size (presumably as a log2 shift) */
+	uint64_t	mssize;		/* metaslab size */
+	STAILQ_HEAD(, dataset_desc) datasetdescs; /* non-root dataset descrs */
+
+	/* Pool state. */
+	uint64_t	poolguid;	/* pool and root vdev GUID */
+	zfs_zap_t	*poolprops;	/* presumably the pool properties ZAP -- confirm */
+
+	/* MOS state. */
+	zfs_objset_t	*mos;		/* meta object set */
+	uint64_t	objarrid;	/* space map object array */
+
+	/* DSL state. */
+	zfs_dsl_dir_t	*rootdsldir;	/* root DSL directory */
+	zfs_dsl_dataset_t *rootds;
+	zfs_dsl_dir_t	*origindsldir;	/* $ORIGIN */
+	zfs_dsl_dataset_t *originds;
+	zfs_dsl_dataset_t *snapds;
+	zfs_zap_t	*cloneszap;
+	zfs_dsl_dir_t	*freedsldir;	/* $FREE */
+	zfs_dsl_dir_t	*mosdsldir;	/* $MOS */
+
+	/* vdev state. */
+	int		fd;		/* vdev disk fd */
+	uint64_t	vdevguid;	/* disk vdev GUID */
+	off_t		vdevsize;	/* vdev size, including labels */
+	off_t		asize;		/* vdev size, excluding labels */
+	bitstr_t	*spacemap;	/* space allocation tracking */
+	int		spacemapbits;	/* one bit per ashift-sized block */
+	uint64_t	msshift;	/* log2(metaslab size) */
+	uint64_t	mscount;	/* number of metaslabs for this vdev */
+} zfs_opt_t;
+
+/* dsl.c */
+void dsl_init(zfs_opt_t *);
+const char *dsl_dir_fullname(const zfs_dsl_dir_t *);
+uint64_t dsl_dir_id(zfs_dsl_dir_t *);
+uint64_t dsl_dir_dataset_id(zfs_dsl_dir_t *);
+void dsl_dir_foreach(zfs_opt_t *, zfs_dsl_dir_t *,
+    void (*)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *);
+int dsl_dir_get_canmount(zfs_dsl_dir_t *, uint64_t *);
+char *dsl_dir_get_mountpoint(zfs_opt_t *, zfs_dsl_dir_t *);
+bool dsl_dir_has_dataset(zfs_dsl_dir_t *);
+bool dsl_dir_dataset_has_objset(zfs_dsl_dir_t *);
+void dsl_dir_dataset_write(zfs_opt_t *, zfs_objset_t *, zfs_dsl_dir_t *);
+void dsl_dir_size_set(zfs_dsl_dir_t *, uint64_t);
+void dsl_write(zfs_opt_t *);
+
+/* fs.c */
+void fs_build(zfs_opt_t *, int, fsnode *);
+
+/* objset.c */
+zfs_objset_t *objset_alloc(zfs_opt_t *zfs, uint64_t type);
+off_t objset_space_alloc(zfs_opt_t *, zfs_objset_t *, off_t *);
+dnode_phys_t *objset_dnode_alloc(zfs_objset_t *, uint8_t, uint64_t *);
+dnode_phys_t *objset_dnode_bonus_alloc(zfs_objset_t *, uint8_t, uint8_t,
+    uint16_t, uint64_t *);
+dnode_phys_t *objset_dnode_lookup(zfs_objset_t *, uint64_t);
+void objset_root_blkptr_copy(const zfs_objset_t *, blkptr_t *);
+uint64_t objset_space(const zfs_objset_t *);
+void objset_write(zfs_opt_t *zfs, zfs_objset_t *os);
+
+/* vdev.c */
+void vdev_init(zfs_opt_t *, const char *);
+off_t vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp);
+void vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
+    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
+    blkptr_t *bp);
+void vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
+    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp);
+void vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
+    off_t sz, off_t loc);
+void vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp);
+void vdev_spacemap_write(zfs_opt_t *);
+void vdev_fini(zfs_opt_t *zfs);
+
+/* zap.c: in-memory ZAP construction and serialization */
+zfs_zap_t *zap_alloc(zfs_objset_t *, dnode_phys_t *);
+void zap_add(zfs_zap_t *, const char *, size_t, size_t, const uint8_t *);
+void zap_add_uint64(zfs_zap_t *, const char *, uint64_t);
+void zap_add_string(zfs_zap_t *, const char *, const char *);
+bool zap_entry_exists(zfs_zap_t *, const char *);	/* true if an entry has exactly this name */
+void zap_write(zfs_opt_t *, zfs_zap_t *);	/* writes the ZAP, then frees it and its entries */
+
+/* zfs.c */
+struct dnode_cursor *dnode_cursor_init(zfs_opt_t *, zfs_objset_t *,
+    dnode_phys_t *, off_t, off_t);
+blkptr_t *dnode_cursor_next(zfs_opt_t *, struct dnode_cursor *, off_t);
+void dnode_cursor_finish(zfs_opt_t *, struct dnode_cursor *);
+
+#endif /* !_MAKEFS_ZFS_H_ */