Index: stable/12/lib/libc/sys/getdirentries.2
===================================================================
--- stable/12/lib/libc/sys/getdirentries.2	(revision 340720)
+++ stable/12/lib/libc/sys/getdirentries.2	(revision 340721)
@@ -1,190 +1,204 @@
 .\" Copyright (c) 1989, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"	@(#)getdirentries.2	8.2 (Berkeley) 5/3/95
 .\" $FreeBSD$
 .\"
-.Dd May 28, 2017
+.Dd November 14, 2018
 .Dt GETDIRENTRIES 2
 .Os
 .Sh NAME
 .Nm getdirentries ,
 .Nm getdents
 .Nd "get directory entries in a file system independent format"
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In sys/types.h
 .In dirent.h
 .Ft ssize_t
 .Fn getdirentries "int fd" "char *buf" "size_t nbytes" "off_t *basep"
 .Ft ssize_t
 .Fn getdents "int fd" "char *buf" "size_t nbytes"
 .Sh DESCRIPTION
 The
 .Fn getdirentries
 and
 .Fn getdents
 system calls read directory entries from the directory
 referenced by the file descriptor
 .Fa fd
 into the buffer pointed to by
 .Fa buf ,
 in a file system independent format.
 Up to
 .Fa nbytes
 of data will be transferred.
 The
 .Fa nbytes
 argument must be greater than or equal to the
 block size associated with the file,
 see
 .Xr stat 2 .
 Some file systems may not support these system calls
 with buffers smaller than this size.
 .Pp
 The data in the buffer is a series of
 .Vt dirent
 structures each containing the following entries:
 .Bd -literal -offset indent
 ino_t	d_fileno;
 off_t	d_off;
 uint16_t	d_reclen;
 uint8_t	d_type;
 uint16_t	d_namlen;
 char	d_name[MAXNAMLEN + 1];	/* see below */
 .Ed
 .Pp
 The
 .Fa d_fileno
 entry is a number which is unique for each
 distinct file in the file system.
 Files that are linked by hard links (see
 .Xr link 2 )
 have the same
 .Fa d_fileno .
 The
+.Fa d_off
+field returns a cookie which can be used with
+.Xr lseek 2
+to position the directory descriptor to the next entry.
+The
 .Fa d_reclen
 entry is the length, in bytes, of the directory record.
 The
 .Fa d_type
 entry is the type of the file pointed to by the directory record.
 The file type values are defined in
 .Fa <sys/dirent.h> .
 The
 .Fa d_name
 entry contains a null terminated file name.
 The
 .Fa d_namlen
 entry specifies the length of the file name excluding the null byte.
 Thus the actual size of
 .Fa d_name
 may vary from 1 to
 .Dv MAXNAMLEN
 \&+ 1.
 .Pp
 Entries may be separated by extra space.
 The
 .Fa d_reclen
 entry may be used as an offset from the start of a
 .Fa dirent
 structure to the next structure, if any.
 .Pp
 The actual number of bytes transferred is returned.
 The current position pointer associated with
 .Fa fd
 is set to point to the next block of entries.
 The pointer may not advance by the number of bytes returned by
 .Fn getdirentries
 or
 .Fn getdents .
 A value of zero is returned when
 the end of the directory has been reached.
 .Pp
 If the
 .Fa basep
 pointer value is non-NULL ,
 the
 .Fn getdirentries
 system call writes the position of the block read into the location pointed to by
 .Fa basep .
 Alternatively, the current position pointer may be set and retrieved by
 .Xr lseek 2 .
 The current position pointer should only be set to a value returned by
 .Xr lseek 2 ,
 a value returned in the location pointed to by
 .Fa basep
 .Po Fn getdirentries
 only
-.Pc
+.Pc ,
+a value returned in the
+.Fa d_off
+field,
 or zero.
+.Sh IMPLEMENTATION NOTES
+The
+.Fa d_off
+field is used as a readdir cookie for NFS servers.
+These cookies can be cached and allow directory entries to be read
+at a specific offset on demand.
 .Sh RETURN VALUES
 If successful, the number of bytes actually transferred is returned.
 Otherwise, -1 is returned and the global variable
 .Va errno
 is set to indicate the error.
 .Sh ERRORS
 The
 .Fn getdirentries
 system call
 will fail if:
 .Bl -tag -width Er
 .It Bq Er EBADF
 The
 .Fa fd
 argument
 is not a valid file descriptor open for reading.
 .It Bq Er EFAULT
 Either
 .Fa buf
 or non-NULL
 .Fa basep
 point outside the allocated address space.
 .It Bq Er EINVAL
 The file referenced by
 .Fa fd
 is not a directory, or
 .Fa nbytes
 is too small for returning a directory entry or block of entries,
 or the current position pointer is invalid.
 .It Bq Er EIO
 An
 .Tn I/O
 error occurred while reading from or writing to the file system.
 .El
 .Sh SEE ALSO
 .Xr lseek 2 ,
 .Xr open 2
 .Sh HISTORY
 The
 .Fn getdirentries
 system call first appeared in
 .Bx 4.4 .
 The
 .Fn getdents
 system call first appeared in
 .Fx 3.0 .
Index: stable/12/share/man/man5/dir.5
===================================================================
--- stable/12/share/man/man5/dir.5	(revision 340720)
+++ stable/12/share/man/man5/dir.5	(revision 340721)
@@ -1,167 +1,167 @@
 .\" Copyright (c) 1983, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"     @(#)dir.5	8.3 (Berkeley) 4/19/94
 .\" $FreeBSD$
 .\"
-.Dd June 20, 2018
+.Dd November 14, 2018
 .Dt DIR 5
 .Os
 .Sh NAME
 .Nm dir ,
 .Nm dirent
 .Nd directory file format
 .Sh SYNOPSIS
 .In dirent.h
 .Sh DESCRIPTION
 Directories provide a convenient hierarchical method of grouping
 files while obscuring the underlying details of the storage medium.
 A directory file is differentiated from a plain file
 by a flag in its
 .Xr inode 5
 entry.
 It consists of records (directory entries) each of which contains
 information about a file and a pointer to the file itself.
 Directory entries may contain other directories
 as well as plain files; such nested directories are referred to as
 subdirectories.
 A hierarchy of directories and files is formed in this manner
 and is called a file system (or referred to as a file system tree).
 .\" An entry in this tree,
 .\" nested or not nested,
 .\" is a pathname.
 .Pp
 Each directory file contains two special directory entries; one is a pointer
 to the directory itself
 called dot
 .Ql .\&
 and the other a pointer to its parent directory called dot-dot
 .Ql \&.. .
 Dot and dot-dot
 are valid pathnames, however,
 the system root directory
 .Ql / ,
 has no parent and dot-dot points to itself like dot.
 .Pp
 File system nodes are ordinary directory files on which has
 been grafted a file system object, such as a physical disk or a
 partitioned area of such a disk.
 (See
 .Xr mount 2
 and
 .Xr mount 8 . )
 .Pp
 The directory entry format is defined in the file
 .In sys/dirent.h
 (which should not be included directly by applications):
 .Bd -literal
 #ifndef	_SYS_DIRENT_H_
 #define	_SYS_DIRENT_H_
 
 #include <machine/ansi.h>
 
 /*
  * The dirent structure defines the format of directory entries returned by
  * the getdirentries(2) system call.
  *
  * A directory entry has a struct dirent at the front of it, containing its
  * inode number, the length of the entry, and the length of the name
  * contained in the entry.  These are followed by the name padded to a 8
  * byte boundary with null bytes.  All names are guaranteed null terminated.
  * The maximum length of a name in a directory is MAXNAMLEN.
  * Explicit pad is added between the last member of the header and
  * d_name, to avoid having the ABI padding in the end of dirent on
  * LP64 arches.  There is code depending on d_name being last.  Also,
  * keeping this pad for ILP32 architectures simplifies compat32 layer.
  */
 
 struct dirent {
 	ino_t      d_fileno;		/* file number of entry */
-	off_t      d_off;		/* directory offset of entry */
+	off_t      d_off;		/* directory offset of the next entry */
 	__uint16_t d_reclen;		/* length of this record */
 	__uint8_t  d_type;		/* file type, see below */
 	__uint8_t  d_namlen;		/* length of string in d_name */
 	__uint32_t d_pad0;
 #if __BSD_VISIBLE
 #define	MAXNAMLEN	255
 	char	d_name[MAXNAMLEN + 1];	/* name must be no longer than this */
 #else
 	char	d_name[255 + 1];	/* name must be no longer than this */
 #endif
 };
 
 /*
  * File types
  */
 #define	DT_UNKNOWN	 0
 #define	DT_FIFO		 1
 #define	DT_CHR		 2
 #define	DT_DIR		 4
 #define	DT_BLK		 6
 #define	DT_REG		 8
 #define	DT_LNK		10
 #define	DT_SOCK		12
 #define	DT_WHT		14
 
 /*
  * Convert between stat structure types and directory types.
  */
 #define	IFTODT(mode)	(((mode) & 0170000) >> 12)
 #define	DTTOIF(dirtype)	((dirtype) << 12)
 
 /*
  * The _GENERIC_DIRSIZ macro gives the minimum record length which will hold
  * the directory entry.  This returns the amount of space in struct direct
  * without the d_name field, plus enough space for the name with a terminating
  * null byte (dp->d_namlen+1), rounded up to a 8 byte boundary.
  *
  * XXX although this macro is in the implementation namespace, it requires
  * a manifest constant that is not.
  */
 #define	_GENERIC_DIRLEN(namlen)					\
 	((__offsetof(struct dirent, d_name) + (namlen) + 1 + 7) & ~7)
 #define	_GENERIC_DIRSIZ(dp)	_GENERIC_DIRLEN((dp)->d_namlen)
 #endif /* __BSD_VISIBLE */
 
 #ifdef _KERNEL
 #define	GENERIC_DIRSIZ(dp)	_GENERIC_DIRSIZ(dp)
 #endif
 
 #endif /* !_SYS_DIRENT_H_ */
 .Ed
 .Sh SEE ALSO
 .Xr fs 5 ,
 .Xr inode 5
 .Sh HISTORY
 A
 .Nm
 file format appeared in
 .At v7 .
 .Sh BUGS
 The usage of the member d_type of struct dirent is unportable as it is
 .Fx Ns -specific .
 It also may fail on certain file systems, for example the cd9660 file system.
Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 340720)
+++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 340721)
@@ -1,1358 +1,1360 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
  */
 
 /*
  * ZFS control directory (a.k.a. ".zfs")
  *
  * This directory provides a common location for all ZFS meta-objects.
  * Currently, this is only the 'snapshot' directory, but this may expand in the
  * future.  The elements are built using the GFS primitives, as the hierarchy
  * does not actually exist on disk.
  *
  * For 'snapshot', we don't want to have all snapshots always mounted, because
  * this would take up a huge amount of space in /etc/mnttab.  We have three
  * types of objects:
  *
  * 	ctldir ------> snapshotdir -------> snapshot
  *                                             |
  *                                             |
  *                                             V
  *                                         mounted fs
  *
  * The 'snapshot' node contains just enough information to lookup '..' and act
  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  * perform an automount of the underlying filesystem and return the
  * corresponding vnode.
  *
  * All mounts are handled automatically by the kernel, but unmounts are
  * (currently) handled from user land.  The main reason is that there is no
  * reliable way to auto-unmount the filesystem when it's "no longer in use".
  * When the user unmounts a filesystem, we call zfsctl_unmount(), which
  * unmounts any snapshots within the snapshot directory.
  *
  * The '.zfs', '.zfs/snapshot', and all directories created under
  * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
  * share the same vfs_t as the head filesystem (what '.zfs' lives under).
  *
  * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>'
  * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
  * However, vnodes within these mounted on file systems have their v_vfsp
  * fields set to the head filesystem to make NFS happy (see
  * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
  * so that it cannot be freed until all snapshots have been unmounted.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/namei.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_deleg.h>
 #include <sys/mount.h>
 #include <sys/zap.h>
 
 #include "zfs_namecheck.h"
 
 /* Common access mode for all virtual directories under the ctldir */
 const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
     S_IROTH | S_IXOTH;
 
 /*
  * "Synthetic" filesystem implementation.
  */
 
 /*
  * Assert that A implies B.
  */
 #define KASSERT_IMPLY(A, B, msg)	KASSERT(!(A) || (B), (msg));
 
 static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
 
 typedef struct sfs_node {
 	char		sn_name[ZFS_MAX_DATASET_NAME_LEN];
 	uint64_t	sn_parent_id;
 	uint64_t	sn_id;
 } sfs_node_t;
 
 /*
  * Check the parent's ID as well as the node's to account for a chance
  * that IDs originating from different domains (snapshot IDs, artifical
  * IDs, znode IDs) may clash.
  */
 static int
 sfs_compare_ids(struct vnode *vp, void *arg)
 {
 	sfs_node_t *n1 = vp->v_data;
 	sfs_node_t *n2 = arg;
 	bool equal;
 
 	equal = n1->sn_id == n2->sn_id &&
 	    n1->sn_parent_id == n2->sn_parent_id;
 
 	/* Zero means equality. */
 	return (!equal);
 }
 
 static int
 sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
    uint64_t id, struct vnode **vpp)
 {
 	sfs_node_t search;
 	int err;
 
 	search.sn_id = id;
 	search.sn_parent_id = parent_id;
 	err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp,
 	    sfs_compare_ids, &search);
 	return (err);
 }
 
 static int
 sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
    uint64_t id, struct vnode **vpp)
 {
 	int err;
 
 	KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
 	err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp,
 	    sfs_compare_ids, vp->v_data);
 	return (err);
 }
 
 static void
 sfs_vnode_remove(struct vnode *vp)
 {
 	vfs_hash_remove(vp);
 }
 
 typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
 
 static int
 sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
     const char *tag, struct vop_vector *vops,
     sfs_vnode_setup_fn setup, void *arg,
     struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 	error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
 	if (error != 0 || *vpp != NULL) {
 		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
 		    "sfs vnode with no data");
 		return (error);
 	}
 
 	/* Allocate a new vnode/inode. */
 	error = getnewvnode(tag, mp, vops, &vp);
 	if (error != 0) {
 		*vpp = NULL;
 		return (error);
 	}
 
 	/*
 	 * Exclusively lock the vnode vnode while it's being constructed.
 	 */
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
 	error = insmntque(vp, mp);
 	if (error != 0) {
 		*vpp = NULL;
 		return (error);
 	}
 
 	setup(vp, arg);
 
 	error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
 	if (error != 0 || *vpp != NULL) {
 		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
 		    "sfs vnode with no data");
 		return (error);
 	}
 
 	*vpp = vp;
 	return (0);
 }
 
 static void
 sfs_print_node(sfs_node_t *node)
 {
 	printf("\tname = %s\n", node->sn_name);
 	printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
 	printf("\tid = %ju\n", (uintmax_t)node->sn_id);
 }
 
 static sfs_node_t *
 sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
 {
 	struct sfs_node *node;
 
 	KASSERT(strlen(name) < sizeof(node->sn_name),
 	    ("sfs node name is too long"));
 	KASSERT(size >= sizeof(*node), ("sfs node size is too small"));
 	node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
 	strlcpy(node->sn_name, name, sizeof(node->sn_name));
 	node->sn_parent_id = parent_id;
 	node->sn_id = id;
 
 	return (node);
 }
 
 static void
 sfs_destroy_node(sfs_node_t *node)
 {
 	free(node, M_SFSNODES);
 }
 
 static void *
 sfs_reclaim_vnode(vnode_t *vp)
 {
 	sfs_node_t *node;
 	void *data;
 
 	sfs_vnode_remove(vp);
 	data = vp->v_data;
 	vp->v_data = NULL;
 	return (data);
 }
 
 static int
 sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
     uio_t *uio, off_t *offp)
 {
 	struct dirent entry;
 	int error;
 
 	/* Reset ncookies for subsequent use of vfs_read_dirent. */
 	if (ap->a_ncookies != NULL)
 		*ap->a_ncookies = 0;
 
 	if (uio->uio_resid < sizeof(entry))
 		return (SET_ERROR(EINVAL));
 
 	if (uio->uio_offset < 0)
 		return (SET_ERROR(EINVAL));
 	if (uio->uio_offset == 0) {
 		entry.d_fileno = id;
 		entry.d_type = DT_DIR;
 		entry.d_name[0] = '.';
 		entry.d_name[1] = '\0';
 		entry.d_namlen = 1;
 		entry.d_reclen = sizeof(entry);
 		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
 		if (error != 0)
 			return (SET_ERROR(error));
 	}
 
 	if (uio->uio_offset < sizeof(entry))
 		return (SET_ERROR(EINVAL));
 	if (uio->uio_offset == sizeof(entry)) {
 		entry.d_fileno = parent_id;
 		entry.d_type = DT_DIR;
 		entry.d_name[0] = '.';
 		entry.d_name[1] = '.';
 		entry.d_name[2] = '\0';
 		entry.d_namlen = 2;
 		entry.d_reclen = sizeof(entry);
 		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
 		if (error != 0)
 			return (SET_ERROR(error));
 	}
 
 	if (offp != NULL)
 		*offp = 2 * sizeof(entry);
 	return (0);
 }
 
 
 /*
  * .zfs inode namespace
  *
  * We need to generate unique inode numbers for all files and directories
  * within the .zfs pseudo-filesystem.  We use the following scheme:
  *
  * 	ENTRY			ZFSCTL_INODE
  * 	.zfs			1
  * 	.zfs/snapshot		2
  * 	.zfs/snapshot/<snap>	objectid(snap)
  */
 #define	ZFSCTL_INO_SNAP(id)	(id)
 
 static struct vop_vector zfsctl_ops_root;
 static struct vop_vector zfsctl_ops_snapdir;
 static struct vop_vector zfsctl_ops_snapshot;
 static struct vop_vector zfsctl_ops_shares_dir;
 
 void
 zfsctl_init(void)
 {
 }
 
 void
 zfsctl_fini(void)
 {
 }
 
 boolean_t
 zfsctl_is_node(vnode_t *vp)
 {
 	return (vn_matchops(vp, zfsctl_ops_root) ||
 	    vn_matchops(vp, zfsctl_ops_snapdir) ||
 	    vn_matchops(vp, zfsctl_ops_snapshot) ||
 	    vn_matchops(vp, zfsctl_ops_shares_dir));
 
 }
 
 typedef struct zfsctl_root {
 	sfs_node_t	node;
 	sfs_node_t	*snapdir;
 	timestruc_t	cmtime;
 } zfsctl_root_t;
 
 
 /*
  * Create the '.zfs' directory.
  */
 void
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
 	zfsctl_root_t *dot_zfs;
 	sfs_node_t *snapdir;
 	vnode_t *rvp;
 	uint64_t crtime[2];
 
 	ASSERT(zfsvfs->z_ctldir == NULL);
 
 	snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT,
 	    ZFSCTL_INO_SNAPDIR);
 	dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0,
 	    ZFSCTL_INO_ROOT);
 	dot_zfs->snapdir = snapdir;
 
 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
 	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
 	    &crtime, sizeof(crtime)));
 	ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
 	vput(rvp);
 
 	zfsvfs->z_ctldir = dot_zfs;
 }
 
 /*
  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
  * The nodes must not have any associated vnodes by now as they should be
  * vflush-ed.
  */
 void
 zfsctl_destroy(zfsvfs_t *zfsvfs)
 {
 	sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
 	sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
 	zfsvfs->z_ctldir = NULL;
 }
 
 static int
 zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
     struct vnode **vpp)
 {
 	return (VFS_ROOT(mp, flags, vpp));
 }
 
 static void
 zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
 {
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	/* We support shared locking. */
 	VN_LOCK_ASHARE(vp);
 	vp->v_type = VDIR;
 	vp->v_data = arg;
 }
 
 static int
 zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
     struct vnode **vpp)
 {
 	void *node;
 	int err;
 
 	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir;
 	err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
 	    zfsctl_common_vnode_setup, node, vpp);
 	return (err);
 }
 
 static int
 zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
     struct vnode **vpp)
 {
 	void *node;
 	int err;
 
 	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir;
 	err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
 	   &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
 	return (err);
 }
 
 /*
  * Given a root znode, retrieve the associated .zfs directory.
  * Add a hold to the vnode and return it.
  */
 int
 zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
 {
 	vnode_t *vp;
 	int error;
 
 	error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
 	return (error);
 }
 
 /*
  * Common open routine.  Disallow any write access.
  */
 static int
 zfsctl_common_open(struct vop_open_args *ap)
 {
 	int flags = ap->a_mode;
 
 	if (flags & FWRITE)
 		return (SET_ERROR(EACCES));
 
 	return (0);
 }
 
 /*
  * Common close routine.  Nothing to do here.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 /*
  * Common access routine.  Disallow writes.
  */
 static int
 zfsctl_common_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	accmode_t accmode = ap->a_accmode;
 
 	if (accmode & VWRITE)
 		return (SET_ERROR(EACCES));
 	return (0);
 }
 
 /*
  * Common getattr function.  Fill in basic information.
  */
 static void
 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 {
 	timestruc_t	now;
 	sfs_node_t *node;
 
 	node = vp->v_data;
 
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_rdev = 0;
 	/*
 	 * We are a purely virtual object, so we have no
 	 * blocksize or allocated blocks.
 	 */
 	vap->va_blksize = 0;
 	vap->va_nblocks = 0;
 	vap->va_seq = 0;
 	vn_fsid(vp, vap);
 	vap->va_mode = zfsctl_ctldir_mode;
 	vap->va_type = VDIR;
 	/*
 	 * We live in the now (for atime).
 	 */
 	gethrestime(&now);
 	vap->va_atime = now;
 	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_flags = 0;
 
 	vap->va_nodeid = node->sn_id;
 
 	/* At least '.' and '..'. */
 	vap->va_nlink = 2;
 }
 
 static int
 zfsctl_common_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	vnode_t		*vp = ap->a_vp;
 	fid_t		*fidp = (void *)ap->a_fid;
 	sfs_node_t	*node = vp->v_data;
 	uint64_t	object = node->sn_id;
 	zfid_short_t	*zfid;
 	int		i;
 
 	zfid = (zfid_short_t *)fidp;
 	zfid->zf_len = SHORT_FID_LEN;
 
 	for (i = 0; i < sizeof(zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	/* .zfs nodes always have a generation number of 0 */
 	for (i = 0; i < sizeof(zfid->zf_gen); i++)
 		zfid->zf_gen[i] = 0;
 
 	return (0);
 }
 
 static int
 zfsctl_common_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	(void) sfs_reclaim_vnode(vp);
 	return (0);
 }
 
 static int
 zfsctl_common_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	sfs_print_node(ap->a_vp->v_data);
 	return (0);
 }
 
 /*
  * Get root directory attributes.
  */
 static int
 zfsctl_root_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	zfsctl_root_t *node = vp->v_data;
 
 	zfsctl_common_getattr(vp, vap);
 	vap->va_ctime = node->cmtime;
 	vap->va_mtime = vap->va_ctime;
 	vap->va_birthtime = vap->va_ctime;
 	vap->va_nlink += 1; /* snapdir */
 	vap->va_size = vap->va_nlink;
 	return (0);
 }
 
 /*
  * When we lookup "." we still can be asked to lock it
  * differently, can't we?
  */
 int
 zfsctl_relock_dot(vnode_t *dvp, int ltype)
 {
 	vref(dvp);
 	if (ltype != VOP_ISLOCKED(dvp)) {
 		if (ltype == LK_EXCLUSIVE)
 			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
 		else /* if (ltype == LK_SHARED) */
 			vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 
 		/* Relock for the "." case may left us with reclaimed vnode. */
 		if ((dvp->v_iflag & VI_DOOMED) != 0) {
 			vrele(dvp);
 			return (SET_ERROR(ENOENT));
 		}
 	}
 	return (0);
 }
 
 /*
  * Special case the handling of "..".
  */
 int
 zfsctl_root_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	cred_t *cr = ap->a_cnp->cn_cred;
 	int flags = ap->a_cnp->cn_flags;
 	int lkflags = ap->a_cnp->cn_lkflags;
 	int nameiop = ap->a_cnp->cn_nameiop;
 	int err;
 	int ltype;
 
 	ASSERT(dvp->v_type == VDIR);
 
 	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 		return (SET_ERROR(ENOTSUP));
 
 	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
 		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
 		if (err == 0)
 			*vpp = dvp;
 	} else if ((flags & ISDOTDOT) != 0) {
 		err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
 		    lkflags, vpp);
 	} else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
 		err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
 	} else {
 		err = SET_ERROR(ENOENT);
 	}
 	if (err != 0)
 		*vpp = NULL;
 	return (err);
 }
 
 static int
 zfsctl_root_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct dirent entry;
 	vnode_t *vp = ap->a_vp;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_root_t *node = vp->v_data;
 	uio_t *uio = ap->a_uio;
 	int *eofp = ap->a_eofflag;
 	off_t dots_offset;
 	int error;
 
 	ASSERT(vp->v_type == VDIR);
 
 	error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
 	    &dots_offset);
 	if (error != 0) {
 		if (error == ENAMETOOLONG) /* ran out of destination space */
 			error = 0;
 		return (error);
 	}
 	if (uio->uio_offset != dots_offset)
 		return (SET_ERROR(EINVAL));
 
 	CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name));
 	entry.d_fileno = node->snapdir->sn_id;
 	entry.d_type = DT_DIR;
 	strcpy(entry.d_name, node->snapdir->sn_name);
 	entry.d_namlen = strlen(entry.d_name);
 	entry.d_reclen = sizeof(entry);
 	error = vfs_read_dirent(ap, &entry, uio->uio_offset);
 	if (error != 0) {
 		if (error == ENAMETOOLONG)
 			error = 0;
 		return (SET_ERROR(error));
 	}
 	if (eofp != NULL)
 		*eofp = 1;
 	return (0);
 }
 
 static int
 zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
 {
 	static const char dotzfs_name[4] = ".zfs";
 	vnode_t *dvp;
 	int error;
 
 	if (*ap->a_buflen < sizeof (dotzfs_name))
 		return (SET_ERROR(ENOMEM));
 
 	error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
 	    LK_SHARED, &dvp);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	VOP_UNLOCK(dvp, 0);
 	*ap->a_vpp = dvp;
 	*ap->a_buflen -= sizeof (dotzfs_name);
 	bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
 	return (0);
 }
 
 static int
 zfsctl_common_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		int *a_retval;
 	} */ *ap;
 {
 	/*
 	 * We care about ACL variables so that user land utilities like ls
 	 * can display them correctly.  Since the ctldir's st_dev is set to be
 	 * the same as the parent dataset, we must support all variables that
 	 * it supports.
 	 */
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX);
 		return (0);
 
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		return (0);
 
 	case _PC_MIN_HOLE_SIZE:
 		*ap->a_retval = (int)SPA_MINBLOCKSIZE;
 		return (0);
 
 	case _PC_ACL_EXTENDED:
 		*ap->a_retval = 0;
 		return (0);
 
 	case _PC_ACL_NFS4:
 		*ap->a_retval = 1;
 		return (0);
 
 	case _PC_ACL_PATH_MAX:
 		*ap->a_retval = ACL_MAX_ENTRIES;
 		return (0);
 
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 
 	default:
 		return (vop_stdpathconf(ap));
 	}
 }
 
 /**
  * Returns a trivial ACL
  *
  * Only ACL_TYPE_NFS4 requests are served; any other type yields EINVAL.
  * The ACL is synthesized from zfsctl_ctldir_mode and then every entry has
  * the permissions that do not apply to the ctldir masked out.
  */
 int
 zfsctl_common_getacl(ap)
 	struct vop_getacl_args /* {
 		struct vnode *vp;
 		acl_type_t a_type;
 		struct acl *a_aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	int i;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
 	/*
 	 * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
 	 * attributes.  That is not the case for the ctldir, so we must clear
 	 * those bits.  We also must clear ACL_READ_NAMED_ATTRS, because xattrs
 	 * aren't supported by the ctldir.
 	 */
 	for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
 		struct acl_entry *entry;
 
 		entry = &(ap->a_aclp->acl_entry[i]);
 		entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
 		    ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
 		    ACL_READ_NAMED_ATTRS);
 	}
 
 	return (0);
 }
 
 /*
  * Vnode operations for the ctldir root vnode, i.e. the vnode that
  * zfsctl_root_vptocnp() names ".zfs".  Operations not listed here fall
  * through to default_vnodeops.
  */
 static struct vop_vector zfsctl_ops_root = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_root_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	zfsctl_root_readdir,
 	.vop_lookup =	zfsctl_root_lookup,
 	.vop_inactive =	VOP_NULL,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 	.vop_print =	zfsctl_common_print,
 	.vop_vptocnp =	zfsctl_root_vptocnp,
 	.vop_pathconf =	zfsctl_common_pathconf,
 	.vop_getacl =	zfsctl_common_getacl,
 };
 
 /*
  * Build the full snapshot name "<dataset>@<name>" in zname, where the
  * dataset is the objset of the filesystem vp belongs to.  len is the
  * capacity the result is checked against; ENAMETOOLONG is returned
  * if the combined name would not fit.
  */
 static int
 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
 {
 	zfsvfs_t *zfsvfs = (zfsvfs_t *)vp->v_vfsp->vfs_data;
 	size_t needed;

 	/* The dataset name goes first; it is written before the length check. */
 	dmu_objset_name(zfsvfs->z_os, zname);

 	/* Dataset name + '@' + snapshot name; the terminator must fit too. */
 	needed = strlen(zname) + 1 + strlen(name);
 	if (needed >= len)
 		return (SET_ERROR(ENAMETOOLONG));

 	(void) strcat(zname, "@");
 	(void) strcat(zname, name);
 	return (0);
 }
 
 /*
  * Look up the snapshot called name in the dataset backing vp's filesystem
  * and store its id in *id.  Returns the result of
  * dsl_dataset_snap_lookup() unchanged.
  */
 static int
 zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
 {
 	zfsvfs_t *zfsvfs = (zfsvfs_t *)vp->v_vfsp->vfs_data;

 	return (dsl_dataset_snap_lookup(dmu_objset_ds(zfsvfs->z_os),
 	    name, id));
 }
 
 /*
  * Given a locked directory vnode, replace it with the root vnode of the
  * filesystem mounted on top of it, if there is one.
  *
  * - Nothing mounted: EJUSTRETURN is returned and *vpp is left as it was,
  *   still referenced and locked.
  * - Something mounted: the original vnode is unlocked and released, and
  *   the result of VFS_ROOT() is returned; on success *vpp is the root
  *   vnode of the mounted filesystem, locked according to flags.
  */
 static int
 zfsctl_mounted_here(vnode_t **vpp, int flags)
 {
 	struct mount *mp;
 	int err;

 	ASSERT_VOP_LOCKED(*vpp, __func__);
 	ASSERT3S((*vpp)->v_type, ==, VDIR);

 	mp = (*vpp)->v_mountedhere;
 	if (mp == NULL)
 		return (EJUSTRETURN);

 	/* Keep the mount busy while switching over to its root vnode. */
 	err = vfs_busy(mp, 0);
 	KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
 	KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
 	vput(*vpp);
 	err = VFS_ROOT(mp, flags, vpp);
 	vfs_unbusy(mp);
 	return (err);
 }
 
 /*
  * Argument bundle handed to zfsctl_snapshot_vnode_setup() when a new
  * snapshot vnode is created.
  */
 typedef struct {
 	const char *snap_name;	/* name given to the new sfs node */
 	uint64_t    snap_id;	/* id given to the new sfs node */
 } snapshot_setup_arg_t;
 
 /*
  * Setup callback for a freshly created snapshot vnode: allocates the sfs
  * node described by arg (a snapshot_setup_arg_t), attaches it through the
  * common ctldir vnode setup and enables recursive locking on the vnode.
  * The vnode must be exclusively locked.
  */
 static void
 zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
 {
 	snapshot_setup_arg_t *setup = arg;
 	sfs_node_t *snapnode;

 	ASSERT_VOP_ELOCKED(vp, __func__);

 	snapnode = sfs_alloc_node(sizeof(sfs_node_t), setup->snap_name,
 	    ZFSCTL_INO_SNAPDIR, setup->snap_id);
 	zfsctl_common_vnode_setup(vp, snapnode);

 	/* Recursive locking of this vnode has to be supported. */
 	VN_LOCK_AREC(vp);
 }
 
 /*
  * Lookup entry point for the 'snapshot' directory.  Try to open the
  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
  * Perform a mount of the associated dataset on top of the vnode.
  * There are four possibilities:
  * - the snapshot node and vnode do not exist
  * - the snapshot vnode is covered by the mounted snapshot
  * - the snapshot vnode is not covered yet, the mount operation is in progress
  * - the snapshot vnode is not covered, because the snapshot has been unmounted
  * The last two states are transient and should be relatively short-lived.
  *
  * Returns 0 with the result in *ap->a_vpp, or an errno value:
  * ENOTSUP for a last-component operation other than LOOKUP, ENAMETOOLONG
  * for an over-long component, ENOENT when the snapshot lookup fails, or
  * whatever the vnode/mount helpers return.
  */
 int
 zfsctl_snapdir_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	char name[NAME_MAX + 1];
 	char fullname[ZFS_MAX_DATASET_NAME_LEN];
 	char *mountpoint;
 	size_t mountpoint_len;
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	uint64_t snap_id;
 	int nameiop = cnp->cn_nameiop;
 	int lkflags = cnp->cn_lkflags;
 	int flags = cnp->cn_flags;
 	int err;
 
 	ASSERT(dvp->v_type == VDIR);
 
 	/* Only plain lookups are allowed on the final path component. */
 	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 		return (SET_ERROR(ENOTSUP));
 
 	/* "." resolves to the directory itself, relocked as requested. */
 	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
 		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
 		if (err == 0)
 			*vpp = dvp;
 		return (err);
 	}
 	/* ".." resolves to the vnode produced by zfsctl_root_vnode. */
 	if (flags & ISDOTDOT) {
 		err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
 		    vpp);
 		return (err);
 	}
 
 	if (cnp->cn_namelen >= sizeof(name))
 		return (SET_ERROR(ENAMETOOLONG));
 
 	/* Make a NUL-terminated private copy of the component name. */
 	strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 	err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
 	if (err != 0)
 		return (SET_ERROR(ENOENT));
 
 	/*
 	 * Loop until we either return a covering root vnode or hold the
 	 * snapshot vnode exclusively locked (in which case we break out and
 	 * mount the snapshot ourselves).
 	 */
 	for (;;) {
 		snapshot_setup_arg_t ssa;
 
 		ssa.snap_name = name;
 		ssa.snap_id = snap_id;
 		err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
 		   snap_id, "zfs", &zfsctl_ops_snapshot,
 		   zfsctl_snapshot_vnode_setup, &ssa, vpp);
 		if (err != 0)
 			return (err);
 
 		/* Check if a new vnode has just been created. */
 		if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
 			break;
 
 		/*
 		 * Check if a snapshot is already mounted on top of the vnode.
 		 */
 		err = zfsctl_mounted_here(vpp, lkflags);
 		if (err != EJUSTRETURN)
 			return (err);
 
 		/*
 		 * If the vnode is not covered, then either the mount operation
 		 * is in progress or the snapshot has already been unmounted
 		 * but the vnode hasn't been inactivated and reclaimed yet.
 		 * We can try to re-use the vnode in the latter case.
 		 */
 		VI_LOCK(*vpp);
 		if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
 			/* Upgrade to exclusive lock in order to:
 			 * - avoid race conditions
 			 * - satisfy the contract of mount_snapshot()
 			 */
 			err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
 			if (err == 0)
 				break;
 		} else {
 			VI_UNLOCK(*vpp);
 		}
 
 		/*
 		 * In this state we can loop on uncontested locks and starve
 		 * the thread doing the lengthy, non-trivial mount operation.
 		 * So, yield to prevent that from happening.
 		 */
 		vput(*vpp);
 		kern_yield(PRI_USER);
 	}
 
 	/* fullname becomes "<dataset>@<name>", the name passed to the mount. */
 	VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname));
 
 	/* Mount point path: <f_mntonname>/<ctldir name>/snapshot/<name>. */
 	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
 	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
 	(void) snprintf(mountpoint, mountpoint_len,
 	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, name);
 
 	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
 	kmem_free(mountpoint, mountpoint_len);
 	if (err == 0) {
 		/*
 		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
 		 *
 		 * This is where we lie about our v_vfsp in order to
 		 * make .zfs/snapshot/<snapname> accessible over NFS
 		 * without requiring manual mounts of <snapname>.
 		 */
 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
 
 		/* Clear the root flag (set via VFS_ROOT) as well. */
 		(*vpp)->v_vflag &= ~VV_ROOT;
 	}
 
 	/* On failure do not hand any vnode back to the caller. */
 	if (err != 0)
 		*vpp = NULL;
 	return (err);
 }
 
 /*
  * readdir handler for the 'snapshot' directory.  After the common leading
  * entries are produced by sfs_readdir_common(), one DT_DIR entry is
  * emitted per snapshot returned by dmu_snapshot_list_next().  The
  * directory offset is the snapshot-list cookie shifted by dots_offset.
  * Running out of room in the destination (ENAMETOOLONG) is not an error;
  * the end-of-directory flag is set once the snapshot list is exhausted.
  */
 static int
 zfsctl_snapdir_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	struct dirent entry;
 	vnode_t *vp = ap->a_vp;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	uio_t *uio = ap->a_uio;
 	int *eofp = ap->a_eofflag;
 	off_t dots_offset;
 	int error;
 
 	ASSERT(vp->v_type == VDIR);
 
 	error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
 	    &dots_offset);
 	if (error != 0) {
 		if (error == ENAMETOOLONG) /* ran out of destination space */
 			error = 0;
 		return (error);
 	}
 
 	ZFS_ENTER(zfsvfs);
 	for (;;) {
 		uint64_t cookie;
 		uint64_t id;
 
 		/* Translate the directory offset back to a list cookie. */
 		cookie = uio->uio_offset - dots_offset;
 
 		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
 		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
 		    snapname, &id, &cookie, NULL);
 		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
 		if (error != 0) {
 			/* ENOENT marks the end of the snapshot list. */
 			if (error == ENOENT) {
 				if (eofp != NULL)
 					*eofp = 1;
 				error = 0;
 			}
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		entry.d_fileno = id;
 		entry.d_type = DT_DIR;
 		strcpy(entry.d_name, snapname);
 		entry.d_namlen = strlen(entry.d_name);
 		entry.d_reclen = sizeof(entry);
+		/* NOTE: d_off is the offset for the *next* entry. */
+		entry.d_off = cookie + dots_offset;
 		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
 		if (error != 0) {
 			if (error == ENAMETOOLONG)
 				error = 0;
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(error));
 		}
 		/* Advance to the cookie that follows the entry just emitted. */
 		uio->uio_offset = cookie + dots_offset;
 	}
 	/* NOTREACHED */
 }
 
 /*
  * getattr handler for the 'snapshot' directory.  Starts from the common
  * ctldir attributes, uses the objset's snapshot change time for ctime,
  * mtime and birthtime, and adds the number of snapshots (entries in the
  * dataset's snapshot-names ZAP, if it has one) to the link count.  The
  * reported size equals the resulting link count.  Returns 0, or the
  * error from zap_count().
  */
 static int
 zfsctl_snapdir_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	vattr_t *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
 	uint64_t snap_count;
 	int err;
 
 	ZFS_ENTER(zfsvfs);
 	zfsctl_common_getattr(vp, vap);
 	vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
 	vap->va_mtime = vap->va_ctime;
 	vap->va_birthtime = vap->va_ctime;
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
 		err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
 		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
 		if (err != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 		vap->va_nlink += snap_count;
 	}
 	vap->va_size = vap->va_nlink;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Vnode operations for the 'snapshot' directory.  Operations not listed
  * here fall through to default_vnodeops.
  */
 static struct vop_vector zfsctl_ops_snapdir = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_getattr =	zfsctl_snapdir_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	zfsctl_snapdir_readdir,
 	.vop_lookup =	zfsctl_snapdir_lookup,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 	.vop_print =	zfsctl_common_print,
 	.vop_pathconf =	zfsctl_common_pathconf,
 	.vop_getacl =	zfsctl_common_getacl,
 };
 
 /*
  * inactive handler for snapshot vnodes: the vnode is recycled right away
  * instead of being kept around.  vrecycle() is required to return 1.
  */
 static int
 zfsctl_snapshot_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	VERIFY(vrecycle(vp) == 1);
 	return (0);
 }
 
 /*
  * reclaim handler for snapshot vnodes: detaches the vnode from the sfs
  * layer and destroys the sfs node that was hanging off v_data.
  */
 static int
 zfsctl_snapshot_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp;
 	void *snapnode;

 	vp = ap->a_vp;
 	/* Remember the node pointer before the vnode is reclaimed. */
 	snapnode = vp->v_data;

 	sfs_reclaim_vnode(vp);
 	sfs_destroy_node(snapnode);
 	return (0);
 }
 
 /*
  * vptocnp handler for a snapshot vnode: reports the snapshot's name
  * (node->sn_name) and returns the '.zfs/snapshot' directory vnode,
  * referenced but unlocked, as its parent.  The name is stored at the tail
  * of the remaining space in a_buf and a_buflen is reduced accordingly.
  * Returns ENOMEM if the name does not fit and ENOENT if nothing is
  * mounted on the vnode.  The vnode is relocked in its original lock mode
  * before returning.
  */
 static int
 zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct mount *mp;
 	vnode_t *dvp;
 	vnode_t *vp;
 	sfs_node_t *node;
 	size_t len;
 	int locked;
 	int error;
 
 	vp = ap->a_vp;
 	node = vp->v_data;
 	len = strlen(node->sn_name);
 	if (*ap->a_buflen < len)
 		return (SET_ERROR(ENOMEM));
 
 	/*
 	 * Prevent unmounting of the snapshot while the vnode lock
 	 * is not held.  That is not strictly required, but allows
 	 * us to assert that an uncovered snapshot vnode is never
 	 * "leaked".
 	 */
 	mp = vp->v_mountedhere;
 	if (mp == NULL)
 		return (SET_ERROR(ENOENT));
 	error = vfs_busy(mp, 0);
 	KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
 
 	/*
 	 * We can vput the vnode as we can now depend on the reference owned
 	 * by the busied mp.  But we also need to hold the vnode, because
 	 * the reference may go after vfs_unbusy() which has to be called
 	 * before we can lock the vnode again.
 	 */
 	locked = VOP_ISLOCKED(vp);	/* remembered for the vget() below */
 	vhold(vp);
 	vput(vp);
 
 	/* Look up .zfs/snapshot, our parent. */
 	error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
 	if (error == 0) {
 		VOP_UNLOCK(dvp, 0);
 		*ap->a_vpp = dvp;
 		/* The name is copied without a terminating NUL. */
 		*ap->a_buflen -= len;
 		bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
 	}
 	vfs_unbusy(mp);
 	/* Reacquire the lock, consuming the hold taken by vhold() above. */
 	vget(vp, locked | LK_VNHELD | LK_RETRY, curthread);
 	return (error);
 }
 
 /*
  * These VP's should never see the light of day.  They should always
  * be covered.
  *
  * There is no default vector: only the operations listed below are
  * provided for snapshot mount-point vnodes.
  */
 static struct vop_vector zfsctl_ops_snapshot = {
 	.vop_default =		NULL, /* ensure very restricted access */
 	.vop_inactive =		zfsctl_snapshot_inactive,
 	.vop_reclaim =		zfsctl_snapshot_reclaim,
 	.vop_vptocnp =		zfsctl_snapshot_vptocnp,
 	.vop_lock1 =		vop_stdlock,
 	.vop_unlock =		vop_stdunlock,
 	.vop_islocked =		vop_stdislocked,
 	.vop_advlockpurge =	vop_stdadvlockpurge, /* called by vgone */
 	.vop_print =		zfsctl_common_print,
 };
 
 /*
  * Find the zfsvfs_t of the snapshot with the given objset id, provided
  * that snapshot is currently mounted under this filesystem's ctldir.
  * On success *zfsvfsp is set and 0 is returned; otherwise *zfsvfsp is
  * NULL and EINVAL is returned (no such vnode, or nothing mounted on it).
  */
 int
 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *vp;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	*zfsvfsp = NULL;
 	error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
 	    ZFSCTL_INO_SNAPDIR, objsetid, &vp);
 	if (error == 0 && vp != NULL) {
 		/*
 		 * XXX Probably need to at least reference, if not busy, the mp.
 		 */
 		if (vp->v_mountedhere != NULL)
 			*zfsvfsp = vp->v_mountedhere->mnt_data;
 		vput(vp);
 	}
 	if (*zfsvfsp == NULL)
 		return (SET_ERROR(EINVAL));
 	return (0);
 }
 
 /*
  * Unmount any snapshots for the given filesystem.  This is called from
  * zfs_umount() - if we have a ctldir, then go through and unmount all the
  * snapshots.
  *
  * Walks the dataset's snapshot list and, for every snapshot that has a
  * ctldir vnode with a filesystem mounted on it, unmounts that filesystem
  * with the given fflags.  Returns 0 once the list is exhausted, or the
  * first error encountered.
  */
 int
 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
 {
 	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	struct mount *mp;
 	vnode_t *vp;
 	uint64_t cookie;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 
 	cookie = 0;
 	for (;;) {
 		uint64_t id;
 
 		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
 		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
 		    snapname, &id, &cookie, NULL);
 		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
 		if (error != 0) {
 			/* ENOENT marks the end of the snapshot list. */
 			if (error == ENOENT)
 				error = 0;
 			break;
 		}
 
 		for (;;) {
 			error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
 			    ZFSCTL_INO_SNAPDIR, id, &vp);
 			if (error != 0 || vp == NULL)
 				break;
 
 			mp = vp->v_mountedhere;
 
 			/*
 			 * v_mountedhere being NULL means that the
 			 * (uncovered) vnode is in a transient state
 			 * (mounting or unmounting), so loop until it
 			 * settles down.
 			 */
 			if (mp != NULL)
 				break;
 			vput(vp);
 		}
 		if (error != 0)
 			break;
 		if (vp == NULL)
 			continue;	/* no mountpoint, nothing to do */
 
 		/*
 		 * The mount-point vnode is kept locked to avoid spurious EBUSY
 		 * from a concurrent umount.
 		 * The vnode lock must have recursive locking enabled.
 		 */
 		vfs_ref(mp);
 		error = dounmount(mp, fflags, curthread);
 		KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
 		    ("extra references after unmount"));
 		vput(vp);
 		if (error != 0)
 			break;
 	}
 	KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
 	    ("force unmounting failed"));
 	return (error);
 }
 
Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 340720)
+++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 340721)
@@ -1,6070 +1,6075 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/vfs.h>
 #include <sys/vm.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/atomic.h>
 #include <sys/namei.h>
 #include <sys/mman.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/filio.h>
 #include <sys/sid.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_rlock.h>
 #include <sys/extdirent.h>
 #include <sys/kidmap.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sched.h>
 #include <sys/acl.h>
 #include <sys/vmmeter.h>
 #include <vm/vm_param.h>
 #include <sys/zil.h>
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait for the intent log to commit if it is a synchronous operation.
  * Moreover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1)	A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
  *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  *	can return EIO from the calling function.
  *
  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
  *	First, if it's the last reference, the vnode/znode
  *	can be freed, so the zp may point to freed memory.  Second, the last
  *	reference will call zfs_zinactive(), which may induce a lot of work --
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
  *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
  *      dmu_tx_assign().  This is critical because we don't want to block
  *      while holding locks.
  *
  *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
  *	reduces lock contention and CPU usage when we must wait (note that if
  *	throughput is constrained by the storage, nearly every transaction
  *	must wait).
  *
  *      Note, in particular, that if a lock is sometimes acquired before
  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
  *      to use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
  *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
  *	to indicate that this operation has already called dmu_tx_wait().
  *	This will ensure that we don't retry forever, waiting a short bit
  *	each time.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *	During ZIL replay the zfs_log_* functions will update the sequence
  *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
  *		if (error == ERESTART) {
  *			waited = B_TRUE;
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		ZFS_EXIT(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	VN_RELE(...);			// release held vnodes
  *	zil_commit(zilog, foid);	// synchronous when necessary
  *	ZFS_EXIT(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 
 /*
  * Open handler.  Refuses (EPERM) a write open of an append-only file that
  * does not request FAPPEND, and refuses (EACCES) the open when the
  * virus scan of a non-empty, non-quarantined regular file fails.
  * Otherwise counts the open in z_sync_cnt if it is synchronous.
  */
 /* ARGSUSED */
 static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(*vpp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error = 0;

 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);

 	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 	    ((flag & FAPPEND) == 0)) {
 		error = EPERM;
 	} else if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0 &&
 	    fs_vscan(*vpp, cr, 0) != 0) {
 		error = EACCES;
 	} else if (flag & (FSYNC | FDSYNC)) {
 		/* Keep a count of the synchronous opens in the znode */
 		atomic_inc_32(&zp->z_sync_cnt);
 	}

 	ZFS_EXIT(zfsvfs);
 	if (error != 0)
 		return (SET_ERROR(error));
 	return (0);
 }
 
 /*
  * Close handler.  Releases the calling process's locks and shares on the
  * vnode, drops the synchronous-open count on the last close of a
  * synchronous open, and runs the close-time virus scan for non-empty,
  * non-quarantined regular files when scanning is enabled.
  */
 /* ARGSUSED */
 static int
 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

 	/* Per-process lock and share cleanup happens before entering ZFS. */
 	cleanlocks(vp, ddi_get_pid(), 0);
 	cleanshares(vp, ddi_get_pid());

 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);

 	/* Undo the z_sync_cnt increment of a synchronous open. */
 	if (count == 1 && (flag & (FSYNC | FDSYNC)) != 0) {
 		atomic_dec_32(&zp->z_sync_cnt);
 	}

 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
 		VERIFY(fs_vscan(vp, cr, 1) == 0);
 	}

 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
  *
  * Returns ENXIO when the starting offset is at or beyond end-of-file or
  * when dmu_offset_next() reports ESRCH; otherwise returns the result of
  * dmu_offset_next().  *off is only ever moved forward.
  */
 static int
 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
 {
 	znode_t	*zp = VTOZ(vp);
 	boolean_t hole = (cmd == _FIO_SEEK_HOLE) ? B_TRUE : B_FALSE;
 	uint64_t file_sz = zp->z_size;
 	uint64_t noff = (uint64_t)*off; /* new offset */
 	int error;

 	if (noff >= file_sz)
 		return (SET_ERROR(ENXIO));

 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 	if (error == ESRCH)
 		return (SET_ERROR(ENXIO));

 	/*
 	 * dmu_offset_next() works in whole blocks, so the hole it finds may
 	 * start past the logical end-of-file when EOF falls mid-block.
 	 * Report such a "virtual hole" as starting at the logical EOF.
 	 */
 	if (noff > file_sz) {
 		ASSERT(hole);
 		noff = file_sz;
 	}

 	/* Never move the caller's offset backwards. */
 	if (noff >= *off)
 		*off = noff;
 	return (error);
 }
 
 /*
  * ioctl handler.  _FIOFFS, _FIOGDIO and _FIOSDIO are accepted and do
  * nothing; _FIO_SEEK_DATA/_FIO_SEEK_HOLE are forwarded to zfs_holey();
  * _FIO_COUNT_FILLED is only compiled on illumos.  Any other command
  * yields ENOTTY.
  */
 /* ARGSUSED */
 static int
 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
     int *rvalp, caller_context_t *ct)
 {
 	offset_t off;
 	offset_t ndata;
 	dmu_object_info_t doi;
 	int error;
 	zfsvfs_t *zfsvfs;
 	znode_t *zp;
 
 	switch (com) {
 	case _FIOFFS:
 	{
 		return (0);
 
 		/*
 		 * The following two ioctls are used by bfu.  Faking out,
 		 * necessary to avoid bfu errors.
 		 */
 	}
 	case _FIOGDIO:
 	case _FIOSDIO:
 	{
 		return (0);
 	}
 
 	case _FIO_SEEK_DATA:
 	case _FIO_SEEK_HOLE:
 	{
 		/*
 		 * On illumos the offset is copied in from and out to the
 		 * address in data; otherwise data is dereferenced directly.
 		 */
 #ifdef illumos
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 			return (SET_ERROR(EFAULT));
 #else
 		off = *(offset_t *)data;
 #endif
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 
 		/* offset parameter is in/out */
 		error = zfs_holey(vp, com, &off);
 		ZFS_EXIT(zfsvfs);
 		if (error)
 			return (error);
 #ifdef illumos
 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 			return (SET_ERROR(EFAULT));
 #else
 		*(offset_t *)data = off;
 #endif
 		return (0);
 	}
 #ifdef illumos
 	case _FIO_COUNT_FILLED:
 	{
 		/*
 		 * _FIO_COUNT_FILLED adds a new ioctl command which
 		 * exposes the number of filled blocks in a
 		 * ZFS object.
 		 */
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 
 		/*
 		 * Wait for all dirty blocks for this object
 		 * to get synced out to disk, and the DMU info
 		 * updated.
 		 */
 		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
 		if (error) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Retrieve fill count from DMU object.
 		 */
 		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
 		if (error) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		ndata = doi.doi_fill_count;
 
 		ZFS_EXIT(zfsvfs);
 		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
 			return (SET_ERROR(EFAULT));
 		return (0);
 	}
 #endif
 	}
 	/* Unrecognized command. */
 	return (SET_ERROR(ENOTTY));
 }
 
 /*
  * Look up the page of vp's VM object that covers byte offset start and,
  * if it exists and is valid, shared-busy it, bump the object's
  * paging-in-progress count, remove write mappings and clear the dirty
  * bits for the DEV_BSIZE-aligned part of [off, off + nbytes) within the
  * page.  Returns the page, or NULL if there is no valid page.
  * The object must be write-locked by the caller; the lock is dropped and
  * retaken while waiting for an exclusively busied page.
  */
 static vm_page_t
 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 {
 	vm_object_t obj;
 	vm_page_t pp;
 	int64_t end;
 
 	/*
 	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
 	 * aligned boundaries, if the range is not aligned.  As a result a
 	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
 	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
 	 * the whole page would be considered clean despite having some dirty
 	 * data.
 	 * For this reason we should shrink the range to DEV_BSIZE aligned
 	 * boundaries before calling vm_page_clear_dirty.
 	 */
 	end = rounddown2(off + nbytes, DEV_BSIZE);
 	off = roundup2(off, DEV_BSIZE);
 	nbytes = end - off;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked(obj);
 
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
 			if (vm_page_xbusied(pp)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
 				vm_page_lock(pp);
 				zfs_vmobject_wunlock(obj);
 				vm_page_busy_sleep(pp, "zfsmwb", true);
 				zfs_vmobject_wlock(obj);
 				/* Start over: look the page up again. */
 				continue;
 			}
 			vm_page_sbusy(pp);
 		} else if (pp != NULL) {
 			/* A page that is not valid is treated as absent. */
 			ASSERT(!pp->valid);
 			pp = NULL;
 		}
 
 		if (pp != NULL) {
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_object_pip_add(obj, 1);
 			pmap_remove_write(pp);
 			/* Nothing to clear if alignment shrank the range to 0. */
 			if (nbytes != 0)
 				vm_page_clear_dirty(pp, off, nbytes);
 		}
 		break;
 	}
 	return (pp);
 }
 
 /*
  * Undo page_busy(): release the shared busy state of the page and drop
  * the paging-in-progress count that was added to its object.
  */
 static void
 page_unbusy(vm_page_t pp)
 {
 
 	vm_page_sunbusy(pp);
 	vm_object_pip_subtract(pp->object, 1);
 }
 
 /*
  * Look up the page of vp's VM object that covers byte offset start and,
  * if it exists and is valid, place a hold on it.  Returns the held page,
  * or NULL if there is no valid page.
  * The object must be write-locked by the caller; the lock is dropped and
  * retaken while waiting for an exclusively busied page.
  */
 static vm_page_t
 page_hold(vnode_t *vp, int64_t start)
 {
 	vm_object_t obj;
 	vm_page_t pp;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked(obj);
 
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
 			if (vm_page_xbusied(pp)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
 				vm_page_lock(pp);
 				zfs_vmobject_wunlock(obj);
 				vm_page_busy_sleep(pp, "zfsmwb", true);
 				zfs_vmobject_wlock(obj);
 				/* Start over: look the page up again. */
 				continue;
 			}
 
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_page_lock(pp);
 			vm_page_hold(pp);
 			vm_page_unlock(pp);
 
 		} else
 			pp = NULL;	/* missing or not valid */
 		break;
 	}
 	return (pp);
 }
 
 /*
  * Undo page_hold(): drop the hold on the page, under the page lock.
  */
 static void
 page_unhold(vm_page_t pp)
 {
 
 	vm_page_lock(pp);
 	vm_page_unhold(pp);
 	vm_page_unlock(pp);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Write:	If we find a memory mapped page, we write to *both*
  *		the page and the dmu buffer.
  *
  * For every page overlapping [start, start + len) that page_busy() finds,
  * the corresponding bytes are read from the DMU object (os, oid) into the
  * page.  Pages that are not present or not valid are skipped.
  */
 static void
 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
     int segflg, dmu_tx_t *tx)
 {
 	vm_object_t obj;
 	struct sf_buf *sf;
 	caddr_t va;
 	int off;
 
 	ASSERT(segflg != UIO_NOCOPY);
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	/* Only the first page can start at a non-zero in-page offset. */
 	off = start & PAGEOFFSET;
 	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		int nbytes = imin(PAGESIZE - off, len);
 
 		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
 			/* The object lock is not held across the copy. */
 			zfs_vmobject_wunlock(obj);
 
 			va = zfs_map_page(pp, &sf);
 			(void) dmu_read(os, oid, start+off, nbytes,
 			    va+off, DMU_READ_PREFETCH);
 			zfs_unmap_page(sf);
 
 			zfs_vmobject_wlock(obj);
 			page_unbusy(pp);
 		}
 		len -= nbytes;
 		off = 0;
 	}
 	vm_object_pip_wakeupn(obj, 0);
 	zfs_vmobject_wunlock(obj);
 }
 
 /*
  * Read with UIO_NOCOPY flag means that sendfile(2) requests
  * ZFS to populate a range of page cache pages with data.
  *
  * NOTE: this function could be optimized to pre-allocate
  * all pages in advance, drain exclusive busy on all of them,
  * map them into contiguous KVA region and populate them
  * in one single dmu_read() call.
  *
  * Starting at the (page-aligned) uio offset, each page covering nbytes
  * is grabbed; pages with no valid data are filled from the DMU and
  * marked fully valid, with the tail of a short final page zeroed.
  * The uio offset and residual are advanced per page.  Returns 0 or the
  * first dmu_read() error, in which case processing stops at that page.
  */
 static int
 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	struct sf_buf *sf;
 	vm_object_t obj;
 	vm_page_t pp;
 	int64_t start;
 	caddr_t va;
 	int len = nbytes;
 	int error = 0;
 
 	ASSERT(uio->uio_segflg == UIO_NOCOPY);
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
 
 	zfs_vmobject_wlock(obj);
 	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
 		int bytes = MIN(PAGESIZE, len);
 
 		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
 		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
 		if (pp->valid == 0) {
 			/* The object lock is not held across the DMU read. */
 			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
 			error = dmu_read(os, zp->z_id, start, bytes, va,
 			    DMU_READ_PREFETCH);
 			/* Zero the remainder of a partially filled page. */
 			if (bytes != PAGESIZE && error == 0)
 				bzero(va + bytes, PAGESIZE - bytes);
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock(obj);
 			vm_page_sunbusy(pp);
 			vm_page_lock(pp);
 			if (error) {
 				/* Free the page only if nobody else uses it. */
 				if (pp->wire_count == 0 && pp->valid == 0 &&
 				    !vm_page_busied(pp))
 					vm_page_free(pp);
 			} else {
 				pp->valid = VM_PAGE_BITS_ALL;
 				vm_page_activate(pp);
 			}
 			vm_page_unlock(pp);
 		} else {
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_page_sunbusy(pp);
 		}
 		if (error)
 			break;
 		uio->uio_resid -= bytes;
 		uio->uio_offset += bytes;
 		len -= bytes;
 	}
 	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Read:	We "read" preferentially from memory mapped pages,
  *		else we default from the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  *	 the file is memory mapped.
  *
  *	IN:	vp	- vnode of the file being read.
  *		nbytes	- number of bytes to transfer, starting at
  *			  uio->uio_loffset.
  *		uio	- destination of the data; advanced by the copy
  *			  routines called below.
  *
  *	RETURN:	0 on success, otherwise the first error returned by the
  *		copy routine or by dmu_read_uio_dbuf().
  */
 static int
 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	vm_object_t obj;
 	int64_t start;
 	int len = nbytes;
 	int off;
 	int error = 0;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	start = uio->uio_loffset;
 	off = start & PAGEOFFSET;
 	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		/* Only the first page can begin at a non-zero offset. */
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 
 		if ((pp = page_hold(vp, start)) != NULL) {
 			struct sf_buf *sf;
 			caddr_t va;
 
 			/*
 			 * A page was returned for this offset: copy out of
 			 * it with the object lock dropped.
 			 */
 			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
#ifdef illumos
 			error = uiomove(va + off, bytes, UIO_READ, uio);
#else
 			error = vn_io_fault_uiomove(va + off, bytes, uio);
#endif
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock(obj);
 			page_unhold(pp);
 		} else {
 			/* No page was returned; read through the DMU. */
 			zfs_vmobject_wunlock(obj);
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, bytes);
 			zfs_vmobject_wlock(obj);
 		}
 		len -= bytes;
 		off = 0;
 		if (error)
 			break;
 	}
 	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
 /*
  * Upper bound, in bytes, on how much zfs_read() transfers per loop
  * iteration.  zfs_read() also subtracts P2PHASE(offset, this value) from
  * the bound, so a chunk that starts part-way into such a unit is shortened
  * accordingly.
  */
 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 
 /*
  * Read bytes from specified file into supplied buffer.
  *
  *	IN:	vp	- vnode of file to be read from.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *
  *	RETURN:	0 on success, error code on failure.
  *		ECKSUM from the lower layers is reported as EIO.
  *
  * Side Effects:
  *	vp - atime updated if byte count > 0
  */
 /* ARGSUSED */
 static int
 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	ssize_t		n, nbytes;
 	int		error = 0;
 	rl_t		*rl;
 	xuio_t		*xuio = NULL;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EACCES));
 	}
 
 	/*
 	 * Validate file offset
 	 */
 	if (uio->uio_loffset < (offset_t)0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Fasttrack empty reads
 	 */
 	if (uio->uio_resid == 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * Check for mandatory locks
 	 */
 	if (MANDMODE(zp->z_mode)) {
 		if ((error = chklock(vp, FREAD, uio->uio_loffset,
 		    uio->uio_resid, uio->uio_fmode, ct)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * If we're in FRSYNC mode, sync out this znode before reading it.
 	 */
 	if (zfsvfs->z_log &&
 	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
 		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	/*
 	 * Lock the range against changes.
 	 */
 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
 	 * to the end; but we might still need to set atime.
 	 */
 	if (uio->uio_loffset >= zp->z_size) {
 		error = 0;
 		goto out;
 	}
 
 	ASSERT(uio->uio_loffset < zp->z_size);
 	/* Never read beyond the current end of file. */
 	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 
#ifdef illumos
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
 		int blksz = zp->z_blksz;
 		uint64_t offset = uio->uio_loffset;
 
 		xuio = (xuio_t *)uio;
 		if ((ISP2(blksz))) {
 			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
 			    blksz)) / blksz;
 		} else {
 			ASSERT(offset + n <= blksz);
 			nblk = 1;
 		}
 		(void) dmu_xuio_init(xuio, nblk);
 
 		if (vn_has_cached_data(vp)) {
 			/*
 			 * For simplicity, we always allocate a full buffer
 			 * even if we only expect to read a portion of a block.
 			 */
 			while (--nblk >= 0) {
 				(void) dmu_xuio_add(xuio,
 				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 				    blksz), 0, blksz);
 			}
 		}
 	}
#endif	/* illumos */
 
 	while (n > 0) {
 		/* Transfer at most one zfs_read_chunk_size unit per pass. */
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
#ifdef __FreeBSD__
 		if (uio->uio_segflg == UIO_NOCOPY)
 			error = mappedread_sf(vp, nbytes, uio);
 		else
#endif /* __FreeBSD__ */
 		if (vn_has_cached_data(vp)) {
 			error = mappedread(vp, nbytes, uio);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes);
 		}
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
 				error = SET_ERROR(EIO);
 			break;
 		}
 
 		n -= nbytes;
 	}
out:
 	zfs_range_unlock(rl);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Write the bytes to a file.
  *
  *	IN:	vp	- vnode of file to be written to.
  *		uio	- structure supplying write location, range info,
  *			  and data buffer.
  *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
  *			  set if in append mode.
  *		cr	- credentials of caller.
  *		ct	- caller context (NFS/CIFS fem monitor only)
  *
  *	OUT:	uio	- updated offset and range.
  *
  *	RETURN:	0 on success, error code on failure.
  *		A partial write is reported as success, except in replay
  *		mode and (on FreeBSD) when the error is EFAULT.
  *
  * Timestamps:
  *	vp - ctime|mtime updated if byte count > 0
  */
 
 /* ARGSUSED */
 static int
 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	rlim64_t	limit = MAXOFFSET_T;
 	ssize_t		start_resid = uio->uio_resid;
 	ssize_t		tx_bytes;
 	uint64_t	end_size;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog;
 	offset_t	woff;
 	ssize_t		n, nbytes;
 	rl_t		*rl;
 	int		max_blksz = zfsvfs->z_max_blksz;
 	int		error = 0;
 	arc_buf_t	*abuf;
 	iovec_t		*aiov = NULL;
 	xuio_t		*xuio = NULL;
 	int		i_iov = 0;
 	int		iovcnt = uio->uio_iovcnt;
 	iovec_t		*iovp = uio->uio_iov;
 	int		write_eof;
 	int		count = 0;
 	sa_bulk_attr_t	bulk[4];
 	uint64_t	mtime[2], ctime[2];
 
 	/*
 	 * Fasttrack empty write
 	 */
 	n = start_resid;
 	if (n == 0)
 		return (0);
 
 	/*
 	 * NOTE(review): limit is initialized to MAXOFFSET_T above and is not
 	 * modified before this test, so the clamp looks like a no-op here.
 	 */
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/*
 	 * Collect the attributes that are written back with a single
 	 * sa_bulk_update() after each chunk.
 	 */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 
 	/*
 	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
 	 * callers might not be able to detect properly that we are read-only,
 	 * so check it explicitly here.
 	 */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EROFS));
 	}
 
 	/*
 	 * If immutable or not appending then return EPERM.
 	 * Intentionally allow ZFS_READONLY through here.
 	 * See zfs_zaccess_common()
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
 	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 	    (uio->uio_loffset < zp->z_size))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Validate file offset
 	 */
 	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 	if (woff < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Check for mandatory locks before calling zfs_range_lock()
 	 * in order to prevent a deadlock with locks set via fcntl().
 	 */
 	if (MANDMODE((mode_t)zp->z_mode) &&
 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
#ifdef illumos
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 * Skip this if uio contains loaned arc_buf.
 	 */
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 		xuio = (xuio_t *)uio;
 	else
 		uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
 	if (ioflag & FAPPEND) {
 		/*
 		 * Obtain an appending range lock to guarantee file append
 		 * semantics.  We reset the write offset once we have the lock.
 		 */
 		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 		woff = rl->r_off;
 		if (rl->r_len == UINT64_MAX) {
 			/*
 			 * We overlocked the file because this write will cause
 			 * the file block size to increase.
 			 * Note that zp_size cannot change with this lock held.
 			 */
 			woff = zp->z_size;
 		}
 		uio->uio_loffset = woff;
 	} else {
 		/*
 		 * Note that if the file block size will change as a result of
 		 * this write, then this range lock will lock the entire file
 		 * so that we can re-write the block safely.
 		 */
 		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 	}
 
 	/*
 	 * NOTE(review): this EFBIG is returned without SET_ERROR(), unlike
 	 * the EFBIG returned just below.
 	 */
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (EFBIG);
 	}
 
 	if (woff >= limit) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EFBIG));
 	}
 
 	/* Trim the request so that it ends at the offset limit. */
 	if ((woff + n) > limit || woff > (limit - n))
 		n = limit - woff;
 
 	/* Will this write extend the file length? */
 	write_eof = (woff + n > zp->z_size);
 
 	end_size = MAX(zp->z_size, woff + n);
 
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
 	 * in a separate transaction; this keeps the intent log records small
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
 		abuf = NULL;
 		woff = uio->uio_loffset;
 		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 			/*
 			 * NOTE(review): abuf was set to NULL at the top of
 			 * this iteration, so this return of the buffer looks
 			 * unreachable.
 			 */
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
 			error = SET_ERROR(EDQUOT);
 			break;
 		}
 
 		if (xuio && abuf == NULL) {
 			ASSERT(i_iov < iovcnt);
 			aiov = &iovp[i_iov];
 			abuf = dmu_xuio_arcbuf(xuio, i_iov);
 			dmu_xuio_clear(xuio, i_iov);
 			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
 			    iovec_t *, aiov, arc_buf_t *, abuf);
 			ASSERT((aiov->iov_base == abuf->b_data) ||
 			    ((char *)aiov->iov_base - (char *)abuf->b_data +
 			    aiov->iov_len == arc_buf_size(abuf)));
 			i_iov++;
 		} else if (abuf == NULL && n >= max_blksz &&
 		    woff >= zp->z_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
 			/*
 			 * This write covers a full block.  "Borrow" a buffer
 			 * from the dmu so that we can fill it before we enter
 			 * a transaction.  This avoids the possibility of
 			 * holding up the transaction if the data copy hangs
 			 * up on a pagefault (e.g., from an NFS server mapping).
 			 */
 			size_t cbytes;
 
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    max_blksz);
 			ASSERT(abuf != NULL);
 			ASSERT(arc_buf_size(abuf) == max_blksz);
 			if (error = uiocopy(abuf->b_data, max_blksz,
 			    UIO_WRITE, uio, &cbytes)) {
 				dmu_return_arcbuf(abuf);
 				break;
 			}
 			ASSERT(cbytes == max_blksz);
 		}
 
 		/*
 		 * Start a transaction.
 		 */
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
 			break;
 		}
 
 		/*
 		 * If zfs_range_lock() over-locked we grow the blocksize
 		 * and then reduce the lock range.  This will only happen
 		 * on the first iteration since zfs_range_reduce() will
 		 * shrink down r_len to the appropriate size.
 		 */
 		if (rl->r_len == UINT64_MAX) {
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
 				/*
 				 * File's blocksize is already larger than the
 				 * "recordsize" property.  Only let it grow to
 				 * the next power of 2.
 				 */
 				ASSERT(!ISP2(zp->z_blksz));
 				new_blksz = MIN(end_size,
 				    1 << highbit64(zp->z_blksz));
 			} else {
 				new_blksz = MIN(end_size, max_blksz);
 			}
 			zfs_grow_blocksize(zp, new_blksz, tx);
 			zfs_range_reduce(rl, woff, n);
 		}
 
 		/*
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
 		if (woff + nbytes > zp->z_size)
 			vnode_pager_setsize(vp, woff + nbytes);
 
 		if (abuf == NULL) {
 			/*
 			 * Copy path: tx_bytes is the amount by which the
 			 * uio residual shrank across the DMU write.
 			 */
 			tx_bytes = uio->uio_resid;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes, tx);
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
 			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 			/*
 			 * If this is not a full block write, but we are
 			 * extending the file past EOF and this data starts
 			 * block-aligned, use assign_arcbuf().  Otherwise,
 			 * write via dmu_write().
 			 */
 			if (tx_bytes < max_blksz && (!write_eof ||
 			    aiov->iov_base != abuf->b_data)) {
 				ASSERT(xuio);
 				dmu_write(zfsvfs->z_os, zp->z_id, woff,
 				    aiov->iov_len, aiov->iov_base, tx);
 				dmu_return_arcbuf(abuf);
 				xuio_stat_wbuf_copied();
 			} else {
 				ASSERT(xuio || tx_bytes == max_blksz);
 				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
 				    woff, abuf, tx);
 			}
 			ASSERT(tx_bytes <= uio->uio_resid);
 			uioskip(uio, tx_bytes);
 		}
 		/* Keep any memory-mapped pages in sync with what was written. */
 		if (tx_bytes && vn_has_cached_data(vp)) {
 			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
 			    zp->z_id, uio->uio_segflg, tx);
 		}
 
 		/*
 		 * If we made no progress, we're done.  If we made even
 		 * partial progress, update the znode and ZIL accordingly.
 		 */
 		if (tx_bytes == 0) {
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 			    (void *)&zp->z_size, sizeof (uint64_t), tx);
 			dmu_tx_commit(tx);
 			ASSERT(error != 0);
 			break;
 		}
 
 		/*
 		 * Clear Set-UID/Set-GID bits on successful write if not
 		 * privileged and at least one of the execute bits is set.
 		 *
 		 * It would be nice to do this after all writes have
 		 * been done, but that would still expose the ISUID/ISGID
 		 * to another app after the partial write is committed.
 		 *
 		 * Note: we don't call zfs_fuid_map_id() here because
 		 * user 0 is not an ephemeral uid.
 		 */
 		mutex_enter(&zp->z_acl_lock);
 		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
 		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 		    secpolicy_vnode_setid_retain(vp, cr,
 		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
 			uint64_t newmode;
 			zp->z_mode &= ~(S_ISUID | S_ISGID);
 			newmode = zp->z_mode;
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 			    (void *)&newmode, sizeof (uint64_t), tx);
 		}
 		mutex_exit(&zp->z_acl_lock);
 
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 
 		/*
 		 * Update the file size (zp_size) if it has changed;
 		 * account for possible concurrent updates.
 		 */
 		while ((end_size = zp->z_size) < uio->uio_loffset) {
 			(void) atomic_cas_64(&zp->z_size, end_size,
 			    uio->uio_loffset);
#ifdef illumos
 			ASSERT(error == 0);
#else
 			ASSERT(error == 0 || error == EFAULT);
#endif
 		}
 		/*
 		 * If we are replaying and eof is non zero then force
 		 * the file size to the specified eof. Note, there's no
 		 * concurrency during replay.
 		 */
 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 			zp->z_size = zfsvfs->z_replay_eof;
 
 		/*
 		 * The attributes are updated either way; an error already
 		 * recorded for this chunk is kept rather than overwritten.
 		 */
 		if (error == 0)
 			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		else
 			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
 
 		if (error != 0)
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 
#ifdef illumos
 		if (!xuio && n > 0)
 			uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
 	}
 
 	zfs_range_unlock(rl);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
 	 * Otherwise, it's at least a partial write, so it's successful.
 	 */
 	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
#ifdef __FreeBSD__
 	/*
 	 * EFAULT means that at least one page of the source buffer was not
 	 * available.  VFS will re-try remaining I/O upon this error.
 	 */
 	if (error == EFAULT) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
#endif
 
 	if (ioflag & (FSYNC | FDSYNC) ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, zp->z_id);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 void
 zfs_get_done(zgd_t *zgd, int error)
 {
 	znode_t *znode = zgd->zgd_private;
 	objset_t *objset = znode->z_zfsvfs->z_os;
 
 	/* Drop the dbuf hold recorded in the zgd, if there is one. */
 	if (zgd->zgd_db != NULL)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
 	zfs_range_unlock(zgd->zgd_rl);
 
 	/*
 	 * The txg is currently stopped from syncing, so the vnode
 	 * reference is dropped asynchronously.
 	 */
 	VN_RELE_ASYNC(ZTOV(znode),
 	    dsl_pool_vnrele_taskq(dmu_objset_pool(objset)));
 
 	/* Hand the block pointer to the lwb only when the sync succeeded. */
 	if (zgd->zgd_bp != NULL && error == 0)
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 
 	kmem_free(zgd, sizeof (*zgd));
 }
 
 #ifdef DEBUG
 static int zil_fault_io = 0;
 #endif
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  *
  *	IN:	arg	- the zfsvfs_t of the file system.
  *		lr	- write log record; supplies the object id, offset
  *			  and length, and receives the block pointer for
  *			  indirect writes.
  *		buf	- destination for the data of an immediate write;
  *			  NULL selects an indirect write.
  *		lwb	- log write block, stored in the zgd.
  *		zio	- passed to dmu_sync() for indirect writes.
  *
  *	RETURN:	0 on success, ENOENT if the file is gone or the offset is
  *		at or past the current file size, otherwise the error from
  *		the DMU.
  */
 int
 zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
 	znode_t *zp;
 	uint64_t object = lr->lr_foid;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error = 0;
 
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3P(zio, !=, NULL);
 	ASSERT3U(size, !=, 0);
 
 	/*
 	 * Nothing to do if the file has been removed
 	 */
 	if (zfs_zget(zfsvfs, object, &zp) != 0)
 		return (SET_ERROR(ENOENT));
 	if (zp->z_unlinked) {
 		/*
 		 * Release the vnode asynchronously as we currently have the
 		 * txg stopped from syncing.
 		 */
 		VN_RELE_ASYNC(ZTOV(zp),
 		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 		return (SET_ERROR(ENOENT));
 	}
 
 	/*
 	 * The zgd carries everything zfs_get_done() needs to undo: the
 	 * znode, the range lock and, for indirect writes, the dbuf and bp.
 	 */
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_lwb = lwb;
 	zgd->zgd_private = zp;
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
 		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (offset >= zp->z_size) {
 			error = SET_ERROR(ENOENT);
 		} else {
 			error = dmu_read(os, object, offset, size, buf,
 			    DMU_READ_NO_PREFETCH);
 		}
 		ASSERT(error == 0 || error == ENOENT);
 	} else { /* indirect write */
 		/*
 		 * Have to lock the whole block to ensure when it's
 		 * written out and its checksum is being calculated
 		 * that no one can change the data. We need to re-check
 		 * blocksize after we get the lock in case it's changed!
 		 */
 		for (;;) {
 			uint64_t blkoff;
 			size = zp->z_blksz;
 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
 			offset -= blkoff;
 			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
 			    RL_READER);
 			if (zp->z_blksz == size)
 				break;
 			/* Block size changed: undo the rounding and retry. */
 			offset += blkoff;
 			zfs_range_unlock(zgd->zgd_rl);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (lr->lr_offset >= zp->z_size)
 			error = SET_ERROR(ENOENT);
#ifdef DEBUG
 		if (zil_fault_io) {
 			error = SET_ERROR(EIO);
 			zil_fault_io = 0;
 		}
#endif
 		if (error == 0)
 			error = dmu_buf_hold(os, object, offset, zgd, &db,
 			    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
 			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
 			ASSERT(db->db_offset == offset);
 			ASSERT(db->db_size == size);
 
 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
 			    zfs_get_done, zgd);
 			ASSERT(error || lr->lr_length <= size);
 
 			/*
 			 * On success, we need to wait for the write I/O
 			 * initiated by dmu_sync() to complete before we can
 			 * release this dbuf.  We will finish everything up
 			 * in the zfs_get_done() callback.
 			 */
 			if (error == 0)
 				return (0);
 
 			if (error == EALREADY) {
 				lr->lr_common.lrc_txtype = TX_WRITE2;
 				/*
 				 * TX_WRITE2 relies on the data previously
 				 * written by the TX_WRITE that caused
 				 * EALREADY.  We zero out the BP because
 				 * it is the old, currently-on-disk BP,
 				 * so there's no need to zio_flush() its
 				 * vdevs (flushing would needlessly hurt
 				 * performance, and doesn't work on
 				 * indirect vdevs).
 				 */
 				zgd->zgd_bp = NULL;
 				BP_ZERO(bp);
 				error = 0;
 			}
 		}
 	}
 
 	/*
 	 * Only a successful dmu_sync() returns early above; every other
 	 * path releases the zgd and what it holds here.
 	 */
 	zfs_get_done(zgd, error);
 
 	return (error);
 }
 
 /*
  * Check access to a vnode.  When any V_ACE_MASK bit is set in flag the
  * request is passed to zfs_zaccess(); otherwise it is passed to
  * zfs_zaccess_rwx().  Returns whatever the selected checker returns.
  */
 /*ARGSUSED*/
 static int
 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int rc;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	rc = (flag & V_ACE_MASK) ?
 	    zfs_zaccess(zp, mode, flag, B_FALSE, cr) :
 	    zfs_zaccess_rwx(zp, mode, flag, cr);
 
 	ZFS_EXIT(zfsvfs);
 	return (rc);
 }
 
 /*
  * Callback handed to vn_vget_ino_gen() by zfs_lookup_lock(): arg is the
  * vnode to return.  It is stored in *vpp and locked with lkflags; if the
  * lock attempt fails the vnode is released with vrele() and the error is
  * returned.
  */
 static int
 zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
 {
 	struct vnode *target = arg;
 	int rc;
 
 	*vpp = target;
 	rc = vn_lock(target, lkflags);
 	if (rc == 0)
 		return (0);
 
 	vrele(target);
 	return (rc);
 }
 
 /*
  * Lock the vnode vp that a lookup of name in directory dvp produced,
  * using the lock type requested in lkflags.  Three cases are handled:
  * "" or "." (vp is dvp itself), ".." and any other name.
  *
  * Returns 0 on success.  On failure the reference on the result vnode
  * is dropped in the "." and ordinary cases; the ".." case returns
  * whatever vn_vget_ino_gen() returns.
  */
 static int
 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
 {
 	znode_t *zdp = VTOZ(dvp);
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int error;
 	int ltype;
 
 	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
 	if ((zdp->z_pflags & ZFS_XATTR) == 0)
 		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif
 
 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
 		/* The directory itself: take a reference and fix the lock. */
 		ASSERT3P(dvp, ==, vp);
 		vref(dvp);
 		ltype = lkflags & LK_TYPE_MASK;
 		if (ltype == VOP_ISLOCKED(dvp))
 			return (0);
 
 		/* Anything other than LK_EXCLUSIVE is treated as LK_SHARED. */
 		vn_lock(dvp, (ltype == LK_EXCLUSIVE ?
 		    LK_UPGRADE : LK_DOWNGRADE) | LK_RETRY);
 
 		/*
 		 * Relock for the "." case could leave us with
 		 * reclaimed vnode.
 		 */
 		if (dvp->v_iflag & VI_DOOMED) {
 			vrele(dvp);
 			return (SET_ERROR(ENOENT));
 		}
 		return (0);
 	}
 
 	if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
 		/*
 		 * Note that in this case, dvp is the child vnode, and we
 		 * are looking up the parent vnode - exactly reverse from
 		 * normal operation.  Unlocking dvp requires some rather
 		 * tricky unlock/relock dance to prevent mp from being freed;
 		 * use vn_vget_ino_gen() which takes care of all that.
 		 *
 		 * XXX Note that there is a time window when both vnodes are
 		 * unlocked.  It is possible, although highly unlikely, that
 		 * during that window the parent-child relationship between
 		 * the vnodes may change, for example, get reversed.
 		 * In that case we would have a wrong lock order for the vnodes.
 		 * All other filesystems seem to ignore this problem, so we
 		 * do the same here.
 		 * A potential solution could be implemented as follows:
 		 * - using LK_NOWAIT when locking the second vnode and retrying
 		 *   if necessary
 		 * - checking that the parent-child relationship still holds
 		 *   after locking both vnodes and retrying if it doesn't
 		 */
 		return (vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags,
 		    &vp));
 	}
 
 	/* Ordinary child: lock it, dropping the reference on failure. */
 	error = vn_lock(vp, lkflags);
 	if (error != 0)
 		vrele(vp);
 	return (error);
 }
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
  *
  *	IN:	dvp	- vnode of directory to search.
  *		nm	- name of entry to lookup.
  *		cnp	- componentname of the lookup; its cn_flags,
  *			  cn_lkflags and cn_nameiop are consulted, and
  *			  SAVENAME may be set in cn_flags.
  *		nameiop	- operation the lookup is for (compared against
  *			  LOOKUP, CREATE, RENAME and DELETE below).
  *		cr	- credentials of caller.
  *		td	- thread pointer [UNUSED].
  *		flags	- LOOKUP_XATTR set if looking for an attribute.
  *
  *	OUT:	vpp	- vnode of located entry, NULL if not found.
  *
  *	RETURN:	0 on success, error code on failure.  ENOENT on the last
  *		component of a CREATE or RENAME is returned as EJUSTRETURN.
  *
  * Timestamps:
  *	NA
  */
 /* ARGSUSED */
 static int
 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
     int nameiop, cred_t *cr, kthread_t *td, int flags)
 {
 	znode_t *zdp = VTOZ(dvp);
 	znode_t *zp;
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error = 0;
 
 	/*
 	 * Fast path lookup, however we must skip DNLC lookup
 	 * for case folding or normalizing lookups because the
 	 * DNLC code only stores the passed in name.  This means
 	 * creating 'a' and removing 'A' on a case insensitive
 	 * file system would work, but DNLC still thinks 'a'
 	 * exists and won't let you create it again on the next
 	 * pass through fast path.
 	 */
 	if (!(flags & LOOKUP_XATTR)) {
 		if (dvp->v_type != VDIR) {
 			return (SET_ERROR(ENOTDIR));
 		} else if (zdp->z_sa_hdl == NULL) {
 			return (SET_ERROR(EIO));
 		}
 	}
 
 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zdp);
 
 	*vpp = NULL;
 
 	if (flags & LOOKUP_XATTR) {
#ifdef TODO
 		/*
 		 * If the xattr property is off, refuse the lookup request.
 		 */
 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
#endif
 
 		/*
 		 * We don't allow recursive attributes..
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_pflags & ZFS_XATTR) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
 		    B_FALSE, cr)) {
 			vrele(*vpp);
 			*vpp = NULL;
 		}
 
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Check accessibility of directory.
 	 */
 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* On a UTF-8 file system, reject names that fail validation. */
 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 
 	/*
 	 * First handle the special cases.
 	 */
 	if ((cnp->cn_flags & ISDOTDOT) != 0) {
 		/*
 		 * If we are a snapshot mounted under .zfs, return
 		 * the vp for the snapshot directory.
 		 */
 		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
 			struct componentname cn;
 			vnode_t *zfsctl_vp;
 			int ltype;
 
 			/*
 			 * dvp is unlocked while the control directory of
 			 * the parent file system is looked up, and relocked
 			 * with its previous lock type afterwards.
 			 */
 			ZFS_EXIT(zfsvfs);
 			ltype = VOP_ISLOCKED(dvp);
 			VOP_UNLOCK(dvp, 0);
 			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
 			    &zfsctl_vp);
 			if (error == 0) {
 				cn.cn_nameptr = "snapshot";
 				cn.cn_namelen = strlen(cn.cn_nameptr);
 				cn.cn_nameiop = cnp->cn_nameiop;
 				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
 				cn.cn_lkflags = cnp->cn_lkflags;
 				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
 				vput(zfsctl_vp);
 			}
 			vn_lock(dvp, ltype | LK_RETRY);
 			return (error);
 		}
 	}
 	/* The control directory name is only accepted for plain LOOKUPs. */
 	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
 		ZFS_EXIT(zfsvfs);
 		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 			return (SET_ERROR(ENOTSUP));
 		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
 		return (error);
 	}
 
 	/*
 	 * The loop is to retry the lookup if the parent-child relationship
 	 * changes during the dot-dot locking complexities.
 	 */
 	for (;;) {
 		uint64_t parent;
 
 		error = zfs_dirlook(zdp, nm, &zp);
 		if (error == 0)
 			*vpp = ZTOV(zp);
 
 		ZFS_EXIT(zfsvfs);
 		if (error != 0)
 			break;
 
 		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
 		if (error != 0) {
 			/*
 			 * If we've got a locking error, then the vnode
 			 * got reclaimed because of a force unmount.
 			 * We never enter doomed vnodes into the name cache.
 			 */
 			*vpp = NULL;
 			return (error);
 		}
 
 		if ((cnp->cn_flags & ISDOTDOT) == 0)
 			break;
 
 		/*
 		 * For "..", re-read the parent attribute of dvp and accept
 		 * the result only if it still names the vnode just locked.
 		 */
 		ZFS_ENTER(zfsvfs);
 		if (zdp->z_sa_hdl == NULL) {
 			error = SET_ERROR(EIO);
 		} else {
 			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 			    &parent, sizeof (parent));
 		}
 		if (error != 0) {
 			ZFS_EXIT(zfsvfs);
 			vput(ZTOV(zp));
 			break;
 		}
 		if (zp->z_id == parent) {
 			ZFS_EXIT(zfsvfs);
 			break;
 		}
 		vput(ZTOV(zp));
 	}
 
 /* NOTE(review): no goto in this function targets this label. */
out:
 	if (error != 0)
 		*vpp = NULL;
 
 	/* Translate errors and add SAVENAME when needed. */
 	if (cnp->cn_flags & ISLASTCN) {
 		switch (nameiop) {
 		case CREATE:
 		case RENAME:
 			if (error == ENOENT) {
 				error = EJUSTRETURN;
 				cnp->cn_flags |= SAVENAME;
 				break;
 			}
 			/* FALLTHROUGH */
 		case DELETE:
 			if (error == 0)
 				cnp->cn_flags |= SAVENAME;
 			break;
 		}
 	}
 
 	/* Insert name into cache (as non-existent) if appropriate. */
 	if (zfsvfs->z_use_namecache &&
 	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(dvp, NULL, cnp);
 
 	/* Insert name into cache if appropriate. */
 	if (zfsvfs->z_use_namecache &&
 	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 		if (!(cnp->cn_flags & ISLASTCN) ||
 		    (nameiop != DELETE && nameiop != RENAME)) {
 			cache_enter(dvp, *vpp, cnp);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory.  The name is looked up
  * with ZNEW and any lookup error is returned unchanged; this function
  * has no truncate-existing-file path.  On success the vnode of the
  * newly created file is returned through vpp.
  *
  *	IN:	dvp	- vnode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file.
  *		excl	- exclusive-mode flag [not referenced in the body].
  *		mode	- open mode [not referenced in the body].
  *		cr	- credentials of caller.
  *		td	- calling thread [not referenced in the body].
  *
  *	OUT:	vpp	- vnode of created entry.  Set to NULL once the
  *			  argument checks pass; filled in only when 0 is
  *			  returned.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated if new entry created
  *	 vp - ctime|mtime always, atime if new
  */
 
 /* ARGSUSED */
 static int
 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
     vnode_t **vpp, cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	objset_t	*os;
 	dmu_tx_t	*tx;
 	int		error;
 	ksid_t		*ksid;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	/* No ACL is passed in by the caller; vsecp is never reassigned. */
 	void		*vsecp = NULL;
 	/* flag is not referenced anywhere below. */
 	int		flag = 0;
 	uint64_t	txtype;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	/* Use the owner SID's id when the credential carries one. */
 	ksid = crgetsid(cr, KSID_OWNER);
 	if (ksid)
 		uid = ksid_getid(ksid);
 	else
 		uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	os = zfsvfs->z_os;
 	zilog = zfsvfs->z_log;
 
 	/* On utf8-only file systems, reject names that fail validation. */
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	*vpp = NULL;
 
 	/* Clear the sticky bit when the policy check returns non-zero. */
 	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 		vap->va_mode &= ~S_ISVTX;
 
 	/*
 	 * ZNEW lookup: an error is returned as-is.  On success no znode
 	 * is expected back, which the assertion below checks.
 	 */
 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 	if (error) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	ASSERT3P(zp, ==, NULL);
 
 	/*
 	 * Create a new file object and update the directory
 	 * to reference it.
 	 */
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 		goto out;
 	}
 
 	/*
 	 * We only support the creation of regular files in
 	 * extended attribute directories.
 	 */
 
 	if ((dzp->z_pflags & ZFS_XATTR) &&
 	    (vap->va_type != VREG)) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	if ((error = zfs_acl_ids_create(dzp, 0, vap,
 	    cr, vsecp, &acl_ids)) != 0)
 		goto out;
 
 	/* From here on acl_ids must be freed on every exit path. */
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		error = SET_ERROR(EDQUOT);
 		goto out;
 	}
 
 	/*
 	 * Paired with getnewvnode_drop_reserve() on both the
 	 * assignment-failure path and the success path below.
 	 */
 	getnewvnode_reserve(1);
 
 	tx = dmu_tx_create(os);
 
 	/* Register all holds on the transaction before assigning it. */
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa &&
 	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, acl_ids.z_aclp->z_acl_bytes);
 	}
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		/* Assignment failed: release everything and return directly. */
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/* The return value of zfs_link_create() is deliberately ignored. */
 	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
 	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 	    vsecp, acl_ids.z_fuidp, vap);
 	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 out:
 	/*
 	 * Every "goto out" above carries a non-zero error, so *vpp is
 	 * only filled in when the create path ran to completion.
 	 */
 	if (error == 0) {
 		*vpp = ZTOV(zp);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dvp	- vnode of directory to remove entry from.
  *		vp	- vnode of the entry being removed.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime
  *	 vp - ctime (if nlink > 0)
  */
 
 /*ARGSUSED*/
 static int
 zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp = VTOZ(vp);
 	znode_t		*xzp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	xattr_obj;
 	uint64_t	obj = 0;
 	dmu_tx_t	*tx;
 	boolean_t	unlinked;
 	uint64_t	txtype;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	ZFS_VERIFY_ZP(zp);
 	zilog = zfsvfs->z_log;
 	zp = VTOZ(vp);
 
 	xattr_obj = 0;
 	xzp = NULL;
 
 	if ((error = zfs_zaccess_delete(dzp, zp, cr)) != 0) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (vp->v_type == VDIR) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	vnevent_remove(vp, dvp, name, ct);
 
 	obj = zp->z_id;
 
 	/* are there any extended attributes? */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		/* xzp is released with vrele() on every exit path below. */
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 		ASSERT0(error);
 	}
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 
 	if (xzp) {
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	/*
 	 * Mark this transaction as typically resulting in a net free of space
 	 */
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		/*
 		 * This path does not go through "out", so drop the xattr
 		 * directory reference here to avoid leaking it.
 		 */
 		if (xzp)
 			vrele(ZTOV(xzp));
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
 
 	if (error) {
 		/* The assigned transaction is committed even on failure. */
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	if (unlinked) {
 		zfs_unlinked_add(zp, tx);
 		vp->v_vflag |= VV_NOSYNC;
 	}
 
 	txtype = TX_REMOVE;
 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
 
 	dmu_tx_commit(tx);
 out:
 
 	if (xzp)
 		vrele(ZTOV(xzp));
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Create a new directory and insert it into dvp using the name
  * provided.  Return a pointer to the inserted directory.
  *
  *	IN:	dvp	- vnode of directory to add subdir to.
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory; va_type is asserted
  *			  to be VDIR.
  *		cr	- credentials of caller.
  *
  *	OUT:	vpp	- vnode of created directory.  Set to NULL before
  *			  the existence check; filled in after the new
  *			  node has been linked into dvp.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  *	 vp - ctime|mtime|atime updated
  */
 /*ARGSUSED*/
 static int
 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
 	ksid_t		*ksid;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 
 	ASSERT(vap->va_type == VDIR);
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	/* Use the owner SID's id when the credential carries one. */
 	ksid = crgetsid(cr, KSID_OWNER);
 	if (ksid)
 		uid = ksid_getid(ksid);
 	else
 		uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    ((vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/* Directories cannot be created inside a ZFS_XATTR directory. */
 	if (dzp->z_pflags & ZFS_XATTR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* On utf8-only file systems, reject names that fail validation. */
 	if (zfsvfs->z_utf8 && u8_validate(dirname,
 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/* From here on acl_ids must be freed on every exit path. */
 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
 	    NULL, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * First make sure the new directory doesn't exist.
 	 *
 	 * Existence is checked first to make sure we don't return
 	 * EACCES instead of EEXIST which can cause some applications
 	 * to fail.
 	 */
 	*vpp = NULL;
 
 	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	ASSERT3P(zp, ==, NULL);
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
 	/*
 	 * The reserve is paired with getnewvnode_drop_reserve() on both
 	 * the assignment-failure path and the success path below.
 	 */
 	getnewvnode_reserve(1);
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		/* Assignment failed: release everything and return. */
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * Now put new name in parent dir.
 	 */
 	/* The return value of zfs_link_create() is deliberately ignored. */
 	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
 
 	*vpp = ZTOV(zp);
 
 	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
 	    acl_ids.z_fuidp, vap);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Remove the directory vp, known by name, from its parent directory dvp.
  *
  *	IN:	dvp	- vnode of directory to remove from.
  *		vp	- vnode of the directory being removed.
  *		name	- name of directory to be removed.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	ZFS_VERIFY_ZP(zp);
 	zilog = zfsvfs->z_log;
 
 	/* The caller must be allowed to delete zp from dzp. */
 	error = zfs_zaccess_delete(dzp, zp, cr);
 	if (error != 0)
 		goto out;
 
 	/* Only directories may be removed through this entry point. */
 	if (vp->v_type != VDIR) {
 		error = SET_ERROR(ENOTDIR);
 		goto out;
 	}
 
 	vnevent_rmdir(vp, dvp, name, ct);
 
 	/* Build the transaction: declare every hold, then assign it. */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	cache_purge(dvp);
 
 	/* Unlink the entry; log the removal only if the unlink worked. */
 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
 	if (!error) {
 		uint64_t txtype = TX_RMDIR;
 
 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
 	}
 
 	/* The transaction is committed whether or not the unlink worked. */
 	dmu_tx_commit(tx);
 
 	cache_purge(vp);
 
 out:
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Read as many directory entries as will fit into the provided
  * buffer from the given directory cursor position (specified in
  * the uio structure).
  *
  *	IN:	vp	- vnode of directory to read.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		cr	- credentials of caller.
  *		ncookies - if non-NULL, a cookie array is allocated and
  *			  the number of cookies filled in is stored here.
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *		eofp	- set to true if end-of-file detected; may be NULL,
  *			  in which case a local variable is used instead.
  *		cookies	- receives the allocated cookie array (one offset
  *			  per loop iteration); freed and reset to NULL on
  *			  error.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap is always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  */
 /* ARGSUSED */
 static int
 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
 {
 	znode_t		*zp = VTOZ(vp);
 	iovec_t		*iovp;
 	edirent_t	*eodp;
 	dirent64_t	*odp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	caddr_t		outbuf;
 	size_t		bufsize;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	uint_t		bytes_wanted;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 	uint64_t	parent;
 	int		local_eof;
 	int		outcount;
 	int		error;
 	uint8_t		prefetch;
 	boolean_t	check_sysattrs;
 	uint8_t		type;
 	/* ncooks is only initialized and used when ncookies != NULL. */
 	int		ncooks;
 	u_long		*cooks = NULL;
 	/*
 	 * flags is never assigned after this point, so every
 	 * "flags & V_RDDIR_*" test below evaluates to zero.
 	 */
 	int		flags = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* The parent object number is needed for the ".." entry. */
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (parent))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * If we are not given an eof variable,
 	 * use a local one.
 	 */
 	if (eofp == NULL)
 		eofp = &local_eof;
 
 	/*
 	 * Check for valid iov_len.
 	 */
 	if (uio->uio_iov->iov_len <= 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
 	if ((*eofp = zp->z_unlinked) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = uio->uio_loffset;
 	prefetch = zp->z_zn_prefetch;
 
 	/*
 	 * Initialize the iterator cursor.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Get space to change directory entries into fs independent format.
 	 */
 	/*
 	 * A single kernel-space iovec is filled in place; anything else
 	 * goes through a temporary buffer that is copied out with
 	 * uiomove() after the loop and freed at "update".
 	 */
 	iovp = uio->uio_iov;
 	bytes_wanted = iovp->iov_len;
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
 		bufsize = bytes_wanted;
 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
 		odp = (struct dirent64 *)outbuf;
 	} else {
 		bufsize = bytes_wanted;
 		outbuf = NULL;
 		odp = (struct dirent64 *)iovp->iov_base;
 	}
 	eodp = (struct edirent *)odp;
 
 	if (ncookies != NULL) {
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 */
 		/* This yields an upper bound on the number of entries. */
 		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
 		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
 		*cookies = cooks;
 		*ncookies = ncooks;
 	}
 	/*
 	 * If this VFS supports the system attribute view interface; and
 	 * we're looking at an extended attribute directory; and we care
 	 * about normalization conflicts on this vfs; then we must check
 	 * for normalization conflicts with the sysattr name space.
 	 */
 #ifdef TODO
 	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
 	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
 	    (flags & V_RDDIR_ENTFLAGS);
 #else
 	check_sysattrs = 0;
 #endif
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	outcount = 0;
 	while (outcount < bytes_wanted) {
 		ino64_t objnum;
 		ushort_t reclen;
 		/*
 		 * Points at the "next offset" field of the entry emitted
 		 * in this iteration; stays NULL when the entry is skipped.
 		 */
 		off64_t *next = NULL;
 
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap.za_name, ".");
 			zap.za_normalization_conflict = 0;
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			zap.za_normalization_conflict = 0;
 			objnum = parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 			zap.za_normalization_conflict = 0;
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.
 			 */
 			/*
 			 * ENOENT from the cursor means end of directory:
 			 * set *eofp and leave the loop normally.  Any
 			 * other error jumps straight to cleanup.
 			 */
 			if (error = zap_cursor_retrieve(&zc, &zap)) {
 				if ((*eofp = (error == ENOENT)) != 0)
 					break;
 				else
 					goto update;
 			}
 
 			/* A directory entry must be a single 8-byte integer. */
 			if (zap.za_integer_length != 8 ||
 			    zap.za_num_integers != 1) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset);
 				error = SET_ERROR(ENXIO);
 				goto update;
 			}
 
 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 			/*
 			 * MacOS X can extract the object type here such as:
 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 			 */
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 
 			if (check_sysattrs && !zap.za_normalization_conflict) {
 #ifdef TODO
 				zap.za_normalization_conflict =
 				    xattr_sysattr_casechk(zap.za_name);
 #else
 				panic("%s:%u: TODO", __func__, __LINE__);
 #endif
 			}
 		}
 
 		if (flags & V_RDDIR_ACCFILTER) {
 			/*
 			 * If we have no access at all, don't include
 			 * this entry in the returned information
 			 */
 			znode_t	*ezp;
 			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
 				goto skip_entry;
 			if (!zfs_has_access(ezp, cr)) {
 				vrele(ZTOV(ezp));
 				goto skip_entry;
 			}
 			vrele(ZTOV(ezp));
 		}
 
 		if (flags & V_RDDIR_ENTFLAGS)
 			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
 		else
 			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
 
 		/*
 		 * Will this entry fit in the buffer?
 		 */
 		if (outcount + reclen > bufsize) {
 			/*
 			 * Did we manage to fit anything in the buffer?
 			 */
 			if (!outcount) {
 				error = SET_ERROR(EINVAL);
 				goto update;
 			}
 			break;
 		}
 		if (flags & V_RDDIR_ENTFLAGS) {
 			/*
 			 * Add extended flag entry:
 			 */
 			eodp->ed_ino = objnum;
 			eodp->ed_reclen = reclen;
-			/* NOTE: ed_off is the offset for the *next* entry */
-			next = &(eodp->ed_off);
+			/* NOTE: ed_off is the offset for the *next* entry. */
+			next = &eodp->ed_off;
 			eodp->ed_eflags = zap.za_normalization_conflict ?
 			    ED_CASE_CONFLICT : 0;
 			(void) strncpy(eodp->ed_name, zap.za_name,
 			    EDIRENT_NAMELEN(reclen));
 			eodp = (edirent_t *)((intptr_t)eodp + reclen);
 		} else {
 			/*
 			 * Add normal entry:
 			 */
 			odp->d_ino = objnum;
 			odp->d_reclen = reclen;
 			odp->d_namlen = strlen(zap.za_name);
+			/* NOTE: d_off is the offset for the *next* entry. */
+			next = &odp->d_off;
 			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
 			odp->d_type = type;
 			odp = (dirent64_t *)((intptr_t)odp + reclen);
 		}
 		outcount += reclen;
 
 		ASSERT(outcount <= bufsize);
 
 		/* Prefetch znode */
 		if (prefetch)
 			dmu_prefetch(os, objnum, 0, 0, 0,
 			    ZIO_PRIORITY_SYNC_READ);
 
 	skip_entry:
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 */
 		/*
 		 * The special entries at offsets 0..2 advance by one;
 		 * everything else advances the ZAP cursor and uses its
 		 * serialized form as the new offset.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 
+		/* Fill the offset right after advancing the cursor. */
+		if (next != NULL)
+			*next = offset;
 		if (cooks != NULL) {
 			*cooks++ = offset;
 			ncooks--;
 			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
 		}
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 	/* Subtract unused cookies */
 	if (ncookies != NULL)
 		*ncookies -= ncooks;
 
 	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
 		/* Entries were written in place; just account for them. */
 		iovp->iov_base += outcount;
 		iovp->iov_len -= outcount;
 		uio->uio_resid -= outcount;
 	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
 		/*
 		 * Reset the pointer.
 		 */
 		/* The copy-out failed, so the caller's offset is kept. */
 		offset = uio->uio_loffset;
 	}
 
 update:
 	zap_cursor_fini(&zc);
 	/* Same condition as the kmem_alloc() above. */
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 		kmem_free(outbuf, bufsize);
 
 	/* ENOENT is not reported to the caller. */
 	if (error == ENOENT)
 		error = 0;
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	uio->uio_loffset = offset;
 	ZFS_EXIT(zfsvfs);
 	/* On failure the cookie array is not handed back to the caller. */
 	if (error != 0 && cookies != NULL) {
 		free(*cookies, M_TEMP);
 		*cookies = NULL;
 		*ncookies = 0;
 	}
 	return (error);
 }
 
 /* Value that zfs_fsync() stores under zfs_fsyncer_key via tsd_set(). */
 ulong_t zfs_fsync_sync_cnt = 4;
 
 /*
  * Commit the intent log for vp's object.  The thread-specific
  * zfs_fsyncer_key slot is always set to zfs_fsync_sync_cnt first; the
  * commit itself is skipped when the objset's os_sync is
  * ZFS_SYNC_DISABLED.  syncflag, cr and ct are not referenced in the
  * body.  The explicit returns in this function are all 0.
  */
 static int
 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 
 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
 
 	/* Nothing to commit when sync is disabled for this objset. */
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_DISABLED)
 		return (0);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	zil_commit(zfsvfs->z_log, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 
 /*
  * Get the requested file attributes and place them in the provided
  * vattr structure.
  *
  *	IN:	vp	- vnode of file.
  *		vap	- va_mask identifies requested attributes.
  *			  If AT_XVATTR set, then optional attrs are requested
  *		flags	- ATTR_NOACLCHECK (CIFS server context)
  *		cr	- credentials of caller.
  *		ct	- caller context [not referenced in the body]
  *
  *	OUT:	vap	- attribute values.
  *
  *	RETURN:	0 on success, error code on failure (the SA bulk lookup
  *		or the ACE_READ_ATTRIBUTES access check can fail).
  */
 /* ARGSUSED */
 static int
 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int	error = 0;
 	uint32_t blksize;
 	u_longlong_t nblocks;
 	/* rdev is only looked up, and only read, for VBLK/VCHR vnodes. */
 	uint64_t mtime[2], ctime[2], crtime[2], rdev;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t *xoap = NULL;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	sa_bulk_attr_t bulk[4];
 	int count = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
 
 	/* Fetch the timestamps (and rdev for device nodes) in one lookup. */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
 		    &rdev, 8);
 
 	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
 	 * Also, if we are the owner don't bother, since owner should
 	 * always be allowed to read basic attributes of file.
 	 */
 	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
 	    (vap->va_uid != crgetuid(cr))) {
 		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
 		    skipaclchk, cr)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * Return all attributes.  It's cheaper to provide the answer
 	 * than to determine whether we were asked the question.
 	 */
 
 	vap->va_type = IFTOVT(zp->z_mode);
 	vap->va_mode = zp->z_mode & ~S_IFMT;
 #ifdef illumos
 	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
 #else
 	vn_fsid(vp, vap);
 #endif
 	vap->va_nodeid = zp->z_id;
 	vap->va_nlink = zp->z_links;
 	/*
 	 * Report one extra link on a root vnode for which
 	 * zfs_show_ctldir() is true, unless the count is already at
 	 * ZFS_LINK_MAX.
 	 */
 	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
 	    zp->z_links < ZFS_LINK_MAX)
 		vap->va_nlink++;
 	vap->va_size = zp->z_size;
 #ifdef illumos
 	vap->va_rdev = vp->v_rdev;
 #else
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		vap->va_rdev = zfs_cmpldev(rdev);
 #endif
 	vap->va_seq = zp->z_seq;
 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
      	vap->va_filerev = zp->z_seq;
 
 	/*
 	 * Add in any requested optional attributes and the create time.
 	 * Also set the corresponding bits in the returned attribute bitmap.
 	 */
 	/*
 	 * Optional attributes are only filled in when vap really is an
 	 * xvattr_t (xva_getxoptattr() returns non-NULL) and the file
 	 * system has z_use_fuids set.  Each requested flag is reported
 	 * as a boolean derived from the matching z_pflags bit.
 	 */
 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 			xoap->xoa_archive =
 			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 			xoap->xoa_readonly =
 			    ((zp->z_pflags & ZFS_READONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_READONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 			xoap->xoa_system =
 			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
 			XVA_SET_RTN(xvap, XAT_SYSTEM);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 			xoap->xoa_hidden =
 			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
 			XVA_SET_RTN(xvap, XAT_HIDDEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			xoap->xoa_nounlink =
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			xoap->xoa_immutable =
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			xoap->xoa_appendonly =
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			xoap->xoa_nodump =
 			    ((zp->z_pflags & ZFS_NODUMP) != 0);
 			XVA_SET_RTN(xvap, XAT_NODUMP);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
 			xoap->xoa_opaque =
 			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
 			XVA_SET_RTN(xvap, XAT_OPAQUE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			xoap->xoa_av_quarantined =
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			xoap->xoa_av_modified =
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 		}
 
 		/* The scanstamp is only fetched for regular files. */
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
 		    vp->v_type == VREG) {
 			zfs_sa_get_scanstamp(zp, xvap);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_REPARSE);
 		}
 		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
 			xoap->xoa_generation = zp->z_gen;
 			XVA_SET_RTN(xvap, XAT_GEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
 			xoap->xoa_offline =
 			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
 			XVA_SET_RTN(xvap, XAT_OFFLINE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
 			xoap->xoa_sparse =
 			    ((zp->z_pflags & ZFS_SPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_SPARSE);
 		}
 	}
 
 	/* atime comes from the znode; the others from the lookup above. */
 	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
 	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
 	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
 
 
 	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
 	vap->va_blksize = blksize;
 	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
 
 	if (zp->z_blksz == 0) {
 		/*
 		 * Block size hasn't been set; suggest maximal I/O transfers.
 		 */
 		vap->va_blksize = zfsvfs->z_max_blksz;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	vp	- vnode of file to be modified.
  *		vap	- new attribute values.
  *			  If AT_XVATTR set, then optional attrs are being set
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *			- ATTR_NOACLCHECK (CIFS context only).
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - ctime updated, mtime updated if size changed.
  */
 /* ARGSUSED */
 static int
 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	xvattr_t	tmpxvattr;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask = 0;
 	uint64_t	saved_mode;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	uint64_t	new_uid, new_gid;
 	uint64_t	xattr_obj;
 	uint64_t	mtime[2], ctime[2];
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err, err2;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	boolean_t	fuid_dirtied = B_FALSE;
 	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
 	int		count = 0, xattr_count = 0;
 
 	if (mask == 0)
 		return (0);
 
 	if (mask & AT_NOSET)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
 	 * that file system is at proper version level
 	 */
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 	    (mask & AT_XVATTR))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (mask & AT_SIZE && vp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EISDIR));
 	}
 
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If this is an xvattr_t, then get a pointer to the structure of
 	 * optional attributes.  If this is NULL, then we have a vattr_t.
 	 */
 	xoap = xva_getxoptattr(xvap);
 
 	xva_init(&tmpxvattr);
 
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	/*
 	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
 	 */
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
 	 * ZFS can handle large timestamps, but 32bit syscalls can't
 	 * handle times greater than 2039.  This check should be removed
 	 * once large timestamps are fully supported.
 	 */
 	if (mask & (AT_ATIME | AT_MTIME)) {
 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EOVERFLOW));
 		}
 	}
 	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
 	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EOVERFLOW));
 	}
 
 	attrzp = NULL;
 	aclp = NULL;
 
 	/* Can this be moved to before the top label? */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EROFS));
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & AT_SIZE) {
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		/* XXX - would it be OK to generate a log record here? */
 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 	}
 
 	if (mask & (AT_ATIME|AT_MTIME) ||
 	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 		    skipaclchk, cr);
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 		int	idmask = (mask & (AT_UID|AT_GID));
 		int	take_owner;
 		int	take_group;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & AT_MODE))
 			vap->va_mode = zp->z_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
 		take_group = (mask & AT_GID) &&
 		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
 
 		/*
 		 * If both AT_UID and AT_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
 		    ((idmask == AT_UID) && take_owner) ||
 		    ((idmask == AT_GID) && take_group)) {
 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
 			    skipaclchk, cr) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				secpolicy_setid_clear(vap, vp, cr);
 				trim_mask = (mask & (AT_UID|AT_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & AT_XVATTR) {
 		/*
 		 * Update xvattr mask to include only those attributes
 		 * that are actually changing.
 		 *
 		 * the bits will be restored prior to actually setting
 		 * the attributes so the caller thinks they were set.
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			if (xoap->xoa_appendonly !=
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
 				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
 				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			if (xoap->xoa_immutable !=
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
 				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			if (xoap->xoa_nodump !=
 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NODUMP);
 				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			if (xoap->xoa_av_modified !=
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			if ((vp->v_type != VREG &&
 			    xoap->xoa_av_quarantined) ||
 			    xoap->xoa_av_quarantined !=
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EPERM));
 		}
 
 		if (need_policy == FALSE &&
 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 			need_policy = TRUE;
 		}
 	}
 
 	if (mask & AT_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
 			err = secpolicy_setid_setsticky_clear(vp, vap,
 			    &oldva, cr);
 			if (err) {
 				ZFS_EXIT(zfsvfs);
 				return (err);
 			}
 			trim_mask |= AT_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Save the mode, as secpolicy_vnode_setattr()
 				 * will overwrite it with ova.va_mode.
 				 */
 				saved_mode = vap->va_mode;
 			}
 		}
 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 
 		if (trim_mask) {
 			vap->va_mask |= saved_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Recover the mode after
 				 * secpolicy_vnode_setattr().
 				 */
 				vap->va_mode = saved_mode;
 			}
 		}
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	if ((mask & (AT_UID | AT_GID))) {
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj, sizeof (xattr_obj));
 
 		if (err == 0 && xattr_obj) {
 			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
 			if (err == 0) {
 				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
 				if (err != 0)
 					vrele(ZTOV(attrzp));
 			}
 			if (err)
 				goto out2;
 		}
 		if (mask & AT_UID) {
 			new_uid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 			if (new_uid != zp->z_uid &&
 			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
 				if (attrzp)
 					vput(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (mask & AT_GID) {
 			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
 			    cr, ZFS_GROUP, &fuidp);
 			if (new_gid != zp->z_gid &&
 			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
 				if (attrzp)
 					vput(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
 
 	if (mask & AT_MODE) {
 		uint64_t pmode = zp->z_mode;
 		uint64_t acl_obj;
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
 		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
 			err = SET_ERROR(EPERM);
 			goto out;
 		}
 
 		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
 			goto out;
 
 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 			/*
 			 * Are we upgrading ACL from old V0 format
 			 * to V1 format?
 			 */
 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
 			    zfs_znode_acl_version(zp) ==
 			    ZFS_ACL_VERSION_INITIAL) {
 				dmu_tx_hold_free(tx, acl_obj, 0,
 				    DMU_OBJECT_END);
 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 				    0, aclp->z_acl_bytes);
 			} else {
 				dmu_tx_hold_write(tx, acl_obj, 0,
 				    aclp->z_acl_bytes);
 			}
 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	} else {
 		if ((mask & AT_XVATTR) &&
 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	}
 
 	if (attrzp) {
 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 	}
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err)
 		goto out;
 
 	count = 0;
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_enter(&zp->z_acl_lock);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_enter(&attrzp->z_acl_lock);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 
 		if (mask & AT_UID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &new_uid, sizeof (new_uid));
 			zp->z_uid = new_uid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
 				    sizeof (new_uid));
 				attrzp->z_uid = new_uid;
 			}
 		}
 
 		if (mask & AT_GID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
 			    NULL, &new_gid, sizeof (new_gid));
 			zp->z_gid = new_gid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
 				    sizeof (new_gid));
 				attrzp->z_gid = new_gid;
 			}
 		}
 		if (!(mask & AT_MODE)) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
 			    NULL, &new_mode, sizeof (new_mode));
 			new_mode = zp->z_mode;
 		}
 		err = zfs_acl_chown_setattr(zp);
 		ASSERT(err == 0);
 		if (attrzp) {
 			err = zfs_acl_chown_setattr(attrzp);
 			ASSERT(err == 0);
 		}
 	}
 
 	if (mask & AT_MODE) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 		    &new_mode, sizeof (new_mode));
 		zp->z_mode = new_mode;
 		ASSERT3U((uintptr_t)aclp, !=, 0);
 		err = zfs_aclset_common(zp, aclp, cr, tx);
 		ASSERT0(err);
 		if (zp->z_acl_cached)
 			zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = aclp;
 		aclp = NULL;
 	}
 
 
 	if (mask & AT_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 		    &zp->z_atime, sizeof (zp->z_atime));
 	}
 
 	if (mask & AT_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    mtime, sizeof (mtime));
 	}
 
 	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
 	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
 		    NULL, mtime, sizeof (mtime));
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 	} else if (mask != 0) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
 		    B_TRUE);
 		if (attrzp) {
 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 			    SA_ZPL_CTIME(zfsvfs), NULL,
 			    &ctime, sizeof (ctime));
 			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
 			    mtime, ctime, B_TRUE);
 		}
 	}
 	/*
 	 * Do this after setting timestamps to prevent timestamp
 	 * update from toggling bit
 	 */
 
 	if (xoap && (mask & AT_XVATTR)) {
 
 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
 			xoap->xoa_createtime = vap->va_birthtime;
 		/*
 		 * restore trimmed off masks
 		 * so that return masks can be set for caller.
 		 */
 
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
 			XVA_SET_REQ(xvap, XAT_NODUMP);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT(vp->v_type == VREG);
 
 		zfs_xvattr_set(zp, xvap, tx);
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_exit(&zp->z_acl_lock);
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_exit(&attrzp->z_acl_lock);
 	}
 out:
 	if (err == 0 && attrzp) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
 		ASSERT(err2 == 0);
 	}
 
 	if (attrzp)
 		vput(ZTOV(attrzp));
 
 	if (aclp)
 		zfs_acl_free(aclp);
 
 	if (fuidp) {
 		zfs_fuid_info_free(fuidp);
 		fuidp = NULL;
 	}
 
 	if (err) {
 		dmu_tx_abort(tx);
 	} else {
 		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
 	}
 
 out2:
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * Re-acquire the vnode locks needed for a rename and re-resolve the source
  * and target entries under those locks.
  *
  * We acquire all but fdvp locks using non-blocking acquisitions.  If we
  * fail to acquire any lock in the path we will drop all held locks,
  * acquire the new lock in a blocking fashion, and then release it and
  * restart the rename.  This acquire/release step ensures that we do not
  * spin on a lock waiting for release.  On error release all vnode locks
  * and decrement references the way tmpfs_rename() would do.
  *
  *	IN:	sdvp	- Source directory vnode; locked here first.
  *		tdvp	- Target directory vnode; unlocked on entry and
  *			  re-locked after sdvp.
  *		scnp	- Source component name, looked up again in sdvp.
  *		tcnp	- Target component name, looked up again in tdvp.
  *
  *	IN/OUT:	svpp	- Source vnode; replaced by the re-resolved vnode.
  *		tvpp	- Target vnode or NULL; unlocked on entry, then
  *			  replaced by the re-resolved vnode or set to NULL.
  *
  *	RETURN:	0 with sdvp, tdvp, *svpp and (if non-NULL) *tvpp locked
  *		exclusively; otherwise an error code with every lock
  *		taken here dropped again.
  */
 static int
 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
     struct vnode *tdvp, struct vnode **tvpp,
     const struct componentname *scnp, const struct componentname *tcnp)
 {
 	zfsvfs_t	*zfsvfs;
 	struct vnode	*nvp, *svp, *tvp;
 	znode_t		*sdzp, *tdzp, *szp, *tzp;
 	const char	*snm = scnp->cn_nameptr;
 	const char	*tnm = tcnp->cn_nameptr;
 	int error;
 
 	/* Drop the locks held on entry so everything can be retaken in order. */
 	VOP_UNLOCK(tdvp, 0);
 	if (*tvpp != NULL && *tvpp != tdvp)
 		VOP_UNLOCK(*tvpp, 0);
 
 relock:
 	/* The source directory is the only lock taken in a blocking fashion. */
 	error = vn_lock(sdvp, LK_EXCLUSIVE);
 	if (error)
 		goto out;
 	sdzp = VTOZ(sdvp);
 
 	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
 	if (error != 0) {
 		VOP_UNLOCK(sdvp, 0);
 		if (error != EBUSY)
 			goto out;
 		/*
 		 * Wait for the contended lock with nothing else held, then
 		 * release it and start over.
 		 */
 		error = vn_lock(tdvp, LK_EXCLUSIVE);
 		if (error)
 			goto out;
 		VOP_UNLOCK(tdvp, 0);
 		goto relock;
 	}
 	tdzp = VTOZ(tdvp);
 
 	/*
 	 * Before using sdzp and tdzp we must ensure that they are live.
 	 * As a porting legacy from illumos we have two things to worry
 	 * about.  One is typical for FreeBSD and it is that the vnode is
 	 * not reclaimed (doomed).  The other is that the znode is live.
 	 * The current code can invalidate the znode without acquiring the
 	 * corresponding vnode lock if the object represented by the znode
 	 * and vnode is no longer valid after a rollback or receive operation.
 	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
 	 * that protects the znodes from the invalidation.
 	 */
 	zfsvfs = sdzp->z_zfsvfs;
 	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
 	 * bypassing the cleanup code in the case of an error.
 	 */
 	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
 		ZFS_EXIT(zfsvfs);
 		VOP_UNLOCK(sdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		error = SET_ERROR(EIO);
 		goto out;
 	}
 
 	/*
 	 * Re-resolve svp to be certain it still exists and fetch the
 	 * correct vnode.
 	 */
 	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
 	if (error != 0) {
 		/* Source entry invalid or not there. */
 		ZFS_EXIT(zfsvfs);
 		VOP_UNLOCK(sdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		/* A source name of "." or ".." is reported as EINVAL instead. */
 		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
 		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
 			error = SET_ERROR(EINVAL);
 		goto out;
 	}
 	svp = ZTOV(szp);
 
 	/*
 	 * Re-resolve tvp, if it disappeared we just carry on.
 	 */
 	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		VOP_UNLOCK(sdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		/* Drop the reference on the source vnode looked up above. */
 		vrele(svp);
 		if ((tcnp->cn_flags & ISDOTDOT) != 0)
 			error = SET_ERROR(EINVAL);
 		goto out;
 	}
 	if (tzp != NULL)
 		tvp = ZTOV(tzp);
 	else
 		tvp = NULL;
 
 	/*
 	 * At present the vnode locks must be acquired before z_teardown_lock,
 	 * although it would be more logical to use the opposite order.
 	 */
 	ZFS_EXIT(zfsvfs);
 
 	/*
 	 * Now try acquire locks on svp and tvp.
 	 */
 	nvp = svp;
 	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
 	if (error != 0) {
 		VOP_UNLOCK(sdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		if (tvp != NULL)
 			vrele(tvp);
 		if (error != EBUSY) {
 			vrele(nvp);
 			goto out;
 		}
 		/* Block on the contended source vnode, then release and retry. */
 		error = vn_lock(nvp, LK_EXCLUSIVE);
 		if (error != 0) {
 			vrele(nvp);
 			goto out;
 		}
 		VOP_UNLOCK(nvp, 0);
 		/*
 		 * Concurrent rename race.
 		 * XXX ?
 		 */
 		if (nvp == tdvp) {
 			vrele(nvp);
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 		/* Hand the re-resolved source vnode to the caller before retrying. */
 		vrele(*svpp);
 		*svpp = nvp;
 		goto relock;
 	}
 	/* Source locked: swap the caller's reference for the re-resolved vnode. */
 	vrele(*svpp);
 	*svpp = nvp;
 
 	/* The caller's old target reference is dropped whether or not a new one exists. */
 	if (*tvpp != NULL)
 		vrele(*tvpp);
 	*tvpp = NULL;
 	if (tvp != NULL) {
 		nvp = tvp;
 		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
 		if (error != 0) {
 			VOP_UNLOCK(sdvp, 0);
 			VOP_UNLOCK(tdvp, 0);
 			VOP_UNLOCK(*svpp, 0);
 			if (error != EBUSY) {
 				vrele(nvp);
 				goto out;
 			}
 			error = vn_lock(nvp, LK_EXCLUSIVE);
 			if (error != 0) {
 				vrele(nvp);
 				goto out;
 			}
 			/* vput() drops both the lock and the reference before the retry. */
 			vput(nvp);
 			goto relock;
 		}
 		*tvpp = nvp;
 	}
 
 	return (0);
 
 out:
 	return (error);
 }
 
 /*
  * Check that moving szp (an entry of sdzp) into tdzp would not place a
  * directory underneath itself.  The parent chain is walked upward from
  * tdzp until szp, sdzp or the filesystem root is found.
  *
  *	RETURN:	EINVAL if tdzp is szp or lies below it; 0 if tdzp is sdzp
  *		or the root, or if the walk reaches the root or sdzp first;
  *		otherwise the error from sa_lookup() or zfs_zget().
  *
  * Note that we must use VN_RELE_ASYNC in this function as it walks
  * up the directory tree and vrele may need to acquire an exclusive
  * lock if a last reference to a vnode is dropped.
  */
 static int
 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
 {
 	zfsvfs_t	*zfsvfs = tdzp->z_zfsvfs;
 	znode_t		*cur, *up;
 	uint64_t	parent_obj;
 	int		error;
 
 	if (tdzp == szp)
 		return (SET_ERROR(EINVAL));
 	if (tdzp == sdzp || tdzp->z_id == zfsvfs->z_root)
 		return (0);
 
 	for (cur = tdzp; ; cur = up) {
 		ASSERT(!cur->z_unlinked);
 		error = sa_lookup(cur->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 		    &parent_obj, sizeof (parent_obj));
 		if (error != 0)
 			break;
 
 		/* The source is an ancestor of the target directory. */
 		if (parent_obj == szp->z_id) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 		/* Reached the root or the source directory: the move is fine. */
 		if (parent_obj == zfsvfs->z_root || parent_obj == sdzp->z_id)
 			break;
 
 		error = zfs_zget(zfsvfs, parent_obj, &up);
 		if (error != 0)
 			break;
 
 		/* tdzp belongs to the caller; only release znodes fetched here. */
 		if (cur != tdzp)
 			VN_RELE_ASYNC(ZTOV(cur),
 			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
 	}
 
 	if (error == ENOTDIR)
 		panic("checkpath: .. not a directory\n");
 	if (cur != tdzp)
 		VN_RELE_ASYNC(ZTOV(cur),
 		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
 	return (error);
 }
 
 /*
  * Move an entry from the provided source directory to the target
  * directory.  Change the entry name as indicated.
  *
  *	IN:	sdvp	- Source directory containing the "old entry".
  *		scnp	- Component name of the old entry.
  *		tdvp	- Target directory to contain the "new entry".
  *		tcnp	- Component name of the new entry.
  *		cr	- credentials of caller.
  *
  *	IN/OUT:	svpp	- Vnode of the old entry; may be replaced by
  *			  zfs_rename_relock().
  *		tvpp	- Vnode of an existing target entry, or NULL; may
  *			  be replaced or cleared by zfs_rename_relock().
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Locking:
  *	Every vnode still locked at the exit labels is unlocked before
  *	returning.  No vnode reference is released in this function.
  *
  * Timestamps:
  *	sdvp,tdvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
     cred_t *cr)
 {
 	zfsvfs_t	*zfsvfs;
 	znode_t		*sdzp, *tdzp, *szp, *tzp;
 	zilog_t		*zilog = NULL;
 	dmu_tx_t	*tx;
 	char		*snm = scnp->cn_nameptr;
 	char		*tnm = tcnp->cn_nameptr;
 	int		error = 0;
 
 	/* Reject renames across filesystems. */
 	if ((*svpp)->v_mount != tdvp->v_mount ||
 	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
 		error = SET_ERROR(EXDEV);
 		goto out;
 	}
 
 	if (zfsctl_is_node(tdvp)) {
 		error = SET_ERROR(EXDEV);
 		goto out;
 	}
 
 	/*
 	 * Lock all four vnodes to ensure safety and semantics of renaming.
 	 */
 	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
 	if (error != 0) {
 		/* no vnodes are locked in the case of error here */
 		return (error);
 	}
 
 	tdzp = VTOZ(tdvp);
 	sdzp = VTOZ(sdvp);
 	zfsvfs = tdzp->z_zfsvfs;
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * After we re-enter ZFS_ENTER() we will have to revalidate all
 	 * znodes involved.
 	 */
 	ZFS_ENTER(zfsvfs);
 
 	if (zfsvfs->z_utf8 && u8_validate(tnm,
 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		error = SET_ERROR(EILSEQ);
 		goto unlockout;
 	}
 
 	/* If source and target are the same file, there is nothing to do. */
 	if ((*svpp) == (*tvpp)) {
 		error = 0;
 		goto unlockout;
 	}
 
 	/* Neither the source nor an existing target may be a mount point. */
 	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
 	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
 	    (*tvpp)->v_mountedhere != NULL)) {
 		error = SET_ERROR(EXDEV);
 		goto unlockout;
 	}
 
 	/*
 	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
 	 * bypassing the cleanup code in the case of an error.
 	 */
 	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
 		error = SET_ERROR(EIO);
 		goto unlockout;
 	}
 
 	szp = VTOZ(*svpp);
 	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
 	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
 		error = SET_ERROR(EIO);
 		goto unlockout;
 	}
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
 		error = SET_ERROR(EINVAL);
 		goto unlockout;
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) != 0)
 		goto unlockout;
 
 	if ((*svpp)->v_type == VDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
 		    sdzp == szp ||
 		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
 			/* Use SET_ERROR() like every other error in here. */
 			error = SET_ERROR(EINVAL);
 			goto unlockout;
 		}
 
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
 		if ((error = zfs_rename_check(szp, sdzp, tdzp)) != 0)
 			goto unlockout;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		/*
 		 * Source and target must be the same type.
 		 */
 		if ((*svpp)->v_type == VDIR) {
 			if ((*tvpp)->v_type != VDIR) {
 				error = SET_ERROR(ENOTDIR);
 				goto unlockout;
 			} else {
 				cache_purge(tdvp);
 				if (sdvp != tdvp)
 					cache_purge(sdvp);
 			}
 		} else {
 			if ((*tvpp)->v_type == VDIR) {
 				error = SET_ERROR(EISDIR);
 				goto unlockout;
 			}
 		}
 	}
 
 	/*
 	 * NOTE(review): `ct' is not declared in this function; these calls
 	 * presumably only compile because the vnevent_*() macros discard
 	 * their arguments on this platform -- confirm.
 	 */
 	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
 	if (tzp)
 		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
 
 	/*
 	 * notify the target directory if it is not the same
 	 * as source directory.
 	 */
 	if (tdvp != sdvp) {
 		vnevent_rename_dest_dir(tdvp, ct);
 	}
 
 	/* Declare everything the transaction may touch before assigning it. */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp) {
 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tdzp);
 	}
 	if (tzp) {
 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tzp);
 	}
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		goto unlockout;
 	}
 
 
 	if (tzp)	/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
 
 	if (error == 0) {
 		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
 		if (error == 0) {
 			szp->z_pflags |= ZFS_AV_MODIFIED;
 
 			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 			ASSERT0(error);
 
 			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
 			    NULL);
 			if (error == 0) {
 				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
 				    snm, tdzp, tnm, szp);
 
 				/*
 				 * Update path information for the target vnode
 				 */
 				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
 			} else {
 				/*
 				 * At this point, we have successfully created
 				 * the target name, but have failed to remove
 				 * the source name.  Since the create was done
 				 * with the ZRENAMING flag, there are
 				 * complications; for one, the link count is
 				 * wrong.  The easiest way to deal with this
 				 * is to remove the newly created target, and
 				 * return the original error.  This must
 				 * succeed; fortunately, it is very unlikely to
 				 * fail, since we just created it.
 				 */
 				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
 				    ZRENAMING, NULL), ==, 0);
 			}
 		}
 		if (error == 0) {
 			cache_purge(*svpp);
 			if (*tvpp != NULL)
 				cache_purge(*tvpp);
 			cache_purge_negative(tdvp);
 		}
 	}
 
 	/* The transaction is committed even if a link operation failed. */
 	dmu_tx_commit(tx);
 
 unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
 	ZFS_EXIT(zfsvfs);
 	VOP_UNLOCK(*svpp, 0);
 	VOP_UNLOCK(sdvp, 0);
 
 out:				/* original two vnodes are locked */
 	/*
 	 * zfsvfs is still unset when the early EXDEV checks jump here; the
 	 * error == 0 test keeps it from being dereferenced in that case.
 	 */
 	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	if (*tvpp != NULL)
 		VOP_UNLOCK(*tvpp, 0);
 	if (tdvp != *tvpp)
 		VOP_UNLOCK(tdvp, 0);
 	return (error);
 }
 
 /*
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dvp	- Directory to contain new symbolic link.
  *		name	- Name of the new directory entry.
  *		vap	- Attributes of new entry; va_type must be VLNK.
  *		link	- Contents stored in the new symlink.
  *		cr	- credentials of caller.
  *		td	- calling thread (not referenced here).
  *
  *	OUT:	vpp	- vnode of the created symlink.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
     cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	len = strlen(link);
 	int		error;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype = TX_SYMLINK;
 
 	ASSERT(vap->va_type == VLNK);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (len > MAXPATHLEN) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	/* From here on every exit path must free acl_ids. */
 	if ((error = zfs_acl_ids_create(dzp, 0,
 	    vap, cr, NULL, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) != 0) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/* The vnode reservation is dropped again on every path below. */
 	getnewvnode_reserve(1);
 	tx = dmu_tx_create(zfsvfs->z_os);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE + len);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create a new object for the symlink.
 	 * for version 4 ZPL datasets the symlink will be an SA attribute
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * NOTE(review): a failure of this sa_update() is returned to the
 	 * caller, but the code below still links the new object into the
 	 * directory and sets *vpp -- confirm that this is intended.
 	 */
 	if (zp->z_is_sa)
 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 		    link, len, tx);
 	else
 		zfs_sa_symlink(zp, link, len, tx);
 
 	zp->z_size = len;
 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx);
 	/*
 	 * Insert the new object into the directory.
 	 */
 	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
 
 	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 	*vpp = ZTOV(zp);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Copy the path stored in the symbolic link vp into the buffer described
  * by the provided uio structure.
  *
  *	IN:	vp	- vnode of symbolic link.
  *		uio	- structure to contain the link path.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	uio	- structure containing the link path.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  */
 /* ARGSUSED */
 static int
 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	int		rc;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* SA znodes keep the target in an attribute; others use the helper. */
 	rc = zp->z_is_sa ?
 	    sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio) :
 	    zfs_sa_readlink(zp, uio);
 
 	/* The access time is stamped whether or not the read succeeded. */
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	ZFS_EXIT(zfsvfs);
 	return (rc);
 }
 
 /*
  * Create a hard link: add an entry called name to directory tdvp that
  * refers to the existing object svp.
  *
  *	IN:	tdvp	- Directory to contain new entry.
  *		svp	- vnode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- not referenced here.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	tdvp - ctime|mtime updated
  *	 svp - ctime updated
  */
 /* ARGSUSED */
 static int
 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	znode_t		*dir_zp = VTOZ(tdvp);
 	zfsvfs_t	*zfsvfs = dir_zp->z_zfsvfs;
 	znode_t		*src_zp, *existing;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	uint64_t	parent_obj;
 	uid_t		owner;
 	int		error;
 
 	ASSERT(tdvp->v_type == VDIR);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dir_zp);
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
 	 */
 	if (svp->v_type == VDIR) {
 		error = SET_ERROR(EPERM);
 		goto done;
 	}
 
 	src_zp = VTOZ(svp);
 	ZFS_VERIFY_ZP(src_zp);
 
 	if (src_zp->z_pflags &
 	    (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
 		error = SET_ERROR(EPERM);
 		goto done;
 	}
 
 	/* Prevent links to .zfs/shares files */
 	error = sa_lookup(src_zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent_obj, sizeof (parent_obj));
 	if (error != 0)
 		goto done;
 	if (parent_obj == zfsvfs->z_shares_dir) {
 		error = SET_ERROR(EPERM);
 		goto done;
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(name,
 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		error = SET_ERROR(EILSEQ);
 		goto done;
 	}
 
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
 	if ((src_zp->z_pflags & ZFS_XATTR) != (dir_zp->z_pflags & ZFS_XATTR)) {
 		error = SET_ERROR(EINVAL);
 		goto done;
 	}
 
 	/* A caller that is not the owner must pass the link policy check. */
 	owner = zfs_fuid_map_id(zfsvfs, src_zp->z_uid, cr, ZFS_OWNER);
 	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
 		error = SET_ERROR(EPERM);
 		goto done;
 	}
 
 	error = zfs_zaccess(dir_zp, ACE_ADD_FILE, 0, B_FALSE, cr);
 	if (error != 0)
 		goto done;
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lookup(dir_zp, name, &existing, ZNEW);
 	if (error != 0)
 		goto done;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, src_zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, dir_zp->z_id, TRUE, name);
 	zfs_sa_upgrade_txholds(tx, src_zp);
 	zfs_sa_upgrade_txholds(tx, dir_zp);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 		goto done;
 	}
 
 	error = zfs_link_create(dir_zp, name, src_zp, tx, 0);
 	if (error == 0)
 		zfs_log_link(zilog, tx, TX_LINK, dir_zp, src_zp, name);
 
 	/* The transaction is committed even when the link creation failed. */
 	dmu_tx_commit(tx);
 
 	if (error == 0)
 		vnevent_link(svp, ct);
 
 	/* Only a request that reached the transaction commits the log. */
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 done:
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 
 /*
  * Inactive handler for a ZFS vnode.  The vnode is recycled when its znode
  * has lost its SA handle or has been unlinked; otherwise a dirty access
  * time is written back.  z_teardown_inactive_lock is held as reader for
  * the duration of the checks and the update.
  */
 /*ARGSUSED*/
 void
 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 
 	/*
 	 * Recycle right away in two cases: the fs has been unmounted (or we
 	 * did a suspend/resume and this file no longer exists), or this is
 	 * the vnode of a removed file.
 	 */
 	if (zp->z_sa_hdl == NULL || zp->z_unlinked) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		vrecycle(vp);
 		return;
 	}
 
 	if (zp->z_atime_dirty && !zp->z_unlinked) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, zp);
 		if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
 			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
 			zp->z_atime_dirty = 0;
 			dmu_tx_commit(tx);
 		} else {
 			/* Could not assign: leave the atime dirty. */
 			dmu_tx_abort(tx);
 		}
 	}
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
 
 /*
  * Compile-time checks that neither ZFS file-id layout is larger than
  * struct fid; zfs_fid() fills both layouts in through a fid_t pointer.
  */
 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
 
 /*
  * Build a file identifier for vp in *fidp.  The object number and the
  * generation are stored least-significant byte first.  When this
  * filesystem is not its own z_parent, the long form is used, which also
  * carries the objset id and a zeroed objset generation.
  *
  *	RETURN:	0 on success, error code on failure.
  */
 /*ARGSUSED*/
 static int
 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	uint64_t	objnum = zp->z_id;
 	uint64_t	rawgen;
 	uint32_t	gen;
 	zfid_short_t	*sfid;
 	int		fidlen, idx, rc;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	rc = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
 	    &rawgen, sizeof (rawgen));
 	if (rc != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (rc);
 	}
 
 	/* Must have a non-zero generation number to distinguish from .zfs */
 	gen = (uint32_t)rawgen;
 	if (gen == 0)
 		gen = 1;
 
 	if (zfsvfs->z_parent != zfsvfs)
 		fidlen = LONG_FID_LEN;
 	else
 		fidlen = SHORT_FID_LEN;
 
 #ifdef illumos
 	if (fidp->fid_len < fidlen) {
 		fidp->fid_len = fidlen;
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENOSPC));
 	}
 #else
 	fidp->fid_len = fidlen;
 #endif
 
 	sfid = (zfid_short_t *)fidp;
 	sfid->zf_len = fidlen;
 
 	/* Emit one byte per step, shifting the value down as we go. */
 	for (idx = 0; idx < sizeof (sfid->zf_object); idx++, objnum >>= 8)
 		sfid->zf_object[idx] = (uint8_t)objnum;
 
 	for (idx = 0; idx < sizeof (sfid->zf_gen); idx++, gen >>= 8)
 		sfid->zf_gen[idx] = (uint8_t)gen;
 
 	if (fidlen == LONG_FID_LEN) {
 		zfid_long_t	*lfid = (zfid_long_t *)fidp;
 		uint64_t	setid = dmu_objset_id(zfsvfs->z_os);
 
 		for (idx = 0; idx < sizeof (lfid->zf_setid);
 		    idx++, setid >>= 8)
 			lfid->zf_setid[idx] = (uint8_t)setid;
 
 		/* XXX - this should be the generation number for the objset */
 		for (idx = 0; idx < sizeof (lfid->zf_setgen); idx++)
 			lfid->zf_setgen[idx] = 0;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Answer a pathconf query for vp.
  *
  * For every cmd handled here the value is stored in *valp and 0 is
  * returned.  Any other cmd yields EOPNOTSUPP and leaves *valp untouched.
  * Several queries are compiled only when "illumos" is defined; of those,
  * only _PC_XATTR_EXISTS can return an error of its own (from
  * zfs_dirent_lookup()).
  */
 static int
 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
     caller_context_t *ct)
 {
 	/* zp, xzp, zfsvfs and error are used only by the illumos cases. */
 	znode_t		*zp, *xzp;
 	zfsvfs_t	*zfsvfs;
 	int		error;
 
 	switch (cmd) {
 	case _PC_LINK_MAX:
 		*valp = MIN(LONG_MAX, ZFS_LINK_MAX);
 		return (0);
 
 	case _PC_FILESIZEBITS:
 		*valp = 64;
 		return (0);
 #ifdef illumos
 	case _PC_XATTR_EXISTS:
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 		*valp = 0;
 		error = zfs_dirent_lookup(zp, "", &xzp,
 		    ZXATTR | ZEXISTS | ZSHARED);
 		if (error == 0) {
 			/* Report 1 only if the xattr directory has entries. */
 			if (!zfs_dirempty(xzp))
 				*valp = 1;
 			vrele(ZTOV(xzp));
 		} else if (error == ENOENT) {
 			/*
 			 * If there aren't extended attributes, it's the
 			 * same as having zero of them.
 			 */
 			error = 0;
 		}
 		ZFS_EXIT(zfsvfs);
 		return (error);
 
 	case _PC_SATTR_ENABLED:
 	case _PC_SATTR_EXISTS:
 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
 		    (vp->v_type == VREG || vp->v_type == VDIR);
 		return (0);
 
 	case _PC_ACCESS_FILTERING:
 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
 		    vp->v_type == VDIR;
 		return (0);
 
 	case _PC_ACL_ENABLED:
 		*valp = _ACL_ACE_ENABLED;
 		return (0);
 #endif	/* illumos */
 	case _PC_MIN_HOLE_SIZE:
 		*valp = (int)SPA_MINBLOCKSIZE;
 		return (0);
 #ifdef illumos
 	case _PC_TIMESTAMP_RESOLUTION:
 		/* nanosecond timestamp resolution */
 		*valp = 1L;
 		return (0);
 #endif
 	case _PC_ACL_EXTENDED:
 		*valp = 0;
 		return (0);
 
 	case _PC_ACL_NFS4:
 		*valp = 1;
 		return (0);
 
 	case _PC_ACL_PATH_MAX:
 		*valp = ACL_MAX_ENTRIES;
 		return (0);
 
 	default:
 		/* Not handled here; the caller may have its own fallback. */
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * Read the ACL of vp into *vsecp by way of zfs_getacl().
  *
  * ATTR_NOACLCHECK in flag is forwarded to zfs_getacl() as its
  * skip-ACL-check argument.  Returns whatever zfs_getacl() returns.  The ct
  * argument is not used.
  */
 /*ARGSUSED*/
 static int
 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	boolean_t skipaclchk;
 	int rc;
 
 	skipaclchk = (flag & ATTR_NOACLCHECK) != 0 ? B_TRUE : B_FALSE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	rc = zfs_getacl(zp, vsecp, skipaclchk, cr);
 	ZFS_EXIT(zfsvfs);
 
 	return (rc);
 }
 
 /*
  * Replace the ACL of vp with *vsecp by way of zfs_setacl().
  *
  * ATTR_NOACLCHECK in flag is forwarded to zfs_setacl() as its
  * skip-ACL-check argument.  The intent log is committed before returning
  * when the dataset is configured with sync=always, whether or not
  * zfs_setacl() succeeded.  Returns whatever zfs_setacl() returns.  The ct
  * argument is not used.
  */
 /*ARGSUSED*/
 int
 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	zilog_t	*zilog = zfsvfs->z_log;
 	boolean_t skipaclchk;
 	int rc;
 
 	skipaclchk = (flag & ATTR_NOACLCHECK) != 0 ? B_TRUE : B_FALSE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	rc = zfs_setacl(zp, vsecp, skipaclchk, cr);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) {
 		zil_commit(zilog, 0);
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (rc);
 }
 
 /*
  * Read the pages ma[0 .. count-1] of vp from the DMU, optionally together
  * with read-behind and read-ahead pages.
  *
  * rbehind / rahead may be NULL.  When non-NULL, the value on entry caps how
  * many extra pages may be read before / after the requested run, and on
  * success it is overwritten with the count left in pgsin_b / pgsin_a after
  * dmu_read_pages().
  *
  * Returns zfs_vm_pagerret_bad if the last requested page starts at or
  * beyond the VM object's size, zfs_vm_pagerret_error if dmu_read_pages()
  * fails, and zfs_vm_pagerret_ok otherwise.
  */
 static int
 zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
     int *rahead)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	objset_t *os = zp->z_zfsvfs->z_os;
 	rl_t *rl;
 	vm_object_t object;
 	off_t start, end, obj_size;
 	uint_t blksz;
 	int pgsin_b, pgsin_a;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Byte range [start, end) spanned by the requested pages. */
 	start = IDX_TO_OFF(ma[0]->pindex);
 	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
 
 	/*
 	 * Lock a range covering all required and optional pages.
 	 * Note that we need to handle the case of the block size growing.
 	 */
 	for (;;) {
 		blksz = zp->z_blksz;
 		rl = zfs_range_lock(zp, rounddown(start, blksz),
 		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
 		/* Retry if z_blksz changed while the lock was being taken. */
 		if (blksz == zp->z_blksz)
 			break;
 		zfs_range_unlock(rl);
 	}
 
 	/* Sample the object size under the VM object lock. */
 	object = ma[0]->object;
 	zfs_vmobject_wlock(object);
 	obj_size = object->un_pager.vnp.vnp_size;
 	zfs_vmobject_wunlock(object);
 	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (zfs_vm_pagerret_bad);
 	}
 
 	/* Read-behind: limited to this block and to the caller's cap. */
 	pgsin_b = 0;
 	if (rbehind != NULL) {
 		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
 		pgsin_b = MIN(*rbehind, pgsin_b);
 	}
 
 	/*
 	 * Read-ahead: limited to this block, to the (page-rounded) object
 	 * size, and to the caller's cap.
 	 */
 	pgsin_a = 0;
 	if (rahead != NULL) {
 		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
 		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
 			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
 		pgsin_a = MIN(*rahead, pgsin_a);
 	}
 
 	/*
 	 * NB: we need to pass the exact byte size of the data that we expect
 	 * to read after accounting for the file size.  This is required because
 	 * ZFS will panic if we request DMU to read beyond the end of the last
 	 * allocated block.
 	 *
 	 * (end - PAGE_SIZE) is the offset of the last requested page, so the
 	 * final argument is the number of bytes wanted from that page.
 	 */
 	error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
 	    MIN(end, obj_size) - (end - PAGE_SIZE));
 
 	zfs_range_unlock(rl);
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 
 	if (error != 0)
 		return (zfs_vm_pagerret_error);
 
 	/* Account for the page-in and report the extra pages read. */
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
 	if (rbehind != NULL)
 		*rbehind = pgsin_b;
 	if (rahead != NULL)
 		*rahead = pgsin_a;
 	return (zfs_vm_pagerret_ok);
 }
 
 static int
 zfs_freebsd_getpages(ap)
 	struct vop_getpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int *a_rbehind;
 		int *a_rahead;
 	} */ *ap;
 {
 
 	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
 	    ap->a_rahead));
 }
 
 /*
  * Write the pages ma[] (len bytes, starting at the offset of ma[0]) back to
  * the file's DMU object.
  *
  * rtvals[] receives one pager status per page:
  *  - zfs_vm_pagerret_error is the initial value and is what remains if the
  *    owner is over quota, the transaction cannot be assigned, or
  *    dmu_write_pages() fails;
  *  - zfs_vm_pagerret_bad marks pages lying entirely beyond the VM object's
  *    current size;
  *  - zfs_vm_pagerret_ok marks pages that were written; those pages are
  *    also marked clean.
  *
  * The intent log is committed before returning if flags contains
  * zfs_vm_pagerput_sync or zfs_vm_pagerput_inval, or the dataset has
  * sync=always.
  *
  * Returns rtvals[0].
  */
 static int
 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     int *rtvals)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	rl_t		*rl;
 	dmu_tx_t	*tx;
 	struct sf_buf	*sf;
 	vm_object_t	object;
 	vm_page_t	m;
 	caddr_t		va;
 	size_t		tocopy;
 	size_t		lo_len;
 	size_t		wlen;
 	vm_ooffset_t	lo_off;
 	vm_ooffset_t	off;
 	vm_ooffset_t	woff;
 	uint_t		blksz;
 	int		ncount;
 	int		pcount;
 	int		err;
 	int		i;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	object = vp->v_object;
 	pcount = btoc(len);
 	ncount = pcount;
 
 	KASSERT(ma[0]->object == object, ("mismatching object"));
 	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
 
 	/* Assume failure until the pages have actually been written. */
 	for (i = 0; i < pcount; i++)
 		rtvals[i] = zfs_vm_pagerret_error;
 
 	/* Take a writer range lock over the block-aligned extent. */
 	off = IDX_TO_OFF(ma[0]->pindex);
 	blksz = zp->z_blksz;
 	lo_off = rounddown(off, blksz);
 	lo_len = roundup(len + (off - lo_off), blksz);
 	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
 
 	/* Trim the request to the current size of the VM object. */
 	zfs_vmobject_wlock(object);
 	if (len + off > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size > off) {
 			int pgoff;
 
 			len = object->un_pager.vnp.vnp_size - off;
 			ncount = btoc(len);
 			if ((pgoff = (int)len & PAGE_MASK) != 0) {
 				/*
 				 * If the object is locked and the following
 				 * conditions hold, then the page's dirty
 				 * field cannot be concurrently changed by a
 				 * pmap operation.
 				 */
 				m = ma[ncount - 1];
 				vm_page_assert_sbusied(m);
 				KASSERT(!pmap_page_is_write_mapped(m),
 				    ("zfs_putpages: page %p is not read-only", m));
 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
 				    pgoff);
 			}
 		} else {
 			/* The whole request lies beyond the object size. */
 			len = 0;
 			ncount = 0;
 		}
 		if (ncount < pcount) {
 			for (i = ncount; i < pcount; i++) {
 				rtvals[i] = zfs_vm_pagerret_bad;
 			}
 		}
 	}
 	zfs_vmobject_wunlock(object);
 
 	if (ncount == 0)
 		goto out;
 
 	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 		goto out;
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, off, len);
 
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	if (zp->z_blksz < PAGE_SIZE) {
 		/*
 		 * Copy page by page using private cursors.  off and len must
 		 * keep describing the whole write, because zfs_log_write()
 		 * below records that range in the intent log.
 		 */
 		woff = off;
 		wlen = len;
 		for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
 			tocopy = wlen > PAGE_SIZE ? PAGE_SIZE : wlen;
 			va = zfs_map_page(ma[i], &sf);
 			dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va,
 			    tx);
 			zfs_unmap_page(sf);
 		}
 	} else {
 		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
 	}
 
 	if (err == 0) {
 		uint64_t mtime[2], ctime[2];
 		sa_bulk_attr_t bulk[3];
 		int count = 0;
 
 		/* Update mtime, ctime and the flags word in one SA call. */
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    &mtime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 		    &zp->z_pflags, 8);
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		ASSERT0(err);
 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
 
 		/* The data is in the DMU: report success, clean the pages. */
 		zfs_vmobject_wlock(object);
 		for (i = 0; i < ncount; i++) {
 			rtvals[i] = zfs_vm_pagerret_ok;
 			vm_page_undirty(ma[i]);
 		}
 		zfs_vmobject_wunlock(object);
 		VM_CNT_INC(v_vnodeout);
 		VM_CNT_ADD(v_vnodepgsout, ncount);
 	}
 	dmu_tx_commit(tx);
 
 out:
 	zfs_range_unlock(rl);
 	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zfsvfs->z_log, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 	return (rtvals[0]);
 }
 
 int
 zfs_freebsd_putpages(ap)
 	struct vop_putpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int a_sync;
 		int *a_rtvals;
 	} */ *ap;
 {
 
 	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
 	    ap->a_rtvals));
 }
 
 /*
  * Trivial block mapping: every output pointer that is non-NULL is filled
  * in, reporting the vnode's own bufobj, the logical block number unchanged,
  * and zero-length runs in both directions.  Always returns 0.
  */
 static int
 zfs_freebsd_bmap(ap)
 	struct vop_bmap_args /* {
 		struct vnode *a_vp;
 		daddr_t  a_bn;
 		struct bufobj **a_bop;
 		daddr_t *a_bnp;
 		int *a_runp;
 		int *a_runb;
 	} */ *ap;
 {
 
 	if (ap->a_bop != NULL) {
 		*ap->a_bop = &ap->a_vp->v_bufobj;
 	}
 	if (ap->a_bnp != NULL) {
 		/* Identity mapping of the block number. */
 		*ap->a_bnp = ap->a_bn;
 	}
 	if (ap->a_runp != NULL) {
 		*ap->a_runp = 0;
 	}
 	if (ap->a_runb != NULL) {
 		*ap->a_runb = 0;
 	}
 
 	return (0);
 }
 
 /*
  * Open adapter: call zfs_open() and, if it succeeds, create the VM object
  * for the vnode sized to the znode's z_size.  Returns the zfs_open()
  * result.
  */
 static int
 zfs_freebsd_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	int rc;
 
 	rc = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
 	if (rc != 0)
 		return (rc);
 
 	vnode_create_vobject(vp, zp->z_size, ap->a_td);
 	return (0);
 }
 
 /*
  * Close adapter: forward to zfs_close() with a count of 1, an offset of 0
  * and no caller context, returning its result unchanged.
  */
 static int
 zfs_freebsd_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	int fflag = ap->a_fflag;
 
 	return (zfs_close(vp, fflag, 1, 0, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int a_fflag;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 
 	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
 	    ap->a_fflag, ap->a_cred, NULL, NULL));
 }
 
 /*
  * Translate IO_* flag bits into the corresponding F* flag bits:
  * IO_APPEND -> FAPPEND, IO_NDELAY -> FNONBLOCK, and
  * IO_SYNC -> FSYNC | FDSYNC | FRSYNC.  All other bits are dropped.
  */
 static int
 ioflags(int ioflags)
 {
 	int fflags = 0;
 
 	fflags |= (ioflags & IO_APPEND) ? FAPPEND : 0;
 	fflags |= (ioflags & IO_NDELAY) ? FNONBLOCK : 0;
 	fflags |= (ioflags & IO_SYNC) ? (FSYNC | FDSYNC | FRSYNC) : 0;
 
 	return (fflags);
 }
 
 /*
  * Read adapter: convert a_ioflag with ioflags() and forward to zfs_read(),
  * returning its result unchanged.
  */
 static int
 zfs_freebsd_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	int fflags = ioflags(ap->a_ioflag);
 
 	return (zfs_read(ap->a_vp, ap->a_uio, fflags, ap->a_cred, NULL));
 }
 
 /*
  * Write adapter: convert a_ioflag with ioflags() and forward to
  * zfs_write(), returning its result unchanged.
  */
 static int
 zfs_freebsd_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	int fflags = ioflags(ap->a_ioflag);
 
 	return (zfs_write(ap->a_vp, ap->a_uio, fflags, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	accmode_t accmode;
 	int error = 0;
 
 	/*
 	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
 	 */
 	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
 	if (accmode != 0)
 		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
 
 	/*
 	 * VADMIN has to be handled by vaccess().
 	 */
 	if (error == 0) {
 		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
 		if (accmode != 0) {
 			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
 			    zp->z_gid, accmode, ap->a_cred, NULL);
 		}
 	}
 
 	/*
 	 * For VEXEC, ensure that at least one execute bit is set for
 	 * non-directories.
 	 */
 	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
 	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
 		error = EACCES;
 	}
 
 	return (error);
 }
 
 static int
 zfs_freebsd_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	char nm[NAME_MAX + 1];
 
 	ASSERT(cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
 
 	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
 	    cnp->cn_cred, cnp->cn_thread, 0));
 }
 
 /*
  * Lookup entry point that honours the per-filesystem z_use_namecache
  * setting: go through vfs_cache_lookup() when it is set, otherwise call
  * zfs_freebsd_lookup() directly.
  */
 static int
 zfs_cache_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	zfsvfs_t *zfsvfs = ap->a_dvp->v_mount->mnt_data;
 
 	if (!zfsvfs->z_use_namecache)
 		return (zfs_freebsd_lookup(ap));
 
 	return (vfs_cache_lookup(ap));
 }
 
 /*
  * Create adapter: initialise the vattr mask, call zfs_create() in
  * non-exclusive mode, and on success enter the new vnode into the name
  * cache when the filesystem uses it and the lookup asked for MAKEENTRY.
  * Returns the zfs_create() result.
  */
 static int
 zfs_freebsd_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs;
 	int mode;
 	int rc;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 	mode = vap->va_mode & ALLPERMS;
 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
 
 	rc = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
 	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
 
 	if (zfsvfs->z_use_namecache && rc == 0 &&
 	    (cnp->cn_flags & MAKEENTRY) != 0) {
 		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
 	}
 	return (rc);
 }
 
 static int
 zfs_freebsd_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
 	    ap->a_cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 
 	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
 	    ap->a_cnp->cn_cred));
 }
 
 /*
  * Rmdir adapter: forward the parent directory, target vnode, component
  * name and credential to zfs_rmdir(), returning its result unchanged.
  */
 static int
 zfs_freebsd_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_rmdir(dvp, vp, cnp->cn_nameptr, cnp->cn_cred));
 }
 
 /*
  * Readdir adapter: pass every field of struct vop_readdir_args through to
  * zfs_readdir(), returning its result unchanged.
  */
 static int
 zfs_freebsd_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 
 	return (zfs_readdir(vp, uio, ap->a_cred, ap->a_eofflag,
 	    ap->a_ncookies, ap->a_cookies));
 }
 
 /*
  * Fsync adapter: run vop_stdfsync() first (its result is not examined),
  * then return the result of zfs_fsync() called with the thread's
  * credential.
  */
 static int
 zfs_freebsd_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	int rc;
 
 	vop_stdfsync(ap);
 	rc = zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL);
 	return (rc);
 }
 
 /*
  * Getattr adapter.
  *
  * Wraps the caller's vattr in an xvattr_t, requests the optional ZFS
  * attributes listed below, and calls zfs_getattr().  On success the
  * optional attributes that were both returned and non-zero are translated
  * into chflags bits, the base vattr is copied back to *a_vap, and va_flags
  * is set to the translated bits.
  *
  * Returns 0, or the error from zfs_getattr() (in which case *a_vap is left
  * untouched).
  */
 static int
 zfs_freebsd_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 	xvattr_t xvap;
 	u_long fflags = 0;
 	int error;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 	xvap.xva_vattr.va_mask |= AT_XVATTR;
 
 	/* Convert chflags into ZFS-type flags. */
 	/* XXX: what about SF_SETTABLE?. */
 	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
 	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
 	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
 	XVA_SET_REQ(&xvap, XAT_NODUMP);
 	XVA_SET_REQ(&xvap, XAT_READONLY);
 	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
 	XVA_SET_REQ(&xvap, XAT_SYSTEM);
 	XVA_SET_REQ(&xvap, XAT_HIDDEN);
 	XVA_SET_REQ(&xvap, XAT_REPARSE);
 	XVA_SET_REQ(&xvap, XAT_OFFLINE);
 	XVA_SET_REQ(&xvap, XAT_SPARSE);
 
 	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
 	if (error != 0)
 		return (error);
 
 	/* Convert ZFS xattr into chflags. */
 	/* FLAG_CHECK: set fflag if xflag was returned and xfield is non-zero. */
 #define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
 	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
 		fflags |= (fflag);					\
 } while (0)
 	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
 	    xvap.xva_xoptattrs.xoa_immutable);
 	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
 	    xvap.xva_xoptattrs.xoa_appendonly);
 	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
 	    xvap.xva_xoptattrs.xoa_nounlink);
 	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
 	    xvap.xva_xoptattrs.xoa_archive);
 	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
 	    xvap.xva_xoptattrs.xoa_nodump);
 	FLAG_CHECK(UF_READONLY, XAT_READONLY,
 	    xvap.xva_xoptattrs.xoa_readonly);
 	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
 	    xvap.xva_xoptattrs.xoa_system);
 	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
 	    xvap.xva_xoptattrs.xoa_hidden);
 	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
 	    xvap.xva_xoptattrs.xoa_reparse);
 	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
 	    xvap.xva_xoptattrs.xoa_offline);
 	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
 	    xvap.xva_xoptattrs.xoa_sparse);
 
 #undef	FLAG_CHECK
 	/* Hand the base attributes back, then overlay the chflags bits. */
 	*vap = xvap.xva_vattr;
 	vap->va_flags = fflags;
 	return (0);
 }
 
 /*
  * Setattr adapter.
  *
  * Wraps the caller's vattr in an xvattr_t and calls zfs_setattr().  When
  * va_flags is being set, the chflags bits are first validated and permission
  * checked, and every flag whose requested state differs from the znode's
  * current z_pflags is turned into an optional-attribute request.  A
  * birthtime change is requested through XAT_CREATETIME.
  *
  * Returns EOPNOTSUPP if flags are being set on a filesystem without FUID
  * support or an unsupported flag bit is present, EPERM or another error from
  * the permission checks, and otherwise the result of zfs_setattr().
  */
 static int
 zfs_freebsd_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	vattr_t *vap = ap->a_vap;
 	cred_t *cred = ap->a_cred;
 	xvattr_t xvap;
 	u_long fflags;
 	uint64_t zflags;
 
 	vattr_init_mask(vap);
 	vap->va_mask &= ~AT_NOSET;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 
 	/* Current on-disk flag word, used to detect which flags change. */
 	zflags = VTOZ(vp)->z_pflags;
 
 	if (vap->va_flags != VNOVAL) {
 		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
 		int error;
 
 		if (zfsvfs->z_use_fuids == B_FALSE)
 			return (EOPNOTSUPP);
 
 		fflags = vap->va_flags;
 		/*
 		 * XXX KDM
 		 * We need to figure out whether it makes sense to allow
 		 * UF_REPARSE through, since we don't really have other
 		 * facilities to handle reparse points and zfs_setattr()
 		 * doesn't currently allow setting that attribute anyway.
 		 */
 		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
 		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
 		     UF_OFFLINE|UF_SPARSE)) != 0)
 			return (EOPNOTSUPP);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
 		 * otherwise, they behave like unprivileged processes.
 		 */
 		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
 		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
 				error = securelevel_gt(cred, 0);
 				if (error != 0)
 					return (error);
 			}
 		} else {
 			/*
 			 * Callers may only modify the file flags on objects they
 			 * have VADMIN rights for.
 			 */
 			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
 				return (error);
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
 				return (EPERM);
 			}
 			if (fflags &
 			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
 				return (EPERM);
 			}
 		}
 
 		/*
 		 * FLAG_CHANGE: if the requested state of fflag differs from
 		 * the current state of zflag, request xflag and store the new
 		 * boolean state in xfield.
 		 */
 #define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
 	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
 	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
 		XVA_SET_REQ(&xvap, (xflag));				\
 		(xfield) = ((fflags & (fflag)) != 0);			\
 	}								\
 } while (0)
 		/* Convert chflags into ZFS-type flags. */
 		/* XXX: what about SF_SETTABLE?. */
 		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
 		    xvap.xva_xoptattrs.xoa_immutable);
 		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
 		    xvap.xva_xoptattrs.xoa_appendonly);
 		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
 		    xvap.xva_xoptattrs.xoa_nounlink);
 		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
 		    xvap.xva_xoptattrs.xoa_archive);
 		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
 		    xvap.xva_xoptattrs.xoa_nodump);
 		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
 		    xvap.xva_xoptattrs.xoa_readonly);
 		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
 		    xvap.xva_xoptattrs.xoa_system);
 		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
 		    xvap.xva_xoptattrs.xoa_hidden);
 		/*
 		 * The reparse state belongs in xoa_reparse; storing it in
 		 * xoa_hidden would both lose it and overwrite the hidden
 		 * value computed just above.
 		 */
 		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
 		    xvap.xva_xoptattrs.xoa_reparse);
 		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
 		    xvap.xva_xoptattrs.xoa_offline);
 		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
 		    xvap.xva_xoptattrs.xoa_sparse);
 #undef	FLAG_CHANGE
 	}
 	if (vap->va_birthtime.tv_sec != VNOVAL) {
 		xvap.xva_vattr.va_mask |= AT_XVATTR;
 		XVA_SET_REQ(&xvap, XAT_CREATETIME);
 	}
 	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
 }
 
 static int
 zfs_freebsd_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	vnode_t *fdvp = ap->a_fdvp;
 	vnode_t *fvp = ap->a_fvp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
 	int error;
 
 	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
 	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 
 	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
 	    ap->a_tcnp, ap->a_fcnp->cn_cred);
 
 	vrele(fdvp);
 	vrele(fvp);
 	vrele(tdvp);
 	if (tvp != NULL)
 		vrele(tvp);
 
 	return (error);
 }
 
 /*
  * Symlink adapter: force the vattr type to VLNK, initialise the vattr mask
  * and forward to zfs_symlink(), returning its result unchanged.
  */
 static int
 zfs_freebsd_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 	int rc;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	/* FreeBSD: Syscall only sets va_mode. */
 	vap->va_type = VLNK;
 	vattr_init_mask(vap);
 
 	rc = zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
 	    ap->a_target, cnp->cn_cred, cnp->cn_thread);
 	return (rc);
 }
 
 /*
  * Readlink adapter: forward to zfs_readlink() with no caller context,
  * returning its result unchanged.
  */
 static int
 zfs_freebsd_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 
 	return (zfs_readlink(vp, uio, ap->a_cred, NULL));
 }
 
 /*
  * Link adapter: refuse with EXDEV when the target directory and the source
  * vnode are on different mounts, otherwise forward to zfs_link() and return
  * its result.
  */
 static int
 zfs_freebsd_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vnode_t *srcvp = ap->a_vp;
 	vnode_t *dirvp = ap->a_tdvp;
 
 	if (dirvp->v_mount != srcvp->v_mount) {
 		return (EXDEV);
 	}
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_link(dirvp, srcvp, cnp->cn_nameptr, cnp->cn_cred, NULL,
 	    0));
 }
 
 /*
  * Inactive adapter: run zfs_inactive() with the thread's credential.
  * Always returns 0.
  */
 static int
 zfs_freebsd_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	zfs_inactive(ap->a_vp, ap->a_td->td_ucred, NULL);
 	return (0);
 }
 
 /*
  * Reclaim adapter: destroy the vnode's VM object, dispose of the znode
  * (freed outright if it has no SA handle, otherwise handed to
  * zfs_zinactive()), and clear the vnode's v_data pointer.  Always
  * returns 0.
  */
 static int
 zfs_freebsd_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	/*
 	 * NOTE(review): zp has already been dereferenced by the zfsvfs
 	 * initializer above, so this assertion cannot catch a NULL zp.
 	 */
 	ASSERT(zp != NULL);
 
 	/* Destroy the vm object and flush associated pages. */
 	vnode_destroy_vobject(vp);
 
 	/*
 	 * z_teardown_inactive_lock protects from a race with
 	 * zfs_znode_dmu_fini in zfsvfs_teardown during
 	 * force unmount.
 	 */
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 	if (zp->z_sa_hdl == NULL)
 		zfs_znode_free(zp);
 	else
 		zfs_zinactive(zp);
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 
 	/* The vnode no longer refers to the znode. */
 	vp->v_data = NULL;
 	return (0);
 }
 
 /*
  * Fid adapter: forward to zfs_fid() with no caller context, returning its
  * result unchanged.
  */
 static int
 zfs_freebsd_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	return (zfs_fid(vp, (void *)ap->a_fid, NULL));
 }
 
 /*
  * Pathconf adapter.
  *
  * zfs_pathconf() is asked first.  Its answer is used if it succeeds, and
  * any failure other than EOPNOTSUPP is returned as is.  On EOPNOTSUPP the
  * query is answered here for _PC_NAME_MAX and _PC_PIPE_BUF, and handed to
  * vop_stdpathconf() for everything else.
  */
 static int
 zfs_freebsd_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 	ulong_t val;
 	int error;
 
 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
 	switch (error) {
 	case 0:
 		*ap->a_retval = val;
 		return (0);
 	case EOPNOTSUPP:
 		/* Not known to zfs_pathconf(); try the fallbacks below. */
 		break;
 	default:
 		return (error);
 	}
 
 	if (ap->a_name == _PC_NAME_MAX) {
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	}
 	if (ap->a_name == _PC_PIPE_BUF) {
 		/* Only answered for directories and FIFOs. */
 		if (ap->a_vp->v_type != VDIR && ap->a_vp->v_type != VFIFO)
 			return (EINVAL);
 		*ap->a_retval = PIPE_BUF;
 		return (0);
 	}
 	return (vop_stdpathconf(ap));
 }
 
 /*
  * Build the ZFS name of a FreeBSD extended attribute.
  *
  * FreeBSD's extended attribute namespace selects the prefix that is put in
  * front of the attribute name:
  *
  *	NAMESPACE	PREFIX
  *	system		freebsd:system:
  *	user		(none, can be used to access ZFS fsattr(5) attributes
  *			created on Solaris)
  *
  * The result is written to attrname, a buffer of size bytes that is zeroed
  * first.  Returns 0 on success, EINVAL if name contains '/' or starts with
  * "freebsd:" or the namespace is not user or system, and ENAMETOOLONG if
  * the result does not fit.
  */
 static int
 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
     size_t size)
 {
 	const char *namespace, *prefix, *suffix;
 
 	/*
 	 * Neither a '/' character nor a leading "freebsd:" string is allowed
 	 * in an attribute name.
 	 */
 	if (strchr(name, '/') != NULL || strncmp(name, "freebsd:", 8) == 0)
 		return (EINVAL);
 
 	bzero(attrname, size);
 
 	if (attrnamespace == EXTATTR_NAMESPACE_USER) {
 #if 0
 		prefix = "freebsd:";
 		namespace = EXTATTR_NAMESPACE_USER_STRING;
 		suffix = ":";
 #else
 		/*
 		 * This is the default namespace by which we can access all
 		 * attributes created on Solaris.
 		 */
 		prefix = namespace = suffix = "";
 #endif
 	} else if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM) {
 		prefix = "freebsd:";
 		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
 		suffix = ":";
 	} else {
 		/* EXTATTR_NAMESPACE_EMPTY and anything unrecognized. */
 		return (EINVAL);
 	}
 
 	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
 	    name) >= size)
 		return (ENAMETOOLONG);
 
 	return (0);
 }
 
 /*
  * Vnode operation to retrieve a named extended attribute.
  *
  * The attribute is stored as a file inside the vnode's extended attribute
  * directory; this routine opens that file for reading and then either
  * reports its size or reads its contents:
  *  - if a_size is non-NULL, the file size is stored in *a_size and a_uio is
  *    not consulted;
  *  - otherwise, if a_uio is non-NULL, the contents are read into it.
  *
  * Returns 0 on success.  A missing attribute file is reported as ENOATTR;
  * other errors come from the credential check, name construction, lookup,
  * open, getattr or read steps.
  */
 static int
 zfs_getextattr(struct vop_getextattr_args *ap)
 /*
 vop_getextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	/* The caller needs read access to this attribute namespace. */
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Find the extended attribute directory of the vnode. */
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Open the attribute file, looked up relative to that directory. */
 	flags = FREAD;
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		/* No such file means no such attribute. */
 		if (error == ENOENT)
 			error = ENOATTR;
 		return (error);
 	}
 
 	if (ap->a_size != NULL) {
 		error = VOP_GETATTR(vp, &va, ap->a_cred);
 		if (error == 0)
 			*ap->a_size = (size_t)va.va_size;
 	} else if (ap->a_uio != NULL)
 		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, flags, ap->a_cred, td);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to remove a named attribute.
  *
  * The attribute is stored as a file inside the vnode's extended attribute
  * directory; this routine looks that file up and removes it with
  * VOP_REMOVE().
  *
  * Returns 0 on success.  A missing attribute file is reported as ENOATTR;
  * other errors come from the credential check, name construction, lookup
  * or remove steps.
  */
 int
 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
 /*
 vop_deleteextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	vnode_t *xvp = NULL, *vp;
 	int error;
 
 	/* The caller needs write access to this attribute namespace. */
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Find the extended attribute directory of the vnode. */
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Look the attribute file up for deletion, locking parent and leaf. */
 	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
 	    UIO_SYSSPACE, attrname, xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		/* No such file means no such attribute. */
 		if (error == ENOENT)
 			error = ENOATTR;
 		return (error);
 	}
 
 	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	/* Release the parent, then the leaf (once only if they coincide). */
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to set a named attribute.
  */
 static int
 zfs_setextattr(struct vop_setextattr_args *ap)
 /*
 vop_setextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR | CREATE_XATTR_DIR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	flags = FFLAGS(O_WRONLY | O_CREAT);
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	VATTR_NULL(&va);
 	va.va_size = 0;
 	error = VOP_SETATTR(vp, &va, ap->a_cred);
 	if (error == 0)
 		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, flags, ap->a_cred, td);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve extended attributes on a vnode.
  *
  * Lists the names found in the extended attribute directory of a_vp
  * that belong to the requested namespace.  When a_size is non-NULL only
  * the total size of the listing is accumulated there; otherwise, when
  * a_uio is non-NULL, each name is copied out as one length byte followed
  * by the name with the namespace prefix stripped.
  *
  * A missing attribute directory (ENOATTR from the lookup) is reported
  * as success with an empty listing.
  */
 static int
 zfs_listextattr(struct vop_listextattr_args *ap)
 /*
 vop_listextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrprefix[16];
 	/* Scratch buffer for VOP_READDIR(); sized for one struct dirent. */
 	u_char dirbuf[sizeof(struct dirent)];
 	struct dirent *dp;
 	struct iovec aiov;
 	struct uio auio, *uio = ap->a_uio;
 	size_t *sizep = ap->a_size;
 	size_t plen;
 	vnode_t *xvp = NULL, *vp;
 	int done, error, eof, pos;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (error);
 
 	/* An empty attribute name yields just the namespace prefix. */
 	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
 	    sizeof(attrprefix));
 	if (error != 0)
 		return (error);
 	plen = strlen(attrprefix);
 
 	ZFS_ENTER(zfsvfs);
 
 	if (sizep != NULL)
 		*sizep = 0;
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		/*
 		 * ENOATTR means that the EA directory does not yet exist,
 		 * i.e. there are no extended attributes there.
 		 */
 		if (error == ENOATTR)
 			error = 0;
 		return (error);
 	}
 
 	/* Look up "." relative to xvp to get the directory share-locked. */
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
 	    UIO_SYSSPACE, ".", xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Kernel-space uio used to read the directory into dirbuf. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_rw = UIO_READ;
 	auio.uio_offset = 0;
 
 	do {
 		u_char nlen;
 
 		/* Reset the iovec each pass; uio_offset carries on. */
 		aiov.iov_base = (void *)dirbuf;
 		aiov.iov_len = sizeof(dirbuf);
 		auio.uio_resid = sizeof(dirbuf);
 		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
 		done = sizeof(dirbuf) - auio.uio_resid;
 		if (error != 0)
 			break;
 		/* Walk every record returned by this VOP_READDIR() call. */
 		for (pos = 0; pos < done;) {
 			dp = (struct dirent *)(dirbuf + pos);
 			pos += dp->d_reclen;
 			/*
 			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
 			 * is what we get when attribute was created on Solaris.
 			 */
 			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
 				continue;
 			/*
 			 * With an empty prefix, skip names starting with
 			 * "freebsd:"; otherwise keep only names that start
 			 * with the requested prefix.
 			 */
 			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
 				continue;
 			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
 				continue;
 			nlen = dp->d_namlen - plen;
 			if (sizep != NULL)
 				*sizep += 1 + nlen;
 			else if (uio != NULL) {
 				/*
 				 * Format of extattr name entry is one byte for
 				 * length and the rest for name.
 				 */
 				error = uiomove(&nlen, 1, uio->uio_rw, uio);
 				if (error == 0) {
 					error = uiomove(dp->d_name + plen, nlen,
 					    uio->uio_rw, uio);
 				}
 				/* Leaves the for loop; the while test ends the scan. */
 				if (error != 0)
 					break;
 			}
 		}
 	} while (!eof && error == 0);
 
 	vput(vp);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Fetch the NFSv4 ACL of a vnode.
  *
  * Only ACL_TYPE_NFS4 is accepted (EINVAL otherwise).  The ACE list is
  * obtained with zfs_getsecattr(), converted into ap->a_aclp by
  * acl_from_aces(), and the ACE buffer is released before returning.
  */
 int
 zfs_freebsd_getacl(ap)
 	struct vop_getacl_args /* {
 		struct vnode *vp;
 		acl_type_t type;
 		struct acl *aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	vsecattr_t	sa;
 	int		rc;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	/* Request both the ACE array and its entry count. */
 	sa.vsa_mask = VSA_ACE | VSA_ACECNT;
 	rc = zfs_getsecattr(ap->a_vp, &sa, 0, ap->a_cred, NULL);
 	if (rc != 0)
 		return (rc);
 
 	rc = acl_from_aces(ap->a_aclp, sa.vsa_aclentp, sa.vsa_aclcnt);
 
 	/* The ACE buffer is freed whether or not the conversion worked. */
 	if (sa.vsa_aclentp != NULL)
 		kmem_free(sa.vsa_aclentp, sa.vsa_aclentsz);
 
 	return (rc);
 }
 
 int
 zfs_freebsd_setacl(ap)
 	struct vop_setacl_args /* {
 		struct vnode *vp;
 		acl_type_t type;
 		struct acl *aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	int		error;
 	vsecattr_t      vsecattr;
 	int		aclbsize;	/* size of acl list in bytes */
 	aclent_t	*aaclp;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	if (ap->a_aclp == NULL)
 		return (EINVAL);
 
 	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
 		return (EINVAL);
 
 	/*
 	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
 	 * splitting every entry into two and appending "canonical six"
 	 * entries at the end.  Don't allow for setting an ACL that would
 	 * cause chmod(2) to run out of ACL entries.
 	 */
 	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
 		return (ENOSPC);
 
 	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
 	if (error != 0)
 		return (error);
 
 	vsecattr.vsa_mask = VSA_ACE;
 	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
 	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
 	aaclp = vsecattr.vsa_aclentp;
 	vsecattr.vsa_aclentsz = aclbsize;
 
 	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
 	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
 	kmem_free(aaclp, aclbsize);
 
 	return (error);
 }
 
 /*
  * ACL check vnode operation.  Not implemented: always returns
  * EOPNOTSUPP without looking at its arguments.
  */
 int
 zfs_freebsd_aclcheck(ap)
 	struct vop_aclcheck_args /* {
 		struct vnode *vp;
 		acl_type_t type;
 		struct acl *aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Translate a vnode into its parent vnode and name component.
  *
  * For most vnodes the parent and name come from
  * zfs_znode_parent_and_name(); the name is copied to the tail of a_buf
  * and *a_buflen is reduced by its length (ENOMEM if it does not fit).
  * For the root of a filesystem whose z_parent is a different zfsvfs,
  * the request is forwarded to the vnode covered by the mount.
  */
 static int
 zfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	vnode_t *covered_vp;
 	vnode_t *vp = ap->a_vp;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	znode_t *zp = VTOZ(vp);
 	int ltype;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/*
 	 * Unless we are the root of a snapshot mounted under .zfs (root
 	 * znode of a filesystem that is not its own parent), resolve the
 	 * name here.  The snapshot-root case is handled further below by
 	 * running the operation on the covered vnode.
 	 */
 	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
 		char name[MAXNAMLEN + 1];
 		znode_t *dzp;
 		size_t len;
 
 		error = zfs_znode_parent_and_name(zp, &dzp, name);
 		if (error == 0) {
 			len = strlen(name);
 			/*
 			 * NOTE(review): dzp is not released on this ENOMEM
 			 * path -- confirm whether
 			 * zfs_znode_parent_and_name() returns it held.
 			 */
 			if (*ap->a_buflen < len)
 				error = SET_ERROR(ENOMEM);
 		}
 		if (error == 0) {
 			/* The name is placed at the end of the free space. */
 			*ap->a_buflen -= len;
 			bcopy(name, ap->a_buf + *ap->a_buflen, len);
 			*ap->a_vpp = ZTOV(dzp);
 		}
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	ZFS_EXIT(zfsvfs);
 
 	/*
 	 * Hold the covered vnode, drop our own lock while the covered
 	 * vnode is locked and queried, then retake our lock in its
 	 * original mode.
 	 */
 	covered_vp = vp->v_mount->mnt_vnodecovered;
 	vhold(covered_vp);
 	ltype = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 	error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
 	if (error == 0) {
 		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
 		    ap->a_buf, ap->a_buflen);
 		vput(covered_vp);
 	}
 	vn_lock(vp, ltype | LK_RETRY);
 	/* The vnode may have been doomed while it was unlocked. */
 	if ((vp->v_iflag & VI_DOOMED) != 0)
 		error = SET_ERROR(ENOENT);
 	return (error);
 }
 
 #ifdef DIAGNOSTIC
 /*
  * Diagnostic wrapper around vop_stdlock().
  *
  * After a successful lock taken without LK_NOWAIT, and only for a
  * mounted, non-doomed vnode that has a znode not flagged ZFS_XATTR,
  * verify that the filesystem's z_teardown_lock is not held.
  * Always returns the result of vop_stdlock().
  */
 static int
 zfs_lock(ap)
 	struct vop_lock1_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 		char *file;
 		int line;
 	} */ *ap;
 {
 	vnode_t *vp;
 	znode_t *zp;
 	int err;
 
 	err = vop_stdlock(ap);
 	if (err != 0 || (ap->a_flags & LK_NOWAIT) != 0)
 		return (err);
 
 	vp = ap->a_vp;
 	zp = vp->v_data;
 
 	/* Skip the check for unmounted or doomed vnodes. */
 	if (vp->v_mount == NULL || (vp->v_iflag & VI_DOOMED) != 0)
 		return (err);
 	/* Skip it as well without a znode or for ZFS_XATTR znodes. */
 	if (zp == NULL || (zp->z_pflags & ZFS_XATTR) != 0)
 		return (err);
 
 	VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
 	return (err);
 }
 #endif
 
 /* Tentative declarations of the three ZFS vnode operation vectors. */
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
 struct vop_vector zfs_shareops;
 
 /*
  * Main ZFS vnode operations vector.  Operations not listed here fall
  * back to default_vnodeops.
  */
 struct vop_vector zfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_access =		zfs_freebsd_access,
 	.vop_allocate =		VOP_EINVAL,
 	.vop_lookup =		zfs_cache_lookup,
 	.vop_cachedlookup =	zfs_freebsd_lookup,
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_create =		zfs_freebsd_create,
 	/* mknod shares the create implementation. */
 	.vop_mknod =		zfs_freebsd_create,
 	.vop_mkdir =		zfs_freebsd_mkdir,
 	.vop_readdir =		zfs_freebsd_readdir,
 	.vop_fsync =		zfs_freebsd_fsync,
 	.vop_open =		zfs_freebsd_open,
 	.vop_close =		zfs_freebsd_close,
 	.vop_rmdir =		zfs_freebsd_rmdir,
 	.vop_ioctl =		zfs_freebsd_ioctl,
 	.vop_link =		zfs_freebsd_link,
 	.vop_symlink =		zfs_freebsd_symlink,
 	.vop_readlink =		zfs_freebsd_readlink,
 	.vop_read =		zfs_freebsd_read,
 	.vop_write =		zfs_freebsd_write,
 	.vop_remove =		zfs_freebsd_remove,
 	.vop_rename =		zfs_freebsd_rename,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 	.vop_bmap =		zfs_freebsd_bmap,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getextattr =	zfs_getextattr,
 	.vop_deleteextattr =	zfs_deleteextattr,
 	.vop_setextattr =	zfs_setextattr,
 	.vop_listextattr =	zfs_listextattr,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 	.vop_getpages =		zfs_freebsd_getpages,
 	.vop_putpages =		zfs_freebsd_putpages,
 	.vop_vptocnp =		zfs_vptocnp,
 #ifdef DIAGNOSTIC
 	/* Lock wrapper that adds a teardown-lock sanity check. */
 	.vop_lock1 =		zfs_lock,
 #endif
 };
 
 /*
  * Vnode operations vector for FIFOs on ZFS.  Operations not listed
  * here fall back to fifo_specops; read and write are wired to
  * VOP_PANIC.
  */
 struct vop_vector zfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_fsync =		zfs_freebsd_fsync,
 	.vop_access =		zfs_freebsd_access,
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_write =		VOP_PANIC,
 	.vop_pathconf = 	zfs_freebsd_pathconf,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 };
 
 /*
  * Vnode operations template for the special hidden share files.
  * Only access, inactive, reclaim, fid and pathconf are overridden;
  * everything else falls back to default_vnodeops.
  */
 struct vop_vector zfs_shareops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		zfs_freebsd_access,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 };
Index: stable/12/sys/fs/cd9660/cd9660_vnops.c
===================================================================
--- stable/12/sys/fs/cd9660/cd9660_vnops.c	(revision 340720)
+++ stable/12/sys/fs/cd9660/cd9660_vnops.c	(revision 340721)
@@ -1,921 +1,923 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley
  * by Pace Willisson (pace@blitz.com).  The Rock Ridge Extension
  * Support code is derived from software contributed to Berkeley
  * by Atsushi Murai (amurai@spec.co.jp).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)cd9660_vnops.c	8.19 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
 #include <sys/dirent.h>
 #include <sys/unistd.h>
 #include <sys/filio.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vnode_pager.h>
 #include <vm/uma.h>
 
 #include <fs/cd9660/iso.h>
 #include <fs/cd9660/cd9660_node.h>
 #include <fs/cd9660/iso_rrip.h>
 
 /*
  * Forward declarations of the file-local vnode operations and of the
  * directory-reading helpers defined later in this file.
  */
 static vop_setattr_t	cd9660_setattr;
 static vop_open_t	cd9660_open;
 static vop_access_t	cd9660_access;
 static vop_getattr_t	cd9660_getattr;
 static vop_ioctl_t	cd9660_ioctl;
 static vop_pathconf_t	cd9660_pathconf;
 static vop_read_t	cd9660_read;
 struct isoreaddir;
 static int iso_uiodir(struct isoreaddir *idp, struct dirent *dp, off_t off);
 static int iso_shipdir(struct isoreaddir *idp);
 static vop_readdir_t	cd9660_readdir;
 static vop_readlink_t	cd9660_readlink;
 static vop_strategy_t	cd9660_strategy;
 static vop_vptofh_t	cd9660_vptofh;
 static vop_getpages_t	cd9660_getpages;
 
 /*
  * Setattr call.
  *
  * Any attempt to change flags, owner, group, access/modification time
  * or mode fails with EROFS.  A size change fails with EISDIR on a
  * directory and with EROFS on a symbolic link or regular file; for
  * every other vnode type it is silently accepted.
  */
 static int
 cd9660_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vattr *attrs = ap->a_vap;
 
 	if (attrs->va_flags != (u_long)VNOVAL ||
 	    attrs->va_uid != (uid_t)VNOVAL ||
 	    attrs->va_gid != (gid_t)VNOVAL ||
 	    attrs->va_atime.tv_sec != VNOVAL ||
 	    attrs->va_mtime.tv_sec != VNOVAL ||
 	    attrs->va_mode != (mode_t)VNOVAL)
 		return (EROFS);
 
 	/* Nothing else to check unless a size change was requested. */
 	if (attrs->va_size == (u_quad_t)VNOVAL)
 		return (0);
 
 	switch (ap->a_vp->v_type) {
 	case VDIR:
 		return (EISDIR);
 	case VLNK:
 	case VREG:
 		return (EROFS);
 	default:
 		/* VCHR, VBLK, VSOCK, VFIFO, VNON, VBAD, VMARKER. */
 		return (0);
 	}
 }
 
 /*
  * Check access permission on a vnode.
  *
  * Character and block devices are not supported (EOPNOTSUPP).  Write
  * access to directories, symbolic links and regular files is refused
  * with EROFS.  Everything else is decided by vaccess() using the mode,
  * owner and group recorded in the ISO node.
  */
 /* ARGSUSED */
 static int
 cd9660_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct iso_node *node;
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Writes are only ever permitted on sockets, fifos and device
 	 * nodes resident on the filesystem.
 	 */
 	if ((ap->a_accmode & VWRITE) != 0 &&
 	    (vp->v_type == VDIR || vp->v_type == VLNK ||
 	    vp->v_type == VREG))
 		return (EROFS);
 
 	node = VTOI(vp);
 	return (vaccess(vp->v_type, node->inode.iso_mode,
 	    node->inode.iso_uid, node->inode.iso_gid, ap->a_accmode,
 	    ap->a_cred, NULL));
 }
 
 static int
 cd9660_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 		struct file *a_fp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct iso_node *ip = VTOI(vp);
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	vnode_create_vobject(vp, ip->i_size, ap->a_td);
 	return (0);
 }
 
 
 /*
  * Fill in *a_vap from the in-core ISO node of a_vp.
  *
  * Most fields are copied straight from the node.  For a symbolic link
  * whose recorded size is zero, the link target is read into a temporary
  * buffer so that va_size can report its length.  Always returns 0.
  */
 static int
 cd9660_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct iso_node *ip = VTOI(vp);
 
 	vap->va_fsid    = dev2udev(ip->i_mnt->im_dev);
 	vap->va_fileid	= ip->i_number;
 
 	vap->va_mode	= ip->inode.iso_mode;
 	vap->va_nlink	= ip->inode.iso_links;
 	vap->va_uid	= ip->inode.iso_uid;
 	vap->va_gid	= ip->inode.iso_gid;
 	vap->va_atime	= ip->inode.iso_atime;
 	vap->va_mtime	= ip->inode.iso_mtime;
 	vap->va_ctime	= ip->inode.iso_ctime;
 	vap->va_rdev	= ip->inode.iso_rdev;
 
 	vap->va_size	= (u_quad_t) ip->i_size;
 	if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) {
 		struct vop_readlink_args rdlnk;
 		struct iovec aiov;
 		struct uio auio;
 		char *cp;
 
 		/*
 		 * Read the link into a scratch buffer; only the number
 		 * of bytes produced is of interest.
 		 */
 		cp = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 		aiov.iov_base = cp;
 		aiov.iov_len = MAXPATHLEN;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = curthread;
 		auio.uio_resid = MAXPATHLEN;
 		rdlnk.a_uio = &auio;
 		rdlnk.a_vp = ap->a_vp;
 		rdlnk.a_cred = ap->a_cred;
 		/* On failure va_size keeps the recorded size (zero). */
 		if (cd9660_readlink(&rdlnk) == 0)
 			vap->va_size = MAXPATHLEN - auio.uio_resid;
 		free(cp, M_TEMP);
 	}
 	vap->va_flags	= 0;
 	vap->va_gen = 1;
 	vap->va_blocksize = ip->i_mnt->logical_block_size;
 	vap->va_bytes	= (u_quad_t) ip->i_size;
 	vap->va_type	= vp->v_type;
 	vap->va_filerev	= 0;
 	return (0);
 }
 
 /*
  * Vnode op for ioctl.
  */
 static int
 cd9660_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long  a_command;
 		caddr_t  a_data;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct iso_node *ip;
 	int error;
 
 	vp = ap->a_vp;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	if (vp->v_iflag & VI_DOOMED) {
 		VOP_UNLOCK(vp, 0);
 		return (EBADF);
 	}
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
 		VOP_UNLOCK(vp, 0);
 		return (EOPNOTSUPP);
 	}
 
 	ip = VTOI(vp);
 	error = 0;
 
 	switch (ap->a_command) {
 	case FIOGETLBA:
 		*(int *)(ap->a_data) = ip->iso_start;
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 
 	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 
 /*
  * Vnode op for reading.
  *
  * Copies file data to a_uio one logical block at a time until the
  * request is satisfied, the end of the file is reached or an error
  * occurs.  Character and block devices are rejected with EOPNOTSUPP,
  * a negative offset with EINVAL.  Depending on the MNT_NOCLUSTERR
  * mount flag, blocks are fetched with cluster_read() or with
  * bread()/breadn().
  */
 static int
 cd9660_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct iso_node *ip = VTOI(vp);
 	struct iso_mnt *imp;
 	struct buf *bp;
 	daddr_t lbn, rablock;
 	off_t diff;
 	int rasize, error = 0;
 	int seqcount;
 	long size, n, on;
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	/* Sequential-access hint carried in the upper bits of a_ioflag. */
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	imp = ip->i_mnt;
 	do {
 		lbn = lblkno(imp, uio->uio_offset);
 		on = blkoff(imp, uio->uio_offset);
 		n = MIN(imp->logical_block_size - on, uio->uio_resid);
 		/* Never read past the end of the file. */
 		diff = (off_t)ip->i_size - uio->uio_offset;
 		if (diff <= 0)
 			return (0);
 		if (diff < n)
 			n = diff;
 		size = blksize(imp, ip, lbn);
 		rablock = lbn + 1;
 		if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			if (lblktosize(imp, rablock) < ip->i_size)
 				error = cluster_read(vp, (off_t)ip->i_size,
 					 lbn, size, NOCRED, uio->uio_resid,
 					 seqcount, 0, &bp);
 			else
 				error = bread(vp, lbn, size, NOCRED, &bp);
 		} else {
 			/* Clustering disabled: read ahead one block at most. */
 			if (seqcount > 1 &&
 			    lblktosize(imp, rablock) < ip->i_size) {
 				rasize = blksize(imp, ip, rablock);
 				error = breadn(vp, lbn, size, &rablock,
 					       &rasize, 1, NOCRED, &bp);
 			} else
 				error = bread(vp, lbn, size, NOCRED, &bp);
 		}
 		if (error != 0)
 			return (error);
 		/* Limit the copy to what the buffer actually holds. */
 		n = MIN(n, size - bp->b_resid);
 
 		error = uiomove(bp->b_data + on, (int)n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	return (error);
 }
 
 /*
  * Structure for reading directories
  *
  * Working state shared by cd9660_readdir(), iso_shipdir() and
  * iso_uiodir() while one directory is being converted into a stream of
  * struct dirent records.
  */
 struct isoreaddir {
 	struct dirent saveent;	/* pending regular entry (d_namlen 0: none) */
 	struct dirent assocent;	/* pending associated entry (d_namlen 0: none) */
 	struct dirent current;	/* entry currently being built */
 	off_t saveoff;		/* offset recorded with saveent */
 	off_t assocoff;		/* offset recorded with assocent */
 	off_t curroff;		/* current offset within the directory */
 	struct uio *uio;	/* destination of the dirent records */
 	off_t uio_off;		/* offset of the last entry handed to uio */
 	int eofflag;		/* cleared when the output space runs out */
 	u_long *cookies;	/* next free cookie slot, or NULL */
 	int ncookies;		/* cookie slots still available */
 };
 
 static int
 iso_uiodir(idp,dp,off)
 	struct isoreaddir *idp;
 	struct dirent *dp;
 	off_t off;
 {
 	int error;
 
 	dp->d_name[dp->d_namlen] = 0;
 	dp->d_reclen = GENERIC_DIRSIZ(dp);
 
 	if (idp->uio->uio_resid < dp->d_reclen) {
 		idp->eofflag = 0;
 		return (-1);
 	}
 
 	if (idp->cookies) {
 		if (idp->ncookies <= 0) {
 			idp->eofflag = 0;
 			return (-1);
 		}
 
 		*idp->cookies++ = off;
 		--idp->ncookies;
 	}
 
 	if ((error = uiomove(dp, dp->d_reclen, idp->uio)) != 0)
 		return (error);
 	idp->uio_off = off;
 	return (0);
 }
 
 /*
  * Queue the current entry, flushing the pending ones first when the
  * name changes.
  *
  * The current name (without its leading ASSOCCHAR, if it has one and
  * is longer than one character) is compared with the pending name: the
  * saved entry's, or the associated entry's minus its first character
  * when no saved entry is pending.  If they differ, the pending
  * associated and saved entries are written out with iso_uiodir().  The
  * current entry is then stored as the new pending associated or saved
  * entry.  Returns 0 or the value returned by iso_uiodir().
  */
 static int
 iso_shipdir(idp)
 	struct isoreaddir *idp;
 {
 	struct dirent *dp;
 	int cl, sl, assoc;
 	int error;
 	char *cname, *sname;
 
 	cl = idp->current.d_namlen;
 	cname = idp->current.d_name;
 	/* An associated entry is marked by a leading ASSOCCHAR. */
 	assoc = (cl > 1) && (*cname == ASSOCCHAR);
 	if (assoc) {
 		cl--;
 		cname++;
 	}
 
 	dp = &idp->saveent;
 	sname = dp->d_name;
 	/* No saved entry pending: compare against the associated one. */
 	if (!(sl = dp->d_namlen)) {
 		dp = &idp->assocent;
 		sname = dp->d_name + 1;
 		sl = dp->d_namlen - 1;
 	}
 	if (sl > 0) {
 		if (sl != cl
 		    || bcmp(sname,cname,sl)) {
 			/* Name changed: write out whatever is pending. */
 			if (idp->assocent.d_namlen) {
 				if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0)
 					return (error);
 				idp->assocent.d_namlen = 0;
 			}
 			if (idp->saveent.d_namlen) {
 				if ((error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0)
 					return (error);
 				idp->saveent.d_namlen = 0;
 			}
 		}
 	}
 	/* Only d_reclen bytes of the current entry are copied. */
 	idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current);
 	if (assoc) {
 		idp->assocoff = idp->curroff;
 		memcpy(&idp->assocent, &idp->current, idp->current.d_reclen);
 	} else {
 		idp->saveoff = idp->curroff;
 		memcpy(&idp->saveent, &idp->current, idp->current.d_reclen);
 	}
 	return (0);
 }
 
 /*
  * Vnode op for readdir
  *
  * Walks the ISO directory records starting at uio_offset, converts
  * each into a struct dirent and passes it to the caller's uio through
  * iso_uiodir() (directly, or via iso_shipdir() for ISO_FTYPE_DEFAULT).
  * On return uio_offset is set to the offset of the last entry handed
  * out, *a_eofflag (when supplied) reports whether the directory was
  * exhausted, and the cookie array is returned through a_cookies /
  * a_ncookies when cookies were requested and no error occurred.
  */
 static int
 cd9660_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct uio *uio = ap->a_uio;
 	struct isoreaddir *idp;
 	struct vnode *vdp = ap->a_vp;
 	struct iso_node *dp;
 	struct iso_mnt *imp;
 	struct buf *bp = NULL;
 	struct iso_directory_record *ep;
 	int entryoffsetinblock;
 	doff_t endsearch;
 	u_long bmask;
 	int error = 0;
 	int reclen;
 	u_short namelen;
 	u_int ncookies = 0;
 	u_long *cookies = NULL;
 	cd_ino_t ino;
 
 	dp = VTOI(vdp);
 	imp = dp->i_mnt;
 	bmask = imp->im_bmask;
 
 	idp = malloc(sizeof(*idp), M_TEMP, M_WAITOK);
 	idp->saveent.d_namlen = idp->assocent.d_namlen = 0;
 	/*
 	 * XXX
 	 * Is it worth trying to figure out the type?
 	 */
 	idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type =
 	    DT_UNKNOWN;
 	idp->uio = uio;
 	if (ap->a_ncookies == NULL) {
 		idp->cookies = NULL;
 	} else {
 		/*
 		 * Guess the number of cookies needed.
 		 */
 		ncookies = uio->uio_resid / 16;
 		cookies = malloc(ncookies * sizeof(u_long),
 		    M_TEMP, M_WAITOK);
 		idp->cookies = cookies;
 		idp->ncookies = ncookies;
 	}
 	idp->eofflag = 1;
 	idp->curroff = uio->uio_offset;
 	idp->uio_off = uio->uio_offset;
 
 	/* Starting in mid-block: the block must be read up front. */
 	if ((entryoffsetinblock = idp->curroff & bmask) &&
 	    (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) {
 		/*
 		 * Release the cookie array as well; it is NULL (and
 		 * free() does nothing) when no cookies were requested.
 		 */
 		free(cookies, M_TEMP);
 		free(idp, M_TEMP);
 		return (error);
 	}
 	endsearch = dp->i_size;
 
 	while (idp->curroff < endsearch) {
 		/*
 		 * If offset is on a block boundary,
 		 * read the next directory block.
 		 * Release previous if it exists.
 		 */
 		if ((idp->curroff & bmask) == 0) {
 			if (bp != NULL)
 				brelse(bp);
 			if ((error =
 			    cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0)
 				break;
 			entryoffsetinblock = 0;
 		}
 		/*
 		 * Get pointer to next entry.
 		 */
 		ep = (struct iso_directory_record *)
 			((char *)bp->b_data + entryoffsetinblock);
 
 		reclen = isonum_711(ep->length);
 		if (reclen == 0) {
 			/* skip to next block, if any */
 			idp->curroff =
 			    (idp->curroff & ~bmask) + imp->logical_block_size;
 			continue;
 		}
 
 		if (reclen < ISO_DIRECTORY_RECORD_SIZE) {
 			error = EINVAL;
 			/* illegal entry, stop */
 			break;
 		}
 
 		if (entryoffsetinblock + reclen > imp->logical_block_size) {
 			error = EINVAL;
 			/* illegal directory, so stop looking */
 			break;
 		}
 
 		idp->current.d_namlen = isonum_711(ep->name_len);
 
 		if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) {
 			error = EINVAL;
 			/* illegal entry, stop */
 			break;
 		}
 
 		/* Flag bit 2 takes the file number from isodirino(). */
 		if (isonum_711(ep->flags)&2)
 			idp->current.d_fileno = isodirino(ep, imp);
 		else
 			idp->current.d_fileno = dbtob(bp->b_blkno) +
 				entryoffsetinblock;
 
 		idp->curroff += reclen;
+		/* NOTE: d_off is the offset of *next* entry. */
+		idp->current.d_off = idp->curroff;
 
 		switch (imp->iso_ftype) {
 		case ISO_FTYPE_RRIP:
 			ino = idp->current.d_fileno;
 			cd9660_rrip_getname(ep, idp->current.d_name, &namelen,
 			    &ino, imp);
 			idp->current.d_fileno = ino;
 			idp->current.d_namlen = (u_char)namelen;
 			if (idp->current.d_namlen)
 				error = iso_uiodir(idp,&idp->current,idp->curroff);
 			break;
 		default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/
 			/*
 			 * One-byte names 0 and 1 are reported as "." and
 			 * ".." by taking one or two characters of "..".
 			 */
 			strcpy(idp->current.d_name,"..");
 			if (idp->current.d_namlen == 1 && ep->name[0] == 0) {
 				idp->current.d_namlen = 1;
 				error = iso_uiodir(idp,&idp->current,idp->curroff);
 			} else if (idp->current.d_namlen == 1 && ep->name[0] == 1) {
 				idp->current.d_namlen = 2;
 				error = iso_uiodir(idp,&idp->current,idp->curroff);
 			} else {
 				isofntrans(ep->name,idp->current.d_namlen,
 					   idp->current.d_name, &namelen,
 					   imp->iso_ftype == ISO_FTYPE_9660,
 					   isonum_711(ep->flags)&4,
 					   imp->joliet_level,
 					   imp->im_flags,
 					   imp->im_d2l);
 				idp->current.d_namlen = (u_char)namelen;
 				if (imp->iso_ftype == ISO_FTYPE_DEFAULT)
 					error = iso_shipdir(idp);
 				else
 					error = iso_uiodir(idp,&idp->current,idp->curroff);
 			}
 		}
 		if (error)
 			break;
 
 		entryoffsetinblock += reclen;
 	}
 
 	/* Flush the entries iso_shipdir() still has pending. */
 	if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) {
 		idp->current.d_namlen = 0;
 		error = iso_shipdir(idp);
 	}
 	/* A negative value only means "output full", not a failure. */
 	if (error < 0)
 		error = 0;
 
 	if (ap->a_ncookies != NULL) {
 		if (error)
 			free(cookies, M_TEMP);
 		else {
 			/*
 			 * Work out the number of cookies actually used.
 			 */
 			*ap->a_ncookies = ncookies - idp->ncookies;
 			*ap->a_cookies = cookies;
 		}
 	}
 
 	if (bp)
 		brelse (bp);
 
 	uio->uio_offset = idp->uio_off;
 	/* The caller may pass NULL when it does not want the EOF status. */
 	if (ap->a_eofflag != NULL)
 		*ap->a_eofflag = idp->eofflag;
 
 	free(idp, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Return the target name of a symbolic link.
  *
  * Only Rock Ridge (ISO_FTYPE_RRIP) mounts have symbolic links; any
  * other type, a failed block read, a directory record that crosses the
  * block boundary, or a failed symlink-name extraction yields EINVAL.
  *
  * Shouldn't we get the parent vnode and read the data from there?
  * This could eventually result in deadlocks in cd9660_lookup.
  * But otherwise the block read here is in the block buffer two times.
  */
 typedef struct iso_directory_record ISODIR;
 typedef struct iso_node		    ISONODE;
 typedef struct iso_mnt		    ISOMNT;
 static int
 cd9660_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	ISONODE	*ip;
 	ISODIR	*dirp;
 	ISOMNT	*imp;
 	struct	buf *bp;
 	struct	uio *uio;
 	u_short	symlen;
 	int	error;
 	char	*symname;
 
 	ip  = VTOI(ap->a_vp);
 	imp = ip->i_mnt;
 	uio = ap->a_uio;
 
 	if (imp->iso_ftype != ISO_FTYPE_RRIP)
 		return (EINVAL);
 
 	/*
 	 * Read the device block holding the parent's directory record
 	 * for this inode; the block is located from i_number.
 	 */
 	error = bread(imp->im_devvp,
 		      (ip->i_number >> imp->im_bshift) <<
 		      (imp->im_bshift - DEV_BSHIFT),
 		      imp->logical_block_size, NOCRED, &bp);
 	if (error) {
 		brelse(bp);
 		return (EINVAL);
 	}
 
 	/*
 	 * Set up the directory record pointer for this inode.
 	 */
 	dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask));
 
 	/*
 	 * Sanity check: the record must not cross the end of the block.
 	 */
 	if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length)
 	    > (unsigned)imp->logical_block_size) {
 		brelse(bp);
 		return (EINVAL);
 	}
 
 	/*
 	 * Pick the buffer that receives the name: the caller's own buffer
 	 * for kernel-space requests, otherwise a temporary namei buffer.
 	 */
 	if (uio->uio_segflg == UIO_SYSSPACE)
 		symname = uio->uio_iov->iov_base;
 	else
 		symname = uma_zalloc(namei_zone, M_WAITOK);
 
 	/*
 	 * Gather the symbolic link name from the SL record.
 	 */
 	if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) {
 		if (uio->uio_segflg != UIO_SYSSPACE)
 			uma_zfree(namei_zone, symname);
 		brelse(bp);
 		return (EINVAL);
 	}
 	/*
 	 * The directory block is no longer needed.
 	 */
 	brelse(bp);
 
 	/*
 	 * Return the symbolic link name to the caller.
 	 */
 	if (uio->uio_segflg != UIO_SYSSPACE) {
 		error = uiomove(symname, symlen, uio);
 		uma_zfree(namei_zone, symname);
 		return (error);
 	}
 	/* Kernel-space case: the name is in place, just advance the uio. */
 	uio->uio_resid -= symlen;
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + symlen;
 	uio->uio_iov->iov_len -= symlen;
 	return (0);
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  */
 static int
 cd9660_strategy(ap)
 	struct vop_strategy_args /* {
 		struct buf *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
 	struct iso_node *ip;
 	struct bufobj *bo;
 
 	ip = VTOI(vp);
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		panic("cd9660_strategy: spec");
 	if (bp->b_blkno == bp->b_lblkno) {
 		bp->b_blkno = (ip->iso_start + bp->b_lblkno) <<
 		    (ip->i_mnt->im_bshift - DEV_BSHIFT);
 	}
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bo = ip->i_mnt->im_bo;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 /*
  * Return POSIX pathconf information applicable to cd9660 filesystems.
  */
 static int
 cd9660_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 32;
 		return (0);
 	case _PC_LINK_MAX:
 		*ap->a_retval = 1;
 		return (0);
 	case _PC_NAME_MAX:
 		if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP)
 			*ap->a_retval = NAME_MAX;
 		else
 			*ap->a_retval = 37;
 		return (0);
 	case _PC_SYMLINK_MAX:
 		if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) {
 			*ap->a_retval = MAXPATHLEN;
 			return (0);
 		}
 		return (EINVAL);
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		return (0);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Vnode pointer to File handle
  *
  * Builds a struct ifid from the node's inode number and starting block
  * and copies it into the caller's file handle.  Always returns 0.
  */
 static int
 cd9660_vptofh(ap)
 	struct vop_vptofh_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fhp;
 	} */ *ap;
 {
 	struct ifid ifh;
 	struct iso_node *ip = VTOI(ap->a_vp);
 
 	ifh.ifid_len = sizeof(struct ifid);
 
 	ifh.ifid_ino = ip->i_number;
 	ifh.ifid_start = ip->iso_start;
 	/*
 	 * This intentionally uses sizeof(ifh) in order to not copy stack
 	 * garbage on ILP32.
 	 */
 	memcpy(ap->a_fhp, &ifh, sizeof(ifh));
 
 #ifdef	ISOFS_DBG
 	/* %ju matches the uintmax_t cast of the inode number. */
 	printf("vptofh: ino %ju, start %ld\n",
 	    (uintmax_t)ifh.ifid_ino, ifh.ifid_start);
 #endif
 
 	return (0);
 }
 
 /*
  * vfs.cd9660 sysctl node and its use_buf_pager knob (default 1), which
  * selects the getpages implementation used by cd9660_getpages().
  */
 SYSCTL_NODE(_vfs, OID_AUTO, cd9660, CTLFLAG_RW, 0, "cd9660 filesystem");
 static int use_buf_pager = 1;
 SYSCTL_INT(_vfs_cd9660, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN,
     &use_buf_pager, 0,
     "Use buffer pager instead of bmap");
 
 /*
  * Buffer pager callback: logical block number containing byte offset
  * "off" of the file, per the mount's block size.
  */
 static daddr_t
 cd9660_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 {
 	struct iso_mnt *imp;
 
 	imp = VTOI(vp)->i_mnt;
 	return (lblkno(imp, off));
 }
 
 /*
  * Buffer pager callback: size of logical block "lbn" of the file, as
  * computed by blksize().
  */
 static int
 cd9660_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 {
 	struct iso_node *node = VTOI(vp);
 	struct iso_mnt *imp = node->i_mnt;
 
 	return (blksize(imp, node, lbn));
 }
 
 /*
  * Getpages vnode op.  Character and block devices are rejected with
  * EOPNOTSUPP.  Otherwise the request goes to the buffer pager when the
  * use_buf_pager sysctl is set, and to the generic vnode pager when it
  * is not.
  */
 static int
 cd9660_getpages(struct vop_getpages_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	if (!use_buf_pager) {
 		return (vnode_pager_generic_getpages(vp, ap->a_m,
 		    ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL));
 	}
 
 	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, cd9660_gbp_getblkno,
 	    cd9660_gbp_getblksz));
 }
 
 /*
  * Global vfs data structures for cd9660
  *
  * Vnode operations vector for cd9660 vnodes; operations not listed
  * here fall back to default_vnodeops.
  */
 struct vop_vector cd9660_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_open =		cd9660_open,
 	.vop_access =		cd9660_access,
 	.vop_bmap =		cd9660_bmap,
 	.vop_cachedlookup =	cd9660_lookup,
 	.vop_getattr =		cd9660_getattr,
 	.vop_inactive =		cd9660_inactive,
 	.vop_ioctl =		cd9660_ioctl,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_pathconf =		cd9660_pathconf,
 	.vop_read =		cd9660_read,
 	.vop_readdir =		cd9660_readdir,
 	.vop_readlink =		cd9660_readlink,
 	.vop_reclaim =		cd9660_reclaim,
 	.vop_setattr =		cd9660_setattr,
 	.vop_strategy =		cd9660_strategy,
 	.vop_vptofh =		cd9660_vptofh,
 	.vop_getpages =		cd9660_getpages,
 };
 
 /*
  * Special device vnode ops
  *
  * Vnode operations vector for FIFOs found on a cd9660 filesystem;
  * operations not listed here fall back to fifo_specops.
  */
 
 struct vop_vector cd9660_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		cd9660_access,
 	.vop_getattr =		cd9660_getattr,
 	.vop_inactive =		cd9660_inactive,
 	.vop_reclaim =		cd9660_reclaim,
 	.vop_setattr =		cd9660_setattr,
 	.vop_vptofh =		cd9660_vptofh,
 };
Index: stable/12/sys/fs/devfs/devfs_vnops.c
===================================================================
--- stable/12/sys/fs/devfs/devfs_vnops.c	(revision 340720)
+++ stable/12/sys/fs/devfs/devfs_vnops.c	(revision 340721)
@@ -1,1966 +1,1968 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2004
  *	Poul-Henning Kamp.  All rights reserved.
  * Copyright (c) 1989, 1992-1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kernfs_vnops.c	8.15 (Berkeley) 5/21/95
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43
  *
  * $FreeBSD$
  */
 
 /*
  * TODO:
  *	mkdir: want it ?
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/ttycom.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 static struct vop_vector devfs_vnodeops;
 static struct vop_vector devfs_specops;
 static struct fileops devfs_ops_f;
 
 #include <fs/devfs/devfs.h>
 #include <fs/devfs/devfs_int.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 /* Allocation type used for the struct cdev_privdata records in this file. */
 static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data");
 
 /*
  * Taken in this file around updates of the de->de_vnode / vp->v_data
  * linkage between a devfs dirent and its vnode.
  */
 struct mtx	devfs_de_interlock;
 MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF);
 /* Held shared in this file while the dev_clone event handlers run. */
 struct sx	clone_drain_lock;
 SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock");
 /* Taken around updates of fp->f_cdevpriv and the per-cdev cdp_fdpriv list. */
 struct mtx	cdevpriv_mtx;
 MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF);
 
 SYSCTL_DECL(_vfs_devfs);
 
 /*
  * Backing variable of the "dotimes" sysctl.  When non-zero,
  * devfs_timestamp() calls vfs_timestamp(); when zero it only records
  * whole seconds.
  */
 static int devfs_dotimes;
 SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW,
     &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision");
 
 /*
  * Refresh a devfs node timestamp in place.
  *
  * When the dotimes sysctl is enabled the value comes from
  * vfs_timestamp().  Otherwise the stamp is only moved to the current
  * second (time_second) with the nanosecond part cleared, and is left
  * alone if it already names that second.  No lock is taken, so
  * stat(2) could see partially updated times.
  */
 static void
 devfs_timestamp(struct timespec *tsp)
 {
 	time_t now;
 
 	if (devfs_dotimes) {
 		vfs_timestamp(tsp);
 		return;
 	}
 
 	now = time_second;
 	if (tsp->tv_sec == now)
 		return;
 	tsp->tv_sec = now;
 	tsp->tv_nsec = 0;
 }
 
 /*
  * Check that fp still refers to its character device and take a
  * thread reference on the device switch.
  *
  * *dswp, *devp and *ref receive the results of devvn_refthread() on
  * fp->f_vnode.  Returns ENXIO when the device found no longer matches
  * fp->f_data (any reference obtained is released first) or when no
  * cdevsw could be referenced.  On success returns 0 and records fp in
  * curthread->td_fpop; the caller then owns the reference and must
  * release it with dev_relthread().
  */
 static int
 devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp,
     int *ref)
 {
 
 	*dswp = devvn_refthread(fp->f_vnode, devp, ref);
 	if (*devp != fp->f_data) {
 		/* The vnode now names a different device; undo the reference. */
 		if (*dswp != NULL)
 			dev_relthread(*devp, *ref);
 		return (ENXIO);
 	}
 	KASSERT((*devp)->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp)));
 	if (*dswp == NULL)
 		return (ENXIO);
 	/* Publish the file so cdevpriv accessors can find it. */
 	curthread->td_fpop = fp;
 	return (0);
 }
 
 /*
  * Fetch the driver-private data attached to the file currently being
  * operated on (curthread->td_fpop).
  *
  * Returns EBADF when there is no current file, ENOENT when the file
  * carries no private data, and 0 with *datap filled in otherwise.
  * *datap is written only on success.
  */
 int
 devfs_get_cdevpriv(void **datap)
 {
 	struct file *fp;
 	struct cdev_privdata *priv;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (EBADF);
 
 	priv = fp->f_cdevpriv;
 	if (priv == NULL)
 		return (ENOENT);
 
 	*datap = priv->cdpd_data;
 	return (0);
 }
 
 /*
  * Attach driver-private data priv, with destructor priv_dtr, to the
  * file currently being operated on (curthread->td_fpop).
  *
  * Returns ENOENT when there is no current file, EBUSY when the file
  * already has private data (the existing data is kept and the new
  * record is freed), and 0 on success.
  */
 int
 devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr)
 {
 	struct file *fp;
 	struct cdev_priv *cdp;
 	struct cdev_privdata *p;
 	int error;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (ENOENT);
 	cdp = cdev2priv((struct cdev *)fp->f_data);
 	/* Allocate with M_WAITOK before taking the mutex. */
 	p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK);
 	p->cdpd_data = priv;
 	p->cdpd_dtr = priv_dtr;
 	p->cdpd_fp = fp;
 	mtx_lock(&cdevpriv_mtx);
 	if (fp->f_cdevpriv == NULL) {
 		/* First registration: link into the device list and the file. */
 		LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list);
 		fp->f_cdevpriv = p;
 		mtx_unlock(&cdevpriv_mtx);
 		error = 0;
 	} else {
 		/* Private data is already present; discard the new record. */
 		mtx_unlock(&cdevpriv_mtx);
 		free(p, M_CDEVPDATA);
 		error = EBUSY;
 	}
 	return (error);
 }
 
 /*
  * Detach and destroy the private data record p.
  *
  * Must be entered with cdevpriv_mtx held; the mutex is released here,
  * before the destructor is called.  The record is unlinked from its
  * file and from the list it is on, then the registered destructor is
  * run on the stored data and the record itself is freed.
  */
 void
 devfs_destroy_cdevpriv(struct cdev_privdata *p)
 {
 
 	mtx_assert(&cdevpriv_mtx, MA_OWNED);
 	KASSERT(p->cdpd_fp->f_cdevpriv == p,
 	    ("devfs_destroy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p));
 	p->cdpd_fp->f_cdevpriv = NULL;
 	LIST_REMOVE(p, cdpd_list);
 	/* Drop the mutex before calling out to the driver's destructor. */
 	mtx_unlock(&cdevpriv_mtx);
 	(p->cdpd_dtr)(p->cdpd_data);
 	free(p, M_CDEVPDATA);
 }
 
 /*
  * Destroy the private data attached to fp, if there is any.
  * cdevpriv_mtx is taken here and is always released by the time this
  * function returns.
  */
 static void
 devfs_fpdrop(struct file *fp)
 {
 	struct cdev_privdata *priv;
 
 	mtx_lock(&cdevpriv_mtx);
 	priv = fp->f_cdevpriv;
 	if (priv != NULL) {
 		/* devfs_destroy_cdevpriv() releases cdevpriv_mtx itself. */
 		devfs_destroy_cdevpriv(priv);
 		return;
 	}
 	mtx_unlock(&cdevpriv_mtx);
 }
 
 /*
  * Drop the private data of the file currently being operated on
  * (curthread->td_fpop).  Does nothing when there is no current file.
  */
 void
 devfs_clear_cdevpriv(void)
 {
 	struct file *fp = curthread->td_fpop;
 
 	if (fp != NULL)
 		devfs_fpdrop(fp);
 }
 
 /*
  * Run devfs_populate() for the mount that vp belongs to.
  *
  * The vnode must be locked on entry.  Its lock is dropped around
  * devfs_populate() and re-taken in the same mode afterwards, so the
  * vnode may have been reclaimed in the meantime.
  *
  * On success devfs_populate_vp() returns with dmp->dm_lock held.
  * On failure it returns ERESTART with dm_lock released: the mount
  * went away, the vnode was doomed, or its dirent was doomed.
  */
 static int
 devfs_populate_vp(struct vnode *vp)
 {
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	int locked;
 
 	ASSERT_VOP_LOCKED(vp, "devfs_populate_vp");
 
 	dmp = VFSTODEVFS(vp->v_mount);
 	/* Remember the current lock mode so it can be restored below. */
 	locked = VOP_ISLOCKED(vp);
 
 	sx_xlock(&dmp->dm_lock);
 	/* Hold the mount structure across the window without the vnode lock. */
 	DEVFS_DMP_HOLD(dmp);
 
 	/* Can't call devfs_populate() with the vnode lock held. */
 	VOP_UNLOCK(vp, 0);
 	devfs_populate(dmp);
 
 	/* dm_lock is released before the vnode lock is re-taken. */
 	sx_xunlock(&dmp->dm_lock);
 	vn_lock(vp, locked | LK_RETRY);
 	sx_xlock(&dmp->dm_lock);
 	if (DEVFS_DMP_DROP(dmp)) {
 		/* Ours was the last hold on the mount: finish the unmount. */
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ERESTART);
 	}
 	if ((vp->v_iflag & VI_DOOMED) != 0) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ERESTART);
 	}
 	de = vp->v_data;
 	KASSERT(de != NULL,
 	    ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed"));
 	if ((de->de_flags & DE_DOOMED) != 0) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ERESTART);
 	}
 
 	return (0);
 }
 
 /*
  * VOP_VPTOCNP for devfs: produce the name of vp and a referenced
  * vnode for the directory to continue the reverse lookup from.
  *
  * The name is copied, without a terminating NUL, to the end of the
  * free space in a_buf and *a_buflen is reduced to the offset at which
  * it starts.  For a character device the whole si_name is written and
  * the search continues from the parent of the dirent's directory; for
  * a directory only its own d_name is written.  The devfs root returns
  * itself with nothing written.
  *
  * Returns ENOMEM when the name does not fit, ENOENT for other vnode
  * types or when no parent dirent or parent vnode exists, or the error
  * from devfs_populate_vp().
  */
 static int
 devfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct devfs_mount *dmp;
 	char *buf = ap->a_buf;
 	int *buflen = ap->a_buflen;
 	struct devfs_dirent *dd, *de;
 	int i, error;
 
 	dmp = VFSTODEVFS(vp->v_mount);
 
 	/* On success this returns with dm_lock held; released at "finished". */
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	i = *buflen;
 	dd = vp->v_data;
 
 	if (vp->v_type == VCHR) {
 		i -= strlen(dd->de_cdp->cdp_c.si_name);
 		if (i < 0) {
 			error = ENOMEM;
 			goto finished;
 		}
 		bcopy(dd->de_cdp->cdp_c.si_name, buf + i,
 		    strlen(dd->de_cdp->cdp_c.si_name));
 		de = dd->de_dir;
 	} else if (vp->v_type == VDIR) {
 		if (dd == dmp->dm_rootdir) {
 			/* The root is reported as itself, with a new reference. */
 			*dvp = vp;
 			vref(*dvp);
 			goto finished;
 		}
 		i -= dd->de_dirent->d_namlen;
 		if (i < 0) {
 			error = ENOMEM;
 			goto finished;
 		}
 		bcopy(dd->de_dirent->d_name, buf + i,
 		    dd->de_dirent->d_namlen);
 		de = dd;
 	} else {
 		error = ENOENT;
 		goto finished;
 	}
 	*buflen = i;
 	de = devfs_parent_dirent(de);
 	if (de == NULL) {
 		error = ENOENT;
 		goto finished;
 	}
 	mtx_lock(&devfs_de_interlock);
 	*dvp = de->de_vnode;
 	if (*dvp != NULL) {
 		/*
 		 * Take the vnode interlock before releasing the devfs
 		 * interlock, and hold the vnode while the use reference
 		 * is acquired.
 		 */
 		VI_LOCK(*dvp);
 		mtx_unlock(&devfs_de_interlock);
 		vholdl(*dvp);
 		VI_UNLOCK(*dvp);
 		vref(*dvp);
 		vdrop(*dvp);
 	} else {
 		mtx_unlock(&devfs_de_interlock);
 		error = ENOENT;
 	}
 finished:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /*
  * Build the fully qualified path name of an entry, relative to the
  * mountpoint, into buf.
  *
  * The name is assembled right to left, ending at buf[SPECNAMELEN],
  * which receives the terminating NUL.  When cnp is non-NULL its
  * component name becomes the last path element and is joined to the
  * directory names with '/'.  When cnp is NULL only the names from dd
  * up to the root are emitted and no '/' is appended to the result.
  *
  * dmp->dm_lock must be held.  Returns a pointer to the start of the
  * name inside buf, or NULL if it does not fit or an entry has no
  * parent dirent.
  */
 char *
 devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd,
     struct componentname *cnp)
 {
 	struct devfs_dirent *cur;
 	int pos;
 
 	sx_assert(&dmp->dm_lock, SA_LOCKED);
 
 	pos = SPECNAMELEN;
 	buf[pos] = '\0';
 	if (cnp != NULL) {
 		pos -= cnp->cn_namelen;
 		if (pos < 0)
 			return (NULL);
 		bcopy(cnp->cn_nameptr, buf + pos, cnp->cn_namelen);
 	}
 
 	for (cur = dd; cur != dmp->dm_rootdir; ) {
 		/* A separator is needed once anything is already in buf. */
 		if (cnp != NULL || pos < SPECNAMELEN) {
 			if (--pos < 0)
 				return (NULL);
 			buf[pos] = '/';
 		}
 		pos -= cur->de_dirent->d_namlen;
 		if (pos < 0)
 			return (NULL);
 		bcopy(cur->de_dirent->d_name, buf + pos,
 		    cur->de_dirent->d_namlen);
 
 		cur = devfs_parent_dirent(cur);
 		if (cur == NULL)
 			return (NULL);
 	}
 	return (buf + pos);
 }
 
 /*
  * Release the dirent and mount holds taken by devfs_allocv() and
  * report whether the dirent is still usable.
  *
  * Returns 0 when the dirent is alive, 1 when it was doomed, and 2 when
  * dropping the mount hold released the last hold on the mount (in
  * which case devfs_unmount_final() has been called).
  *
  * dm_lock is released whenever the result is non-zero.  When the
  * result is 0 it is released only if drop_dm_lock is non-zero.
  */
 static int
 devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp,
 	struct devfs_dirent *de)
 {
 	int not_found;
 
 	not_found = 0;
 	/* Sample the flag before the hold is dropped and de may be freed. */
 	if (de->de_flags & DE_DOOMED)
 		not_found = 1;
 	if (DEVFS_DE_DROP(de)) {
 		KASSERT(not_found == 1, ("DEVFS de dropped but not doomed"));
 		devfs_dirent_free(de);
 	}
 	if (DEVFS_DMP_DROP(dmp)) {
 		KASSERT(not_found == 1,
 			("DEVFS mount struct freed before dirent"));
 		not_found = 2;
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 	}
 	/* In the not_found == 2 case the lock was already released above. */
 	if (not_found == 1 || (drop_dm_lock && not_found != 2))
 		sx_unlock(&dmp->dm_lock);
 	return (not_found);
 }
 
 /*
  * Destructor passed to insmntque1() by devfs_allocv(): break the
  * association between the vnode and its dirent (passed as arg), then
  * dispose of the vnode with vgone() and vput().
  */
 static void
 devfs_insmntque_dtr(struct vnode *vp, void *arg)
 {
 	struct devfs_dirent *dirent = arg;
 
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = NULL;
 	dirent->de_vnode = NULL;
 	mtx_unlock(&devfs_de_interlock);
 
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Return, in *vpp, a locked and referenced vnode for dirent de on
  * mount mp, creating the vnode if the dirent has none yet.
  *
  * devfs_allocv shall be entered with dmp->dm_lock held, and it drops
  * it on return.
  *
  * lockmode is the vnode lock mode used when an existing vnode is
  * reused; a newly created vnode is locked exclusively.  Returns 0 on
  * success, ENOENT when the dirent (or its device) is no longer
  * usable, or the error from getnewvnode() / insmntque1().
  */
 int
 devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode,
     struct vnode **vpp)
 {
 	int error;
 	struct vnode *vp;
 	struct cdev *dev;
 	struct devfs_mount *dmp;
 	struct cdevsw *dsw;
 
 	dmp = VFSTODEVFS(mp);
 	if (de->de_flags & DE_DOOMED) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ENOENT);
 	}
 loop:
 	/* Keep the dirent and the mount alive while dm_lock may be dropped. */
 	DEVFS_DE_HOLD(de);
 	DEVFS_DMP_HOLD(dmp);
 	mtx_lock(&devfs_de_interlock);
 	vp = de->de_vnode;
 	if (vp != NULL) {
 		/* The dirent already has a vnode: try to reuse it. */
 		VI_LOCK(vp);
 		mtx_unlock(&devfs_de_interlock);
 		sx_xunlock(&dmp->dm_lock);
 		vget(vp, lockmode | LK_INTERLOCK | LK_RETRY, curthread);
 		sx_xlock(&dmp->dm_lock);
 		if (devfs_allocv_drop_refs(0, dmp, de)) {
 			/* Dirent or mount died meanwhile; dm_lock is released. */
 			vput(vp);
 			return (ENOENT);
 		}
 		else if ((vp->v_iflag & VI_DOOMED) != 0) {
 			/*
 			 * The vnode was reclaimed while dm_lock was not
 			 * held.  Detach it if it is still linked and start
 			 * over to get a fresh one.
 			 */
 			mtx_lock(&devfs_de_interlock);
 			if (de->de_vnode == vp) {
 				de->de_vnode = NULL;
 				vp->v_data = NULL;
 			}
 			mtx_unlock(&devfs_de_interlock);
 			vput(vp);
 			goto loop;
 		}
 		sx_xunlock(&dmp->dm_lock);
 		*vpp = vp;
 		return (0);
 	}
 	mtx_unlock(&devfs_de_interlock);
 	if (de->de_dirent->d_type == DT_CHR) {
 		/* Do not create vnodes for devices that are not active. */
 		if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) {
 			devfs_allocv_drop_refs(1, dmp, de);
 			return (ENOENT);
 		}
 		dev = &de->de_cdp->cdp_c;
 	} else {
 		dev = NULL;
 	}
 	error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp);
 	if (error != 0) {
 		devfs_allocv_drop_refs(1, dmp, de);
 		printf("devfs_allocv: failed to allocate new vnode\n");
 		return (error);
 	}
 
 	if (de->de_dirent->d_type == DT_CHR) {
 		vp->v_type = VCHR;
 		VI_LOCK(vp);
 		dev_lock();
 		dev_refl(dev);
 		/* XXX: v_rdev should be protected by vnode lock */
 		vp->v_rdev = dev;
 		KASSERT(vp->v_usecount == 1,
 		    ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount));
 		dev->si_usecount += vp->v_usecount;
 		/* Special casing of ttys for deadfs.  Probably redundant. */
 		dsw = dev->si_devsw;
 		if (dsw != NULL && (dsw->d_flags & D_TTY) != 0)
 			vp->v_vflag |= VV_ISTTY;
 		dev_unlock();
 		VI_UNLOCK(vp);
 		if ((dev->si_flags & SI_ETERNAL) != 0)
 			vp->v_vflag |= VV_ETERNALDEV;
 		/* Character devices use the special-file operations vector. */
 		vp->v_op = &devfs_specops;
 	} else if (de->de_dirent->d_type == DT_DIR) {
 		vp->v_type = VDIR;
 	} else if (de->de_dirent->d_type == DT_LNK) {
 		vp->v_type = VLNK;
 	} else {
 		vp->v_type = VBAD;
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS);
 	VN_LOCK_ASHARE(vp);
 	/* Link the dirent and the vnode to each other. */
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = de;
 	de->de_vnode = vp;
 	mtx_unlock(&devfs_de_interlock);
 	error = insmntque1(vp, mp, devfs_insmntque_dtr, de);
 	if (error != 0) {
 		/* devfs_insmntque_dtr was passed above to dispose of vp. */
 		(void) devfs_allocv_drop_refs(1, dmp, de);
 		return (error);
 	}
 	if (devfs_allocv_drop_refs(0, dmp, de)) {
 		vput(vp);
 		return (ENOENT);
 	}
 #ifdef MAC
 	mac_devfs_vnode_associate(mp, de, vp);
 #endif
 	sx_xunlock(&dmp->dm_lock);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * VOP_ACCESS for devfs.
  *
  * Permissions are checked with vaccess() against the mode and
  * ownership stored in the dirent (for a directory, in its de_dir
  * entry).  A denial with EACCES is overridden when the calling
  * process has a controlling terminal and that terminal is this
  * device; any other result from vaccess() is returned unchanged.
  */
 static int
 devfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp;
 	struct devfs_dirent *de;
 	struct proc *p;
 	int error;
 
 	vp = ap->a_vp;
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid,
 	    ap->a_accmode, ap->a_cred, NULL);
 	/* Success and every failure other than EACCES are final. */
 	if (error != EACCES)
 		return (error);
 
 	/* We do, however, allow access to the controlling terminal */
 	p = ap->a_td->td_proc;
 	PROC_LOCK(p);
 	if ((p->p_flag & P_CONTROLT) != 0 &&
 	    p->p_session->s_ttydp == de->de_cdp)
 		error = 0;
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0,
     "devfs-only flag reuse failed");
 
 /*
  * VOP_CLOSE for devfs character devices.
  *
  * First releases the session's reference on the controlling terminal
  * when this close leaves only that reference plus the closing
  * descriptor.  Then calls the driver's d_close() if the vnode is
  * doomed (forced close), if the driver asked for D_TRACKCLOSE, or if
  * this is the last use of the device; otherwise returns 0 without
  * calling the driver.  The vnode lock is dropped around d_close() and
  * re-taken in the same mode afterwards.
  *
  * Returns the result of d_close(), ENXIO if the device switch cannot
  * be referenced, or 0.
  */
 static int
 devfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *oldvp;
 	struct thread *td = ap->a_td;
 	struct proc *p;
 	struct cdev *dev = vp->v_rdev;
 	struct cdevsw *dsw;
 	int dflags, error, ref, vp_locked;
 
 	/*
 	 * XXX: Don't call d_close() if we were called because of
 	 * XXX: insmntque1() failure.
 	 */
 	if (vp->v_data == NULL)
 		return (0);
 
 	/*
 	 * Hack: a tty device that is a controlling terminal
 	 * has a reference from the session structure.
 	 * We cannot easily tell that a character device is
 	 * a controlling terminal, unless it is the closing
 	 * process' controlling terminal.  In that case,
 	 * if the reference count is 2 (this last descriptor
 	 * plus the session), release the reference from the session.
 	 */
 	if (td != NULL) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		if (vp == p->p_session->s_ttyvp) {
 			PROC_UNLOCK(p);
 			oldvp = NULL;
 			sx_xlock(&proctree_lock);
 			/* Re-check now that proctree_lock is held. */
 			if (vp == p->p_session->s_ttyvp) {
 				SESS_LOCK(p->p_session);
 				VI_LOCK(vp);
 				if (count_dev(dev) == 2 &&
 				    (vp->v_iflag & VI_DOOMED) == 0) {
 					p->p_session->s_ttyvp = NULL;
 					p->p_session->s_ttydp = NULL;
 					oldvp = vp;
 				}
 				VI_UNLOCK(vp);
 				SESS_UNLOCK(p->p_session);
 			}
 			sx_xunlock(&proctree_lock);
 			/* Release the session's reference outside the locks. */
 			if (oldvp != NULL)
 				vrele(oldvp);
 		} else
 			PROC_UNLOCK(p);
 	}
 	/*
 	 * We do not want to really close the device if it
 	 * is still in use unless we are trying to close it
 	 * forcibly. Since every use (buffer, vnode, swap, cmap)
 	 * holds a reference to the vnode, and because we mark
 	 * any other vnodes that alias this device, when the
 	 * sum of the reference counts on all the aliased
 	 * vnodes descends to one, we are on last close.
 	 */
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	dflags = 0;
 	VI_LOCK(vp);
 	if (vp->v_iflag & VI_DOOMED) {
 		/* Forced close. */
 		dflags |= FREVOKE | FNONBLOCK;
 	} else if (dsw->d_flags & D_TRACKCLOSE) {
 		/* Keep device updated on status. */
 	} else if (count_dev(dev) > 1) {
 		/* Other users remain: nothing to tell the driver. */
 		VI_UNLOCK(vp);
 		dev_relthread(dev, ref);
 		return (0);
 	}
 	if (count_dev(dev) == 1)
 		dflags |= FLASTCLOSE;
 	/* Hold the vnode across the window in which it is unlocked. */
 	vholdl(vp);
 	VI_UNLOCK(vp);
 	vp_locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev)));
 	error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td);
 	dev_relthread(dev, ref);
 	vn_lock(vp, vp_locked | LK_RETRY);
 	vdrop(vp);
 	return (error);
 }
 
 /*
  * fo_close for devfs files: run the generic vnode close with fp
  * published as the current file (td_fpop), then destroy any private
  * data still attached to fp.  Returns the result of the vnode close.
  */
 static int
 devfs_close_f(struct file *fp, struct thread *td)
 {
 	struct file *saved_fpop;
 	int rv;
 
 	/*
 	 * NB: td may be NULL if this descriptor is closed due to
 	 * garbage collection from a closed UNIX domain socket.
 	 * For that reason curthread, not td, carries td_fpop here.
 	 */
 	saved_fpop = curthread->td_fpop;
 	curthread->td_fpop = fp;
 	rv = vnops.fo_close(fp, td);
 	curthread->td_fpop = saved_fpop;
 
 	/*
 	 * The f_cdevpriv cannot be assigned non-NULL value while we
 	 * are destroying the file.
 	 */
 	if (fp->f_cdevpriv != NULL)
 		devfs_fpdrop(fp);
 
 	return (rv);
 }
 
 /*
  * VOP_GETATTR for devfs.
  *
  * Fills *a_vap from the dirent (for a directory, from its de_dir
  * entry).  Character devices take their timestamps from the struct
  * cdev and report cdp_inode as va_rdev; all other nodes take their
  * timestamps from the dirent.  Timestamps of at most 3600 seconds are
  * replaced by the boot time, and the replacement is stored back.
  *
  * Returns 0, or the error from devfs_populate_vp().
  */
 static int
 devfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	struct cdev *dev;
 	struct timeval boottime;
 	int error;
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	/* devfs_populate_vp() returned with dm_lock held; release it. */
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xunlock(&dmp->dm_lock);
 
 	de = vp->v_data;
 	KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp));
 	if (vp->v_type == VDIR) {
 		de = de->de_dir;
 		KASSERT(de != NULL,
 		    ("Null dir dirent in devfs_getattr vp=%p", vp));
 	}
 	vap->va_uid = de->de_uid;
 	vap->va_gid = de->de_gid;
 	vap->va_mode = de->de_mode;
 	/* Size: link target length, one DEV_BSIZE for directories, else 0. */
 	if (vp->v_type == VLNK)
 		vap->va_size = strlen(de->de_symlink);
 	else if (vp->v_type == VDIR)
 		vap->va_size = vap->va_bytes = DEV_BSIZE;
 	else
 		vap->va_size = 0;
 	if (vp->v_type != VDIR)
 		vap->va_bytes = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_type = vp->v_type;
 
 	getboottime(&boottime);
 /* Replace a timestamp of at most one hour past the epoch by boot time. */
 #define fix(aa)							\
 	do {							\
 		if ((aa).tv_sec <= 3600) {			\
 			(aa).tv_sec = boottime.tv_sec;		\
 			(aa).tv_nsec = boottime.tv_usec * 1000; \
 		}						\
 	} while (0)
 
 	if (vp->v_type != VCHR)  {
 		fix(de->de_atime);
 		vap->va_atime = de->de_atime;
 		fix(de->de_mtime);
 		vap->va_mtime = de->de_mtime;
 		fix(de->de_ctime);
 		vap->va_ctime = de->de_ctime;
 	} else {
 		/* Device nodes keep their times in the struct cdev. */
 		dev = vp->v_rdev;
 		fix(dev->si_atime);
 		vap->va_atime = dev->si_atime;
 		fix(dev->si_mtime);
 		vap->va_mtime = dev->si_mtime;
 		fix(dev->si_ctime);
 		vap->va_ctime = dev->si_ctime;
 
 		vap->va_rdev = cdev2priv(dev)->cdp_inode;
 	}
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_filerev = 0;
 	vap->va_nlink = de->de_links;
 	vap->va_fileid = de->de_inode;
 
 	return (error);
 }
 
 /*
  * fo_ioctl for devfs files: forward to the generic vnode ioctl with fp
  * published as the thread's current file (td_fpop), restoring the
  * previous value afterwards.
  */
 /* ARGSUSED */
 static int
 devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td)
 {
 	struct file *saved_fpop = td->td_fpop;
 	int rv;
 
 	td->td_fpop = fp;
 	rv = vnops.fo_ioctl(fp, com, data, cred, td);
 	td->td_fpop = saved_fpop;
 
 	return (rv);
 }
 
 /*
  * VOP_IOCTL for devfs character devices.
  *
  * FIODTYPE and FIODGNAME are answered here; every other command is
  * passed to the driver's d_ioctl().  A driver result of ENOIOCTL is
  * reported as ENOTTY.  After a successful TIOCSCTTY the session's
  * controlling-terminal vnode and device are switched to this one.
  *
  * Returns ENXIO if the device switch cannot be referenced, EINVAL if
  * the FIODGNAME buffer is too small, or the command's result.
  */
 static int
 devfs_ioctl(struct vop_ioctl_args *ap)
 {
 	struct fiodgname_arg *fgn;
 	struct vnode *vpold, *vp;
 	struct cdevsw *dsw;
 	struct thread *td;
 	struct cdev *dev;
 	int error, ref, i;
 	const char *p;
 	u_long com;
 
 	vp = ap->a_vp;
 	com = ap->a_command;
 	td = ap->a_td;
 
 	dsw = devvn_refthread(vp, &dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(dev)));
 
 	if (com == FIODTYPE) {
 		/* Report the device type bits of the cdevsw flags. */
 		*(int *)ap->a_data = dsw->d_flags & D_TYPEMASK;
 		error = 0;
 		goto out;
 	} else if (com == FIODGNAME) {
 		/* Copy out the device name, including its terminating NUL. */
 		fgn = ap->a_data;
 		p = devtoname(dev);
 		i = strlen(p) + 1;
 		if (i > fgn->len)
 			error = EINVAL;
 		else
 			error = copyout(p, fgn->buf, i);
 		goto out;
 	}
 
 	error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td);
 
 out:
 	dev_relthread(dev, ref);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 
 	if (error == 0 && com == TIOCSCTTY) {
 		/* Do nothing if reassigning same control tty */
 		sx_slock(&proctree_lock);
 		if (td->td_proc->p_session->s_ttyvp == vp) {
 			sx_sunlock(&proctree_lock);
 			return (0);
 		}
 
 		/* Reference the new vnode before publishing it. */
 		vpold = td->td_proc->p_session->s_ttyvp;
 		VREF(vp);
 		SESS_LOCK(td->td_proc->p_session);
 		td->td_proc->p_session->s_ttyvp = vp;
 		td->td_proc->p_session->s_ttydp = cdev2priv(dev);
 		SESS_UNLOCK(td->td_proc->p_session);
 
 		sx_sunlock(&proctree_lock);
 
 		/* Get rid of reference to old control tty */
 		if (vpold)
 			vrele(vpold);
 	}
 	return (error);
 }
 
 /*
  * fo_kqfilter for devfs files: validate fp with devfs_fp_check() and
  * pass the knote to the driver's d_kqfilter().  Returns the error from
  * devfs_fp_check() if the device is gone, otherwise the driver's
  * result.
  */
 /* ARGSUSED */
 static int
 devfs_kqfilter_f(struct file *fp, struct knote *kn)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	int error, ref;
 	struct file *fpop;
 	struct thread *td;
 
 	td = curthread;
 	/* Saved because devfs_fp_check() overwrites td_fpop on success. */
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error)
 		return (error);
 	error = dsw->d_kqfilter(dev, kn);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	return (error);
 }
 
 /*
  * Decide whether thread td may see the device behind dirent de.
  *
  * Entries without a device, and devices without a credential, are
  * always visible (0).  Otherwise the result of prison_check() between
  * the thread's credential and the device's credential is returned,
  * except that a failure is turned into 0 when the device is the
  * calling process's controlling terminal.
  */
 static inline int
 devfs_prison_check(struct devfs_dirent *de, struct thread *td)
 {
 	struct cdev_priv *cdp;
 	struct ucred *dcr;
 	struct proc *p;
 	int error;
 
 	cdp = de->de_cdp;
 	if (cdp == NULL || (dcr = cdp->cdp_c.si_cred) == NULL)
 		return (0);
 
 	error = prison_check(td->td_ucred, dcr);
 	if (error == 0)
 		return (0);
 
 	/* We do, however, allow access to the controlling terminal */
 	p = td->td_proc;
 	PROC_LOCK(p);
 	if ((p->p_flag & P_CONTROLT) != 0 && p->p_session->s_ttydp == cdp)
 		error = 0;
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Worker for devfs_lookup(): look up one path component in directory
  * a_dvp and return the resulting vnode in *a_vpp.
  *
  * Entered with dmp->dm_lock held.  *dm_unlock is set to 0 on every
  * path where dm_lock has already been released before returning; it
  * is left untouched when the caller still has to release the lock.
  *
  * When no entry exists, the dev_clone event handlers are given a
  * chance to create the device on demand.
  */
 static int
 devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct thread *td;
 	struct devfs_dirent *de, *dd;
 	struct devfs_dirent **dde;
 	struct devfs_mount *dmp;
 	struct mount *mp;
 	struct cdev *cdev;
 	int error, flags, nameiop, dvplocked;
 	char specname[SPECNAMELEN + 1], *pname;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	mp = dvp->v_mount;
 	dmp = VFSTODEVFS(mp);
 	dd = dvp->v_data;
 	*vpp = NULLVP;
 
 	/* Renaming is refused outright. */
 	if ((flags & ISLASTCN) && nameiop == RENAME)
 		return (EOPNOTSUPP);
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT))
 		return (EIO);
 
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td);
 	if (error)
 		return (error);
 
 	/* "." resolves to the directory itself, with a new reference. */
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		de = devfs_parent_dirent(dd);
 		if (de == NULL)
 			return (ENOENT);
 		/* The child is unlocked while the parent vnode is obtained. */
 		dvplocked = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp, 0);
 		error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK,
 		    vpp);
 		/* devfs_allocv() released dm_lock. */
 		*dm_unlock = 0;
 		vn_lock(dvp, dvplocked | LK_RETRY);
 		return (error);
 	}
 
 	dd = dvp->v_data;
 	de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0);
 	while (de == NULL) {	/* While(...) so we can use break */
 
 		if (nameiop == DELETE)
 			return (ENOENT);
 
 		/*
 		 * OK, we didn't have an entry for the name we were asked for
 		 * so we try to see if anybody can create it on demand.
 		 */
 		pname = devfs_fqpn(specname, dmp, dd, cnp);
 		if (pname == NULL)
 			break;
 
 		cdev = NULL;
 		/* Hold the mount while dm_lock is dropped for the handlers. */
 		DEVFS_DMP_HOLD(dmp);
 		sx_xunlock(&dmp->dm_lock);
 		sx_slock(&clone_drain_lock);
 		EVENTHANDLER_INVOKE(dev_clone,
 		    td->td_ucred, pname, strlen(pname), &cdev);
 		sx_sunlock(&clone_drain_lock);
 
 		if (cdev == NULL)
 			sx_xlock(&dmp->dm_lock);
 		else if (devfs_populate_vp(dvp) != 0) {
 			/*
 			 * Populating failed with dm_lock released.  Re-take
 			 * it only to drop the mount hold taken above, then
 			 * give up on the cloned device.
 			 */
 			*dm_unlock = 0;
 			sx_xlock(&dmp->dm_lock);
 			if (DEVFS_DMP_DROP(dmp)) {
 				sx_xunlock(&dmp->dm_lock);
 				devfs_unmount_final(dmp);
 			} else
 				sx_xunlock(&dmp->dm_lock);
 			dev_rel(cdev);
 			return (ENOENT);
 		}
 		/* dm_lock is held again here on both remaining paths. */
 		if (DEVFS_DMP_DROP(dmp)) {
 			*dm_unlock = 0;
 			sx_xunlock(&dmp->dm_lock);
 			devfs_unmount_final(dmp);
 			if (cdev != NULL)
 				dev_rel(cdev);
 			return (ENOENT);
 		}
 
 		if (cdev == NULL)
 			break;
 
 		/* Pick up the dirent the new device has on this mount. */
 		dev_lock();
 		dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx];
 		if (dde != NULL && *dde != NULL)
 			de = *dde;
 		dev_unlock();
 		dev_rel(cdev);
 		break;
 	}
 
 	if (de == NULL || de->de_flags & DE_WHITEOUT) {
 		/* Let a create or rename of the last component proceed. */
 		if ((nameiop == CREATE || nameiop == RENAME) &&
 		    (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) {
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 		return (ENOENT);
 	}
 
 	/* Entries hidden by the prison check are reported as absent. */
 	if (devfs_prison_check(de, td))
 		return (ENOENT);
 
 	if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		/*
 		 * NOTE(review): *vpp was set to NULLVP at the top and every
 		 * later assignment above returns, so this branch does not
 		 * look reachable -- confirm.
 		 */
 		if (*vpp == dvp) {
 			VREF(dvp);
 			*vpp = dvp;
 			return (0);
 		}
 	}
 	error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp);
 	/* devfs_allocv() released dm_lock. */
 	*dm_unlock = 0;
 	return (error);
 }
 
 static int
 devfs_lookup(struct vop_lookup_args *ap)
 {
 	int j;
 	struct devfs_mount *dmp;
 	int dm_unlock;
 
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOTDIR);
 
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	dm_unlock = 1;
 	j = devfs_lookupx(ap, &dm_unlock);
 	if (dm_unlock == 1)
 		sx_xunlock(&dmp->dm_lock);
 	return (j);
 }
 
 /*
  * VOP_MKNOD for devfs.
  *
  * This does not create new device nodes.  It only brings back an
  * existing entry of the requested name that is currently whited out,
  * by clearing its DE_WHITEOUT flag and returning its vnode in *a_vpp.
  *
  * Returns EOPNOTSUPP for anything but a character device, ENOENT when
  * no whited-out entry of that name exists, or the result of
  * devfs_allocv().
  */
 static int
 devfs_mknod(struct vop_mknod_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct devfs_dirent *dd, *de;
 	struct devfs_mount *dmp;
 	int error;
 
 	/*
 	 * The only type of node we should be creating here is a
 	 * character device, for anything else return EOPNOTSUPP.
 	 */
 	if (ap->a_vap->va_type != VCHR)
 		return (EOPNOTSUPP);
 	dvp = ap->a_dvp;
 	dmp = VFSTODEVFS(dvp->v_mount);
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dd = dvp->v_data;
 
 	error = ENOENT;
 	sx_xlock(&dmp->dm_lock);
 	TAILQ_FOREACH(de, &dd->de_dlist, de_list) {
 		if (cnp->cn_namelen != de->de_dirent->d_namlen)
 			continue;
 		/* Skip character devices that are not active. */
 		if (de->de_dirent->d_type == DT_CHR &&
 		    (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0)
 			continue;
 		if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name,
 		    de->de_dirent->d_namlen) != 0)
 			continue;
 		/* Name matches: usable only if it is a whiteout. */
 		if (de->de_flags & DE_WHITEOUT)
 			break;
 		goto notfound;
 	}
 	if (de == NULL)
 		goto notfound;
 	de->de_flags &= ~DE_WHITEOUT;
 	/* devfs_allocv() releases dm_lock. */
 	error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp);
 	return (error);
 notfound:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /*
  * VOP_OPEN for devfs character devices.
  *
  * Calls the driver's d_fdopen() when it has one, otherwise d_open(),
  * with the vnode unlocked and a_fp published as the thread's current
  * file so that the driver can attach private data.  On success with a
  * non-NULL a_fp, the file is switched to the devfs file operations
  * (if it still has badfileops) and flagged FDEVFS_VNODE.
  *
  * Returns ENXIO for block devices, for a vnode without a device, when
  * the device switch cannot be referenced, or when the driver requires
  * d_fdopen() but no file was supplied.  A driver result of ERESTART is
  * reported as EINTR.
  */
 /* ARGSUSED */
 static int
 devfs_open(struct vop_open_args *ap)
 {
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct cdev *dev = vp->v_rdev;
 	struct file *fp = ap->a_fp;
 	int error, ref, vlocked;
 	struct cdevsw *dsw;
 	struct file *fpop;
 	struct mtx *mtxp;
 
 	if (vp->v_type == VBLK)
 		return (ENXIO);
 
 	if (dev == NULL)
 		return (ENXIO);
 
 	/* Make this field valid before any I/O in d_open. */
 	if (dev->si_iosize_max == 0)
 		dev->si_iosize_max = DFLTPHYS;
 
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	if (fp == NULL && dsw->d_fdopen != NULL) {
 		dev_relthread(dev, ref);
 		return (ENXIO);
 	}
 
 	/* The driver's open routine is called with the vnode unlocked. */
 	vlocked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 
 	fpop = td->td_fpop;
 	td->td_fpop = fp;
 	if (fp != NULL) {
 		fp->f_data = dev;
 		fp->f_vnode = vp;
 	}
 	if (dsw->d_fdopen != NULL)
 		error = dsw->d_fdopen(dev, ap->a_mode, td, fp);
 	else
 		error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 	/* Clean up any cdevpriv upon error. */
 	if (error != 0)
 		devfs_clear_cdevpriv();
 	td->td_fpop = fpop;
 
 	vn_lock(vp, vlocked | LK_RETRY);
 	dev_relthread(dev, ref);
 	if (error != 0) {
 		if (error == ERESTART)
 			error = EINTR;
 		return (error);
 	}
 
 #if 0	/* /dev/console */
 	KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp"));
 #else
 	if (fp == NULL)
 		return (error);
 #endif
 	/* Install the devfs file operations unless others were set already. */
 	if (fp->f_ops == &badfileops)
 		finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f);
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 
 	/*
 	 * Hint to the dofilewrite() to not force the buffer draining
 	 * on the writer to the file.  Most likely, the write would
 	 * not need normal buffers.
 	 */
 	mtx_lock(mtxp);
 	fp->f_vnread_flags |= FDEVFS_VNODE;
 	mtx_unlock(mtxp);
 	return (error);
 }
 
 /*
  * VOP_PATHCONF for devfs.
  *
  * Answers the limits devfs defines itself and passes every other name
  * to vop_stdpathconf().  The terminal limits (_PC_MAX_CANON,
  * _PC_MAX_INPUT, _PC_VDISABLE) are reported only for vnodes flagged
  * VV_ISTTY and yield EINVAL otherwise.
  */
 static int
 devfs_pathconf(struct vop_pathconf_args *ap)
 {
 	int name;
 
 	name = ap->a_name;
 
 	/* Terminal limits share a single VV_ISTTY check. */
 	if (name == _PC_MAX_CANON || name == _PC_MAX_INPUT ||
 	    name == _PC_VDISABLE) {
 		if ((ap->a_vp->v_vflag & VV_ISTTY) == 0)
 			return (EINVAL);
 		if (name == _PC_MAX_CANON)
 			*ap->a_retval = MAX_CANON;
 		else if (name == _PC_MAX_INPUT)
 			*ap->a_retval = MAX_INPUT;
 		else
 			*ap->a_retval = _POSIX_VDISABLE;
 		return (0);
 	}
 
 	switch (name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		break;
 	case _PC_LINK_MAX:
 		*ap->a_retval = INT_MAX;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		break;
 	case _PC_MAC_PRESENT:
 #ifdef MAC
 		/*
 		 * If MAC is enabled, devfs automatically supports
 		 * trivial non-persistent label storage.
 		 */
 		*ap->a_retval = 1;
 #else
 		*ap->a_retval = 0;
 #endif
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		break;
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	return (0);
 }
 
 /*
  * fo_poll for devfs files: poll the device through its d_poll()
  * routine.  When the file no longer passes devfs_fp_check(), the
  * request is handed to the generic vnode poll instead.
  */
 /* ARGSUSED */
 static int
 devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td)
 {
 	struct cdevsw *dsw;
 	struct cdev *dev;
 	struct file *saved_fpop;
 	int ref, revents;
 
 	/* devfs_fp_check() replaces td_fpop on success; keep the old one. */
 	saved_fpop = td->td_fpop;
 	if (devfs_fp_check(fp, &dev, &dsw, &ref) != 0)
 		return (vnops.fo_poll(fp, events, cred, td));
 
 	revents = dsw->d_poll(dev, events, td);
 	td->td_fpop = saved_fpop;
 	dev_relthread(dev, ref);
 	return (revents);
 }
 
 /*
  * VOP_PRINT for special device vnodes: print the name of the device
  * attached to the vnode.  Always returns 0.
  */
 static int
 devfs_print(struct vop_print_args *ap)
 {
 	struct cdev *dev = ap->a_vp->v_rdev;
 
 	printf("\tdev %s\n", devtoname(dev));
 	return (0);
 }
 
 /*
  * fo_read for devfs files: read from the device through its d_read()
  * routine.
  *
  * Requests larger than DEVFS_IOSIZE_MAX are rejected with EINVAL.
  * When the file no longer passes devfs_fp_check(), the request is
  * handed to the generic vnode read instead.  The device's access time
  * is updated when data was transferred, or when a non-empty request
  * completed without error.
  */
 static int
 devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct cdev *dev;
 	int ioflag, error, ref;
 	ssize_t resid;
 	struct cdevsw *dsw;
 	struct file *fpop;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	/* Saved because devfs_fp_check() overwrites td_fpop on success. */
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0) {
 		error = vnops.fo_read(fp, uio, cred, flags, td);
 		return (error);
 	}
 	/* Remember the request size to detect whether anything was read. */
 	resid = uio->uio_resid;
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 	error = dsw->d_read(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0))
 		devfs_timestamp(&dev->si_atime);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (error);
 }
 
 /*
  * VOP_READDIR for devfs.
  *
  * Walks the directory's entry list from the start on every call,
  * counting record lengths to find the entries at or beyond
  * uio_offset, and hands those to vfs_read_dirent().  Covered and
  * whited-out entries, and entries rejected by devfs_prison_check(),
  * are skipped and do not advance the offset.  The walk stops at the
  * first entry that no longer fits in the remaining buffer space.
  *
  * Returns ENOTDIR for a non-directory, EINVAL for a negative offset,
  * EIO when the directory cannot be populated, or the error from
  * vfs_read_dirent().
  */
 static int
 devfs_readdir(struct vop_readdir_args *ap)
 {
 	int error;
 	struct uio *uio;
 	struct dirent *dp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	off_t off;
 	int *tmp_ncookies = NULL;
 
 	if (ap->a_vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	uio = ap->a_uio;
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	/*
 	 * XXX: This is a temporary hack to get around this filesystem not
 	 * supporting cookies. We store the location of the ncookies pointer
 	 * in a temporary variable before calling vfs_subr.c:vfs_read_dirent()
 	 * and set the number of cookies to 0. We then set the pointer to
 	 * NULL so that vfs_read_dirent doesn't try to call realloc() on 
 	 * ap->a_cookies. Later in this function, we restore the ap->a_ncookies
 	 * pointer to its original location before returning to the caller.
 	 */
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
 	}
 
 	dmp = VFSTODEVFS(ap->a_vp->v_mount);
 	/* On success this returns with dm_lock held; released after the loop. */
 	if (devfs_populate_vp(ap->a_vp) != 0) {
 		if (tmp_ncookies != NULL)
 			ap->a_ncookies = tmp_ncookies;
 		return (EIO);
 	}
 	error = 0;
 	de = ap->a_vp->v_data;
 	off = 0;
 	TAILQ_FOREACH(dd, &de->de_dlist, de_list) {
 		KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__));
 		if (dd->de_flags & (DE_COVERED | DE_WHITEOUT))
 			continue;
 		if (devfs_prison_check(dd, uio->uio_td))
 			continue;
 		/* For a subdirectory the inode number comes from its de_dir. */
 		if (dd->de_dirent->d_type == DT_DIR)
 			de = dd->de_dir;
 		else
 			de = dd;
 		dp = dd->de_dirent;
 		MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp));
 		if (dp->d_reclen > uio->uio_resid)
 			break;
 		dp->d_fileno = de->de_inode;
+		/* NOTE: d_off is the offset for the *next* entry. */
+		dp->d_off = off + dp->d_reclen;
 		/* Entries before the requested offset are only counted. */
 		if (off >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, dp, off);
 			if (error)
 				break;
 		}
 		off += dp->d_reclen;
 	}
 	sx_xunlock(&dmp->dm_lock);
 	uio->uio_offset = off;
 
 	/*
 	 * Restore ap->a_ncookies if it wasn't originally NULL in the first
 	 * place.
 	 */
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 /*
  * VOP_READLINK: copy the symlink target string stored in the vnode's
  * devfs_dirent to the caller's uio.  strlen() bytes are moved, so the
  * terminating NUL is not included.
  */
 static int
 devfs_readlink(struct vop_readlink_args *ap)
 {
 	struct devfs_dirent *link = ap->a_vp->v_data;
 	char *target = link->de_symlink;
 
 	return (uiomove(target, strlen(target), ap->a_uio));
 }
 
 /*
  * VOP_RECLAIM: break the link between the vnode and its devfs_dirent (if
  * it still has one) while holding devfs_de_interlock, then destroy the
  * vnode's VM object.  Always returns 0.
  */
 static int
 devfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *dirent;
 
 	mtx_lock(&devfs_de_interlock);
 	if ((dirent = vp->v_data) != NULL) {
 		/* Clear both directions of the vnode <-> dirent linkage. */
 		dirent->de_vnode = NULL;
 		vp->v_data = NULL;
 	}
 	mtx_unlock(&devfs_de_interlock);
 
 	vnode_destroy_vobject(vp);
 	return (0);
 }
 
 /*
  * VOP_RECLAIM for VCHR vnodes: perform the common devfs_reclaim() work,
  * then detach the vnode from its cdev.  The vnode's v_usecount is
  * subtracted from the device's si_usecount, and dev_rel() is called on the
  * device once the locks have been dropped.  Always returns 0.
  */
 static int
 devfs_reclaim_vchr(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct cdev *dev;
 
 	vp = ap->a_vp;
 	MPASS(vp->v_type == VCHR);
 
 	devfs_reclaim(ap);
 
 	/* v_rdev and si_usecount are updated under both VI_LOCK and dev_lock. */
 	VI_LOCK(vp);
 	dev_lock();
 	dev = vp->v_rdev;
 	vp->v_rdev = NULL;
 	if (dev != NULL)
 		dev->si_usecount -= vp->v_usecount;
 	dev_unlock();
 	VI_UNLOCK(vp);
 	/* dev_rel() is called only after both locks are released. */
 	if (dev != NULL)
 		dev_rel(dev);
 	return (0);
 }
 
 /*
  * VOP_REMOVE for devfs.
  *
  * An entry with no cdev behind it (de_cdp == NULL) is unlinked from the
  * parent's de_dlist and destroyed with devfs_delete(); if it was a symlink,
  * a same-named entry found by devfs_find() has its DE_COVERED flag cleared.
  * An entry that does have a cdev is only flagged DE_WHITEOUT.
  *
  * Both dvp and vp must be exclusively locked on entry and are exclusively
  * locked again on return.  Always returns 0.
  */
 static int
 devfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered;
 	struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount);
 
 	ASSERT_VOP_ELOCKED(dvp, "devfs_remove");
 	ASSERT_VOP_ELOCKED(vp, "devfs_remove");
 
 	sx_xlock(&dmp->dm_lock);
 	dd = ap->a_dvp->v_data;
 	de = vp->v_data;
 	if (de->de_cdp == NULL) {
 		TAILQ_REMOVE(&dd->de_dlist, de, de_list);
 		if (de->de_dirent->d_type == DT_LNK) {
 			/* Uncover whatever entry this symlink was hiding. */
 			de_covered = devfs_find(dd, de->de_dirent->d_name,
 			    de->de_dirent->d_namlen, 0);
 			if (de_covered != NULL)
 				de_covered->de_flags &= ~DE_COVERED;
 		}
 		/* We need to unlock dvp because devfs_delete() may lock it. */
 		VOP_UNLOCK(vp, 0);
 		if (dvp != vp)
 			VOP_UNLOCK(dvp, 0);
 		devfs_delete(dmp, de, 0);
 		sx_xunlock(&dmp->dm_lock);
 		/* Re-take the vnode locks the caller expects to still hold. */
 		if (dvp != vp)
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	} else {
 		/* Device-backed entry: hide it rather than destroying it. */
 		de->de_flags |= DE_WHITEOUT;
 		sx_xunlock(&dmp->dm_lock);
 	}
 	return (0);
 }
 
 /*
  * Revoke is called on a tty when a terminal session ends.  The vnode
  * is orphaned by setting v_op to deadfs so we need to let go of it
  * as well so that we create a new one next time around.
  *
  * After vgone()ing the vnode passed in, every other vnode reachable
  * through the device's cdp_dirents[] array is vgone()d as well.  The
  * vnode is locked on entry and is relocked exclusively before return.
  * Always returns 0.
  */
 static int
 devfs_revoke(struct vop_revoke_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *vp2;
 	struct cdev *dev;
 	struct cdev_priv *cdp;
 	struct devfs_dirent *de;
 	u_int i;
 
 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL"));
 
 	dev = vp->v_rdev;
 	cdp = cdev2priv(dev);
  
 	/* Bump cdp_inuse for the duration; it is dropped again below. */
 	dev_lock();
 	cdp->cdp_inuse++;
 	dev_unlock();
 
 	vhold(vp);
 	vgone(vp);
 	vdrop(vp);
 
 	VOP_UNLOCK(vp,0);
  loop:
 	for (;;) {
 		mtx_lock(&devfs_de_interlock);
 		dev_lock();
 		vp2 = NULL;
 		/* Find the first dirent of this device that still has a vnode. */
 		for (i = 0; i <= cdp->cdp_maxdirent; i++) {
 			de = cdp->cdp_dirents[i];
 			if (de == NULL)
 				continue;
 
 			vp2 = de->de_vnode;
 			if (vp2 != NULL) {
 				/*
 				 * Take the vnode interlock before dropping
 				 * devfs_de_interlock, then hand the interlock
 				 * to vget().  If vget() fails, restart the
 				 * whole scan with no locks held.
 				 */
 				dev_unlock();
 				VI_LOCK(vp2);
 				mtx_unlock(&devfs_de_interlock);
 				if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK,
 				    curthread))
 					goto loop;
 				vhold(vp2);
 				vgone(vp2);
 				vdrop(vp2);
 				vput(vp2);
 				break;
 			} 
 		}
 		/* A vnode was revoked (locks already dropped): rescan. */
 		if (vp2 != NULL) {
 			continue;
 		}
 		/* Full pass found no vnode: release the locks and finish. */
 		dev_unlock();
 		mtx_unlock(&devfs_de_interlock);
 		break;
 	}
 	dev_lock();
 	cdp->cdp_inuse--;
 	/*
 	 * If the device is no longer active and this was the last user,
 	 * take it off cdevp_list and call dev_rel() on it.
 	 */
 	if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) {
 		TAILQ_REMOVE(&cdevp_list, cdp, cdp_list);
 		dev_unlock();
 		dev_rel(&cdp->cdp_c);
 	} else
 		dev_unlock();
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	return (0);
 }
 
 /*
  * VOP_IOCTL for non-device devfs vnodes: forwards the request to
  * devfs_rules_ioctl() with the mount's dm_lock held exclusively.
  *
  * Returns EBADF if the vnode is doomed, ENOENT if the hold taken on the
  * mount turned out to be the last one (in which case
  * devfs_unmount_final() is called), otherwise the devfs_rules_ioctl()
  * result.
  */
 static int
 devfs_rioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp;
 	struct devfs_mount *dmp;
 	int error;
 
 	vp = ap->a_vp;
 	/* The vnode lock is held only long enough to reach the mount. */
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	if (vp->v_iflag & VI_DOOMED) {
 		VOP_UNLOCK(vp, 0);
 		return (EBADF);
 	}
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xlock(&dmp->dm_lock);
 	VOP_UNLOCK(vp, 0);
 	/* Hold the mount across devfs_populate(). */
 	DEVFS_DMP_HOLD(dmp);
 	devfs_populate(dmp);
 	if (DEVFS_DMP_DROP(dmp)) {
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ENOENT);
 	}
 	error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td);
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 static int
 devfs_rread(struct vop_read_args *ap)
 {
 
 	if (ap->a_vp->v_type != VDIR)
 		return (EINVAL);
 	return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL));
 }
 
 /*
  * VOP_SETATTR for devfs: change owner/group, mode and access/modification
  * times of a devfs node.
  *
  * Returns EINVAL if the caller tries to set any attribute other than
  * those (va_flags may be 0), the devfs_populate_vp() error if that fails,
  * a priv_check()/vn_utimes_perm() error if the credential is not allowed
  * to make the change, and 0 otherwise.
  */
 static int
 devfs_setattr(struct vop_setattr_args *ap)
 {
 	struct devfs_dirent *de;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct thread *td;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	td = curthread;
 	/* Reject requests that touch attributes which cannot be set here. */
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	/*
 	 * On success the mount's dm_lock is released at 'ret' below, so
 	 * devfs_populate_vp() presumably returns with it held (and without
 	 * it on failure) -- NOTE(review): confirm.
 	 */
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	de = vp->v_data;
 	/* For directories the attributes live in the de_dir dirent. */
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	/* 'c' records whether anything was actually changed. */
 	c = 0;
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = de->de_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = de->de_gid;
 	else
 		gid = vap->va_gid;
 	if (uid != de->de_uid || gid != de->de_gid) {
 		/*
 		 * PRIV_VFS_CHOWN is required unless the caller owns the node,
 		 * leaves the owner unchanged, and either keeps the group or
 		 * moves it to a group the caller belongs to.
 		 */
 		if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid ||
 		    (gid != de->de_gid && !groupmember(gid, ap->a_cred))) {
 			error = priv_check(td, PRIV_VFS_CHOWN);
 			if (error != 0)
 				goto ret;
 		}
 		de->de_uid = uid;
 		de->de_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		/* Non-owners need PRIV_VFS_ADMIN to change the mode. */
 		if (ap->a_cred->cr_uid != de->de_uid) {
 			error = priv_check(td, PRIV_VFS_ADMIN);
 			if (error != 0)
 				goto ret;
 		}
 		de->de_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		error = vn_utimes_perm(vp, vap, ap->a_cred, td);
 		if (error != 0)
 			goto ret;
 		/* Character devices keep their times on the cdev itself. */
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_atime = vap->va_atime;
 			else
 				de->de_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_mtime = vap->va_mtime;
 			else
 				de->de_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 
 	/*
 	 * Something changed: bump a timestamp.
 	 * NOTE(review): the VCHR branch updates the change time, but the
 	 * non-VCHR branch writes de_mtime, which overwrites an mtime that
 	 * was explicitly set just above.  This looks like it was meant to
 	 * be the dirent's change time -- confirm before altering.
 	 */
 	if (c) {
 		if (vp->v_type == VCHR)
 			vfs_timestamp(&vp->v_rdev->si_ctime);
 		else
 			vfs_timestamp(&de->de_mtime);
 	}
 
 ret:
 	sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock);
 	return (error);
 }
 
 #ifdef MAC
 /*
  * VOP_SETLABEL: apply the new label to the vnode with mac_vnode_relabel()
  * and then call mac_devfs_update() for the vnode's devfs_dirent.
  * Always returns 0.
  */
 static int
 devfs_setlabel(struct vop_setlabel_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *de = vp->v_data;
 
 	mac_vnode_relabel(ap->a_cred, vp, ap->a_label);
 	mac_devfs_update(vp->v_mount, de, vp);
 	return (0);
 }
 #endif
 
 static int
 devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td)
 {
 
 	return (vnops.fo_stat(fp, sb, cred, td));
 }
 
 /*
  * VOP_SYMLINK for devfs: create a user-made (DE_USER) symlink entry in the
  * directory and return a vnode for it through ap->a_vpp.
  *
  * Returns the priv_check(PRIV_DEVFS_SYMLINK) error if the caller lacks the
  * privilege, ENOENT if devfs_populate_vp() fails, EEXIST if a user-created
  * entry with the same name already exists, otherwise the devfs_allocv()
  * result.  A non-user entry with the same name is marked DE_COVERED.
  */
 static int
 devfs_symlink(struct vop_symlink_args *ap)
 {
 	int i, error;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered, *de_dotdot;
 	struct devfs_mount *dmp;
 
 	error = priv_check(curthread, PRIV_DEVFS_SYMLINK);
 	if (error)
 		return(error);
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOENT);
 
 	dd = ap->a_dvp->v_data;
 	de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen);
 	de->de_flags = DE_USER;
 	de->de_uid = 0;
 	de->de_gid = 0;
 	de->de_mode = 0755;
 	de->de_inode = alloc_unr(devfs_inos);
 	de->de_dir = dd;
 	de->de_dirent->d_type = DT_LNK;
 	/* Store a private copy of the target, including its NUL. */
 	i = strlen(ap->a_target) + 1;
 	de->de_symlink = malloc(i, M_DEVFS, M_WAITOK);
 	bcopy(ap->a_target, de->de_symlink, i);
 #ifdef MAC
 	mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de);
 #endif
 	de_covered = devfs_find(dd, de->de_dirent->d_name,
 	    de->de_dirent->d_namlen, 0);
 	if (de_covered != NULL) {
 		/* A user-created entry may not be shadowed: back out. */
 		if ((de_covered->de_flags & DE_USER) != 0) {
 			devfs_delete(dmp, de, DEVFS_DEL_NORECURSE);
 			sx_xunlock(&dmp->dm_lock);
 			return (EEXIST);
 		}
 		KASSERT((de_covered->de_flags & DE_COVERED) == 0,
 		    ("devfs_symlink: entry %p already covered", de_covered));
 		de_covered->de_flags |= DE_COVERED;
 	}
 
 	/* Insert the new entry right after ".." in the directory list. */
 	de_dotdot = TAILQ_FIRST(&dd->de_dlist);		/* "." */
 	de_dotdot = TAILQ_NEXT(de_dotdot, de_list);	/* ".." */
 	TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list);
 	devfs_dir_ref_de(dmp, dd);
 	devfs_rules_apply(dmp, de);
 
 	/*
 	 * dm_lock is not released here on the success path; presumably
 	 * devfs_allocv() drops it (NOTE(review): confirm).
 	 */
 	return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp));
 }
 
 /*
  * fo_truncate for devfs files: delegates unchanged to the generic vnode
  * fileops (vnops.fo_truncate).
  */
 static int
 devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td)
 {
 	int error;
 
 	error = vnops.fo_truncate(fp, length, cred, td);
 	return (error);
 }
 
 /*
  * fo_write for devfs files: pass the write to the driver's d_write entry.
  *
  * Returns EINVAL if the request exceeds DEVFS_IOSIZE_MAX.  If
  * devfs_fp_check() fails the write is handed to the generic vnode fileops
  * instead.  Otherwise returns the driver's d_write() result.  The device's
  * ctime/mtime are refreshed when data was transferred, or when a non-empty
  * write completed without error.
  */
 static int
 devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct cdev *dev;
 	int error, ioflag, ref;
 	ssize_t resid;
 	struct cdevsw *dsw;
 	struct file *fpop;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	/* Saved so td_fpop can be restored after the driver call. */
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0) {
 		error = vnops.fo_write(fp, uio, cred, flags, td);
 		return (error);
 	}
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));
 	/* O_* bits are passed through; O_DIRECT additionally sets IO_DIRECT. */
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 
 	/* Remember the request size to detect whether anything moved. */
 	resid = uio->uio_resid;
 
 	error = dsw->d_write(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0)) {
 		devfs_timestamp(&dev->si_ctime);
 		dev->si_mtime = dev->si_ctime;
 	}
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (error);
 }
 
 /*
  * fo_mmap for devfs files: map a character device into 'map'.
  *
  * Computes the maximum protection allowed by the mount flags, the file's
  * open mode and cap_maxprot, asks vm_mmap_cdev() for the backing VM
  * object, and installs it with vm_mmap_object().
  *
  * Returns EACCES when the requested protection is not permitted (execute
  * on a MNT_NOEXEC mount, read without FREAD, shared write without FWRITE),
  * or the error from devfs_fp_check(), vm_mmap_cdev() or vm_mmap_object().
  */
 static int
 devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	struct mount *mp;
 	struct vnode *vp;
 	struct file *fpop;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	int error, ref;
 
 	vp = fp->f_vnode;
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.
 	 */
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		maxprot = VM_PROT_NONE;
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 	} else
 		maxprot = VM_PROT_EXECUTE;
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_READ;
 	else if ((prot & VM_PROT_READ) != 0)
 		return (EACCES);
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 *
 	 * Note that most character devices always share mappings.
 	 * The one exception is that D_MMAP_ANON devices
 	 * (i.e. /dev/zero) permit private writable mappings.
 	 *
 	 * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests
 	 * as well as updating maxprot to permit writing for
 	 * D_MMAP_ANON devices rather than doing that here.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) != 0)
 			maxprot |= VM_PROT_WRITE;
 		else if ((prot & VM_PROT_WRITE) != 0)
 			return (EACCES);
 	}
 	/* Never grant more than the capability rights allow. */
 	maxprot &= cap_maxprot;
 
 	/* Saved so td_fpop can be restored after the device call. */
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0)
 		return (error);
 
 	error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff,
 	    &object);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	if (error != 0)
 		return (error);
 
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, FALSE, td);
 	/* On failure, release the object obtained from vm_mmap_cdev(). */
 	if (error != 0)
 		vm_object_deallocate(object);
 	return (error);
 }
 
 /*
  * Translate a cdev pointer into the dev_t reported to userland: the
  * cdp_inode of the device's private data, or NODEV for a NULL device.
  */
 dev_t
 dev2udev(struct cdev *x)
 {
 	if (x != NULL)
 		return (cdev2priv(x)->cdp_inode);
 	return (NODEV);
 }
 
 /*
  * File operations for devfs files.  Read, write, truncate, ioctl, poll,
  * kqfilter, stat, close and mmap use the devfs_*_f handlers; the remaining
  * operations use the generic vn_* implementations.
  */
 static struct fileops devfs_ops_f = {
 	.fo_read =	devfs_read_f,
 	.fo_write =	devfs_write_f,
 	.fo_truncate =	devfs_truncate_f,
 	.fo_ioctl =	devfs_ioctl_f,
 	.fo_poll =	devfs_poll_f,
 	.fo_kqfilter =	devfs_kqfilter_f,
 	.fo_stat =	devfs_stat_f,
 	.fo_close =	devfs_close_f,
 	.fo_chmod =	vn_chmod,
 	.fo_chown =	vn_chown,
 	.fo_sendfile =	vn_sendfile,
 	.fo_seek =	vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_mmap =	devfs_mmap_f,
 	.fo_flags =	DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 /*
  * Vops for non-CHR vnodes in /dev.
  * Operations not listed here fall through to default_vnodeops.
  */
 static struct vop_vector devfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_rioctl,
 	.vop_lookup =		devfs_lookup,
 	.vop_mknod =		devfs_mknod,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_read =		devfs_rread,
 	.vop_readdir =		devfs_readdir,
 	.vop_readlink =		devfs_readlink,
 	.vop_reclaim =		devfs_reclaim,
 	.vop_remove =		devfs_remove,
 	.vop_revoke =		devfs_revoke,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_symlink =		devfs_symlink,
 	.vop_vptocnp =		devfs_vptocnp,
 };
 
 /*
  * Vops for VCHR vnodes in /dev.
  * Operations that make no sense on a character device are wired to
  * VOP_PANIC; vnode-level read, write and poll are wired to the dead_*
  * handlers.  Anything not listed falls through to default_vnodeops.
  */
 static struct vop_vector devfs_specops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_bmap =		VOP_PANIC,
 	.vop_close =		devfs_close,
 	.vop_create =		VOP_PANIC,
 	.vop_fsync =		vop_stdfsync,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_ioctl,
 	.vop_link =		VOP_PANIC,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		devfs_open,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_poll =		dead_poll,
 	.vop_print =		devfs_print,
 	.vop_read =		dead_read,
 	.vop_readdir =		VOP_PANIC,
 	.vop_readlink =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_reclaim =		devfs_reclaim_vchr,
 	.vop_remove =		devfs_remove,
 	.vop_rename =		VOP_PANIC,
 	.vop_revoke =		devfs_revoke,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_strategy =		VOP_PANIC,
 	.vop_symlink =		VOP_PANIC,
 	.vop_vptocnp =		devfs_vptocnp,
 	.vop_write =		dead_write,
 };
 
 /*
  * Our calling convention to the device drivers used to be that we passed
  * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_*
  * flags instead since that's what open(), close() and ioctl() take and
  * we don't really want vnode.h in device drivers.
  * We solved the source compatibility by redefining some vnode flags to
  * be the same as the fcntl ones and by sending down the bitwise OR of
  * the respective fcntl/vnode flags.  These CTASSERTs make sure nobody
  * pulls the rug out from under this.
  */
 CTASSERT(O_NONBLOCK == IO_NDELAY);
 CTASSERT(O_FSYNC == IO_SYNC);
Index: stable/12/sys/fs/ext2fs/ext2_lookup.c
===================================================================
--- stable/12/sys/fs/ext2fs/ext2_lookup.c	(revision 340720)
+++ stable/12/sys/fs/ext2fs/ext2_lookup.c	(revision 340721)
@@ -1,1281 +1,1283 @@
 /*-
  *  modified for Lites 1.1
  *
  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
  *  University of Utah, Department of Computer Science
  */
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_lookup.c	8.6 (Berkeley) 4/1/94
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/endian.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
 #include <sys/dirent.h>
 #include <sys/sysctl.h>
 
 #include <ufs/ufs/dir.h>
 
 #include <fs/ext2fs/fs.h>
 #include <fs/ext2fs/inode.h>
 #include <fs/ext2fs/ext2_mount.h>
 #include <fs/ext2fs/ext2fs.h>
 #include <fs/ext2fs/ext2_dinode.h>
 #include <fs/ext2fs/ext2_dir.h>
 #include <fs/ext2fs/ext2_extern.h>
 #include <fs/ext2fs/fs.h>
 
 /*
  * Directory-checking knob, exported read/write as the sysctl
  * vfs.e2fs.dircheck.  It defaults to on for INVARIANTS kernels and off
  * otherwise.
  */
 #ifdef INVARIANTS
 static int dirchk = 1;
 #else
 static int dirchk = 0;
 #endif
 
 static SYSCTL_NODE(_vfs, OID_AUTO, e2fs, CTLFLAG_RD, 0, "EXT2FS filesystem");
 SYSCTL_INT(_vfs_e2fs, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, "");
 
 /*
    DIRBLKSIZE in ffs is DEV_BSIZE (in most cases 512)
    while it is the native blocksize in ext2fs - thus, a #define
    is no longer appropriate
 */
 #undef  DIRBLKSIZ
 
 /*
  * Map an on-disk ext2 file type (EXT2_FT_*, used as the index) to the
  * corresponding dirent d_type value (DT_*).
  */
 static u_char ext2_ft_to_dt[] = {
 	DT_UNKNOWN,		/* EXT2_FT_UNKNOWN */
 	DT_REG,			/* EXT2_FT_REG_FILE */
 	DT_DIR,			/* EXT2_FT_DIR */
 	DT_CHR,			/* EXT2_FT_CHRDEV */
 	DT_BLK,			/* EXT2_FT_BLKDEV */
 	DT_FIFO,		/* EXT2_FT_FIFO */
 	DT_SOCK,		/* EXT2_FT_SOCK */
 	DT_LNK,			/* EXT2_FT_SYMLINK */
 };
 /* Bounds-checked lookup: out-of-range types become DT_UNKNOWN. */
 #define	FTTODT(ft) \
     ((ft) < nitems(ext2_ft_to_dt) ? ext2_ft_to_dt[(ft)] : DT_UNKNOWN)
 
 /*
  * Map a dirent d_type value (DT_*, used as the index) to the on-disk ext2
  * file type (EXT2_FT_*).  DT_* values are sparse, so the unused slots and
  * DT_WHT map to EXT2_FT_UNKNOWN.
  */
 static u_char dt_to_ext2_ft[] = {
 	EXT2_FT_UNKNOWN,	/* DT_UNKNOWN */
 	EXT2_FT_FIFO,		/* DT_FIFO */
 	EXT2_FT_CHRDEV,		/* DT_CHR */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_DIR,		/* DT_DIR */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_BLKDEV,		/* DT_BLK */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_REG_FILE,	/* DT_REG */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_SYMLINK,	/* DT_LNK */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_SOCK,		/* DT_SOCK */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_UNKNOWN,	/* DT_WHT */
 };
 /* Bounds-checked lookup: out-of-range types become EXT2_FT_UNKNOWN. */
 #define	DTTOFT(dt) \
     ((dt) < nitems(dt_to_ext2_ft) ? dt_to_ext2_ft[(dt)] : EXT2_FT_UNKNOWN)
 
 static int	ext2_dirbadentry(struct vnode *dp, struct ext2fs_direct_2 *de,
 		    int entryoffsetinblock);
 static int	ext2_is_dot_entry(struct componentname *cnp);
 static int	ext2_lookup_ino(struct vnode *vdp, struct vnode **vpp,
 		    struct componentname *cnp, ino_t *dd_ino);
 
 /*
  * Return 1 if the component described by cnp is "." or "..", else 0.
  *
  * The decision is made from cn_namelen and the bytes inside the component
  * only.  The byte following the component is deliberately not examined:
  * it is '/' rather than '\0' whenever more path components follow (e.g.
  * the "." in "./foo"), and a zero-length component has no bytes to read.
  */
 static int
 ext2_is_dot_entry(struct componentname *cnp)
 {
 	const char *name = cnp->cn_nameptr;
 
 	switch (cnp->cn_namelen) {
 	case 1:
 		return (name[0] == '.');
 	case 2:
 		return (name[0] == '.' && name[1] == '.');
 	default:
 		return (0);
 	}
 }
 
 /*
  * Vnode op for reading directories.
  *
  * Converts on-disk ext2 directory entries, starting at uio->uio_offset,
  * into struct dirent records and copies them to the caller's uio.  Entries
  * with inode number 0 and entries that lie before the requested offset
  * within the first directory block are skipped.  On return uio->uio_offset
  * is the directory offset of the first entry not consumed.
  *
  * If ap->a_ncookies is non-NULL, an array of seek cookies (the offset of
  * the entry following each returned record) is allocated from M_TEMP and
  * returned through *ap->a_cookies, with the number of cookies actually
  * filled in stored in *ap->a_ncookies.  On error the array is freed and
  * both outputs are reset.
  *
  * Returns 0 on success, EINVAL for a negative offset or when the buffer
  * cannot hold even the first record, EIO for a malformed on-disk entry,
  * or the error from ext2_blkatoff()/uiomove().
  */
 int
 ext2_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct buf *bp;
 	struct inode *ip;
 	struct ext2fs_direct_2 *dp, *edp;
 	u_long *cookies;
 	struct dirent dstdp;
 	off_t offset, startoffset;
 	size_t readcnt, skipcnt;
 	ssize_t startresid;
 	u_int ncookies;
 	int DIRBLKSIZ = VTOI(ap->a_vp)->i_e2fs->e2fs_bsize;
 	int error;
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	ip = VTOI(vp);
 	if (ap->a_ncookies != NULL) {
 		/*
 		 * Size the cookie array for the worst case: the number of
 		 * bytes that can be read, divided by the smallest possible
 		 * on-disk entry, plus one.
 		 */
 		if (uio->uio_resid < 0)
 			ncookies = 0;
 		else
 			ncookies = uio->uio_resid;
 		if (uio->uio_offset >= ip->i_size)
 			ncookies = 0;
 		else if (ip->i_size - uio->uio_offset < ncookies)
 			ncookies = ip->i_size - uio->uio_offset;
 		ncookies = ncookies / (offsetof(struct ext2fs_direct_2,
 		    e2d_namlen) + 4) + 1;
 		cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK);
 		*ap->a_ncookies = ncookies;
 		*ap->a_cookies = cookies;
 	} else {
 		ncookies = 0;
 		cookies = NULL;
 	}
 	offset = startoffset = uio->uio_offset;
 	startresid = uio->uio_resid;
 	error = 0;
 	while (error == 0 && uio->uio_resid > 0 &&
 	    uio->uio_offset < ip->i_size) {
 		error = ext2_blkatoff(vp, uio->uio_offset, NULL, &bp);
 		if (error)
 			break;
 		/* Do not walk past the end of the directory. */
 		if (bp->b_offset + bp->b_bcount > ip->i_size)
 			readcnt = ip->i_size - bp->b_offset;
 		else
 			readcnt = bp->b_bcount;
 		/*
 		 * Start parsing at the DIRBLKSIZ boundary at or below the
 		 * requested offset; entries before startoffset are skipped
 		 * in the inner loop.
 		 */
 		skipcnt = (size_t)(uio->uio_offset - bp->b_offset) &
 		    ~(size_t)(DIRBLKSIZ - 1);
 		offset = bp->b_offset + skipcnt;
 		dp = (struct ext2fs_direct_2 *)&bp->b_data[skipcnt];
 		edp = (struct ext2fs_direct_2 *)&bp->b_data[readcnt];
 		while (error == 0 && uio->uio_resid > 0 && dp < edp) {
 			/* Reject records that are too short or overrun. */
 			if (dp->e2d_reclen <= offsetof(struct ext2fs_direct_2,
 			    e2d_namlen) || (caddr_t)dp + dp->e2d_reclen >
 			    (caddr_t)edp) {
 				error = EIO;
 				break;
 			}
 			/*-
 			 * "New" ext2fs directory entries differ in 3 ways
 			 * from ufs on-disk ones:
 			 * - the name is not necessarily NUL-terminated.
 			 * - the file type field always exists and always
 			 *   follows the name length field.
 			 * - the file type is encoded in a different way.
 			 *
 			 * "Old" ext2fs directory entries need no special
 			 * conversions, since they are binary compatible
 			 * with "new" entries having a file type of 0 (i.e.,
 			 * EXT2_FT_UNKNOWN).  Splitting the old name length
 			 * field didn't make a mess like it did in ufs,
 			 * because ext2fs uses a machine-independent disk
 			 * layout.
 			 */
 			dstdp.d_namlen = dp->e2d_namlen;
 			dstdp.d_type = FTTODT(dp->e2d_type);
 			if (offsetof(struct ext2fs_direct_2, e2d_namlen) +
 			    dstdp.d_namlen > dp->e2d_reclen) {
 				error = EIO;
 				break;
 			}
 			if (offset < startoffset || dp->e2d_ino == 0)
 				goto nextentry;
 			/*
 			 * NOTE(review): dstdp lives on the stack and only
 			 * the fields assigned here are initialized before
 			 * d_reclen bytes of it are copied out; consider
 			 * zeroing the remainder of the record.
 			 */
 			dstdp.d_fileno = dp->e2d_ino;
 			dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
 			bcopy(dp->e2d_name, dstdp.d_name, dstdp.d_namlen);
 			dstdp.d_name[dstdp.d_namlen] = '\0';
+			/* NOTE: d_off is the offset of the *next* entry. */
+			dstdp.d_off = offset + dp->e2d_reclen;
 			if (dstdp.d_reclen > uio->uio_resid) {
 				/*
 				 * Out of room.  That is only an error if
 				 * nothing at all has been returned yet.
 				 */
 				if (uio->uio_resid == startresid)
 					error = EINVAL;
 				else
 					error = EJUSTRETURN;
 				break;
 			}
 			/* Advance dp. */
 			error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio);
 			if (error)
 				break;
 			if (cookies != NULL) {
 				KASSERT(ncookies > 0,
 				    ("ext2_readdir: cookies buffer too small"));
 				*cookies = offset + dp->e2d_reclen;
 				cookies++;
 				ncookies--;
 			}
 nextentry:
 			offset += dp->e2d_reclen;
 			dp = (struct ext2fs_direct_2 *)((caddr_t)dp +
 			    dp->e2d_reclen);
 		}
 		bqrelse(bp);
 		uio->uio_offset = offset;
 	}
 	/* We need to correct uio_offset. */
 	uio->uio_offset = offset;
 	if (error == EJUSTRETURN)
 		error = 0;
 	if (ap->a_ncookies != NULL) {
 		if (error == 0) {
 			/*
 			 * ncookies now holds the number of unused slots.
 			 * Subtract it from the count the caller sees; the
 			 * count lives behind the pointer, so the pointer
 			 * must be dereferenced rather than adjusted.
 			 */
 			*ap->a_ncookies -= ncookies;
 		} else {
 			free(*ap->a_cookies, M_TEMP);
 			*ap->a_ncookies = 0;
 			*ap->a_cookies = NULL;
 		}
 	}
 	if (error == 0 && ap->a_eofflag)
 		*ap->a_eofflag = ip->i_size <= uio->uio_offset;
 	return (error);
 }
 
 /*
  * Convert a component of a pathname into a pointer to a locked inode.
  * This is a very central and rather complicated routine.
  * If the file system is not maintained in a strict tree hierarchy,
  * this can result in a deadlock situation (see comments in code below).
  *
  * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
  * on whether the name is to be looked up, created, renamed, or deleted.
  * When CREATE, RENAME, or DELETE is specified, information usable in
  * creating, renaming, or deleting a directory entry may be calculated.
  * If flag has LOCKPARENT or'ed into it and the target of the pathname
  * exists, lookup returns both the target and its parent directory locked.
  * When creating or renaming and LOCKPARENT is specified, the target may
  * not be ".".  When deleting and LOCKPARENT is specified, the target may
  * be ".", but the caller must check to ensure it does a vrele and vput
  * instead of two vputs.
  *
  * Overall outline of ext2_lookup:
  *
  *	search for name in directory, to found or notfound
  * notfound:
  *	if creating, return locked directory, leaving info on available slots
  *	else return error
  * found:
  *	if at end of path and deleting, return information to allow delete
  *	if at end of path and rewriting (RENAME and LOCKPARENT), lock target
  *	  inode and return info to allow rewrite
  *	if not at end, add name to cache; if at end and neither creating
  *	  nor deleting, add name to cache
  */
 int
 ext2_lookup(struct vop_cachedlookup_args *ap)
 {
 
 	return (ext2_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
 }
 
 static int
 ext2_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
     ino_t *dd_ino)
 {
 	struct inode *dp;		/* inode for directory being searched */
 	struct buf *bp;			/* a buffer of directory entries */
 	struct ext2fs_direct_2 *ep;	/* the current directory entry */
 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
 	struct ext2fs_searchslot ss;
 	doff_t i_diroff;		/* cached i_diroff value */
 	doff_t i_offset;		/* cached i_offset value */
 	int numdirpasses;		/* strategy for directory search */
 	doff_t endsearch;		/* offset to end directory search */
 	doff_t prevoff;			/* prev entry dp->i_offset */
 	struct vnode *pdp;		/* saved dp during symlink work */
 	struct vnode *tdp;		/* returned by VFS_VGET */
 	doff_t enduseful;		/* pointer past last used dir slot */
 	u_long bmask;			/* block offset mask */
 	int error;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	ino_t ino, ino1;
 	int ltype;
 	int entry_found = 0;
 
 	int DIRBLKSIZ = VTOI(vdp)->i_e2fs->e2fs_bsize;
 
 	if (vpp != NULL)
 		*vpp = NULL;
 
 	dp = VTOI(vdp);
 	bmask = VFSTOEXT2(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
 restart:
 	bp = NULL;
 	ss.slotoffset = -1;
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 *
 	 * Suppress search for slots unless creating
 	 * file and at end of pathname, in which case
 	 * we watch for a place to put the new file in
 	 * case it doesn't already exist.
 	 */
 	i_diroff = dp->i_diroff;
 	ss.slotstatus = FOUND;
 	ss.slotfreespace = ss.slotsize = ss.slotneeded = 0;
 	if ((nameiop == CREATE || nameiop == RENAME) &&
 	    (flags & ISLASTCN)) {
 		ss.slotstatus = NONE;
 		ss.slotneeded = EXT2_DIR_REC_LEN(cnp->cn_namelen);
 		/*
 		 * was ss.slotneeded = (sizeof(struct direct) - MAXNAMLEN +
 		 * cnp->cn_namelen + 3) &~ 3;
 		 */
 	}
 	/*
 	 * Try to lookup dir entry using htree directory index.
 	 *
 	 * If we got an error or we want to find '.' or '..' entry,
 	 * we will fall back to linear search.
 	 */
 	if (!ext2_is_dot_entry(cnp) && ext2_htree_has_idx(dp)) {
 		numdirpasses = 1;
 		entryoffsetinblock = 0;
 		switch (ext2_htree_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
 		    &bp, &entryoffsetinblock, &i_offset, &prevoff,
 		    &enduseful, &ss)) {
 		case 0:
 			ep = (struct ext2fs_direct_2 *)((char *)bp->b_data +
 			    (i_offset & bmask));
 			goto foundentry;
 		case ENOENT:
 			i_offset = roundup2(dp->i_size, DIRBLKSIZ);
 			goto notfound;
 		default:
 			/*
 			 * Something failed; just fallback to do a linear
 			 * search.
 			 */
 			break;
 		}
 	}
 
 	/*
 	 * If there is cached information on a previous search of
 	 * this directory, pick up where we last left off.
 	 * We cache only lookups as these are the most common
 	 * and have the greatest payoff. Caching CREATE has little
 	 * benefit as it usually must search the entire directory
 	 * to determine that the entry does not exist. Caching the
 	 * location of the last DELETE or RENAME has not reduced
 	 * profiling time and hence has been removed in the interest
 	 * of simplicity.
 	 */
 	if (nameiop != LOOKUP || i_diroff == 0 ||
 	    i_diroff > dp->i_size) {
 		entryoffsetinblock = 0;
 		i_offset = 0;
 		numdirpasses = 1;
 	} else {
 		i_offset = i_diroff;
 		if ((entryoffsetinblock = i_offset & bmask) &&
 		    (error = ext2_blkatoff(vdp, (off_t)i_offset, NULL,
 		    &bp)))
 			return (error);
 		numdirpasses = 2;
 		nchstats.ncs_2passes++;
 	}
 	prevoff = i_offset;
 	endsearch = roundup2(dp->i_size, DIRBLKSIZ);
 	enduseful = 0;
 
 searchloop:
 	while (i_offset < endsearch) {
 		/*
 		 * If necessary, get the next directory block.
 		 */
 		if (bp != NULL)
 			brelse(bp);
 		error = ext2_blkatoff(vdp, (off_t)i_offset, NULL, &bp);
 		if (error != 0)
 			return (error);
 
 		entryoffsetinblock = 0;
 		if (ss.slotstatus == NONE) {
 			ss.slotoffset = -1;
 			ss.slotfreespace = 0;
 		}
 
 		error = ext2_search_dirblock(dp, bp->b_data, &entry_found,
 		    cnp->cn_nameptr, cnp->cn_namelen,
 		    &entryoffsetinblock, &i_offset, &prevoff,
 		    &enduseful, &ss);
 		if (error != 0) {
 			brelse(bp);
 			return (error);
 		}
 		if (entry_found) {
 			ep = (struct ext2fs_direct_2 *)((char *)bp->b_data +
 			    (entryoffsetinblock & bmask));
 foundentry:
 			ino = ep->e2d_ino;
 			goto found;
 		}
 	}
 notfound:
 	/*
 	 * If we started in the middle of the directory and failed
 	 * to find our target, we must check the beginning as well.
 	 */
 	if (numdirpasses == 2) {
 		numdirpasses--;
 		i_offset = 0;
 		endsearch = i_diroff;
 		goto searchloop;
 	}
 	if (bp != NULL)
 		brelse(bp);
 	/*
 	 * If creating, and at end of pathname and current
 	 * directory has not been removed, then can consider
 	 * allowing file to be created.
 	 */
 	if ((nameiop == CREATE || nameiop == RENAME) &&
 	    (flags & ISLASTCN) && dp->i_nlink != 0) {
 		/*
 		 * Access for write is interpreted as allowing
 		 * creation of files in the directory.
 		 */
 		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0)
 			return (error);
 		/*
 		 * Return an indication of where the new directory
 		 * entry should be put.  If we didn't find a slot,
 		 * then set dp->i_count to 0 indicating
 		 * that the new slot belongs at the end of the
 		 * directory. If we found a slot, then the new entry
 		 * can be put in the range from dp->i_offset to
 		 * dp->i_offset + dp->i_count.
 		 */
 		if (ss.slotstatus == NONE) {
 			dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ);
 			dp->i_count = 0;
 			enduseful = dp->i_offset;
 		} else {
 			dp->i_offset = ss.slotoffset;
 			dp->i_count = ss.slotsize;
 			if (enduseful < ss.slotoffset + ss.slotsize)
 				enduseful = ss.slotoffset + ss.slotsize;
 		}
 		dp->i_endoff = roundup2(enduseful, DIRBLKSIZ);
 		/*
 		 * We return with the directory locked, so that
 		 * the parameters we set up above will still be
 		 * valid if we actually decide to do a direnter().
 		 * We return ni_vp == NULL to indicate that the entry
 		 * does not currently exist; we leave a pointer to
 		 * the (locked) directory inode in ndp->ni_dvp.
 		 * The pathname buffer is saved so that the name
 		 * can be obtained later.
 		 *
 		 * NB - if the directory is unlocked, then this
 		 * information cannot be used.
 		 */
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(vdp, NULL, cnp);
 	return (ENOENT);
 
 found:
 	if (dd_ino != NULL)
 		*dd_ino = ino;
 	if (numdirpasses == 2)
 		nchstats.ncs_pass2++;
 	/*
 	 * Check that directory length properly reflects presence
 	 * of this entry.
 	 */
 	if (entryoffsetinblock + EXT2_DIR_REC_LEN(ep->e2d_namlen)
 	    > dp->i_size) {
 		ext2_dirbad(dp, i_offset, "i_size too small");
 		dp->i_size = entryoffsetinblock + EXT2_DIR_REC_LEN(ep->e2d_namlen);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	}
 	brelse(bp);
 
 	/*
 	 * Found component in pathname.
 	 * If the final component of path name, save information
 	 * in the cache as to where the entry was found.
 	 */
 	if ((flags & ISLASTCN) && nameiop == LOOKUP)
 		dp->i_diroff = rounddown2(i_offset, DIRBLKSIZ);
 	/*
 	 * If deleting, and at end of pathname, return
 	 * parameters which can be used to remove file.
 	 */
 	if (nameiop == DELETE && (flags & ISLASTCN)) {
 		if (flags & LOCKPARENT)
 			ASSERT_VOP_ELOCKED(vdp, __FUNCTION__);
 		/*
 		 * Write access to directory required to delete files.
 		 */
 		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0)
 			return (error);
 		/*
 		 * Return pointer to current entry in dp->i_offset,
 		 * and distance past previous entry (if there
 		 * is a previous entry in this block) in dp->i_count.
 		 * Save directory inode pointer in ndp->ni_dvp for dirremove().
 		 *
 		 * Technically we shouldn't be setting these in the
 		 * WANTPARENT case (first lookup in rename()), but any
 		 * lookups that will result in directory changes will
 		 * overwrite these.
 		 */
 		dp->i_offset = i_offset;
 		if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0)
 			dp->i_count = 0;
 		else
 			dp->i_count = dp->i_offset - prevoff;
 		if (dd_ino != NULL)
 			return (0);
 		if (dp->i_number == ino) {
 			VREF(vdp);
 			*vpp = vdp;
 			return (0);
 		}
 		if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE,
 		    &tdp)) != 0)
 			return (error);
 		/*
 		 * If directory is "sticky", then user must own
 		 * the directory, or the file in it, else she
 		 * may not delete it (unless she's root). This
 		 * implements append-only directories.
 		 */
 		if ((dp->i_mode & ISVTX) &&
 		    cred->cr_uid != 0 &&
 		    cred->cr_uid != dp->i_uid &&
 		    VTOI(tdp)->i_uid != cred->cr_uid) {
 			vput(tdp);
 			return (EPERM);
 		}
 		*vpp = tdp;
 		return (0);
 	}
 
 	/*
 	 * If rewriting (RENAME), return the inode and the
 	 * information required to rewrite the present directory
 	 * Must get inode of directory entry to verify it's a
 	 * regular file, or empty directory.
 	 */
 	if (nameiop == RENAME && (flags & ISLASTCN)) {
 		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0)
 			return (error);
 		/*
 		 * Careful about locking second inode.
 		 * This can only occur if the target is ".".
 		 */
 		dp->i_offset = i_offset;
 		if (dp->i_number == ino)
 			return (EISDIR);
 		if (dd_ino != NULL)
 			return (0);
 		if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE,
 		    &tdp)) != 0)
 			return (error);
 		*vpp = tdp;
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 	if (dd_ino != NULL)
 		return (0);
 
 	/*
 	 * Step through the translation in the name.  We do not `vput' the
 	 * directory because we may need it again if a symbolic link
 	 * is relative to the current directory.  Instead we save it
 	 * unlocked as "pdp".  We must get the target inode before unlocking
 	 * the directory to insure that the inode will not be removed
 	 * before we get it.  We prevent deadlock by always fetching
 	 * inodes from the root, moving down the directory tree. Thus
 	 * when following backward pointers ".." we must unlock the
 	 * parent directory before getting the requested directory.
 	 * There is a potential race condition here if both the current
 	 * and parent directories are removed before the VFS_VGET for the
 	 * inode associated with ".." returns.  We hope that this occurs
 	 * infrequently since we cannot avoid this race condition without
 	 * implementing a sophisticated deadlock detection algorithm.
 	 * Note also that this simple deadlock detection scheme will not
 	 * work if the file system has any hard links other than ".."
 	 * that point backwards in the directory structure.
 	 */
 	pdp = vdp;
 	if (flags & ISDOTDOT) {
 		error = vn_vget_ino(pdp, ino, cnp->cn_lkflags, &tdp);
 		if (pdp->v_iflag & VI_DOOMED) {
 			if (error == 0)
 				vput(tdp);
 			error = ENOENT;
 		}
 		if (error)
 			return (error);
 		/*
 		 * Recheck that ".." entry in the vdp directory points
 		 * to the inode we looked up before vdp lock was
 		 * dropped.
 		 */
 		error = ext2_lookup_ino(pdp, NULL, cnp, &ino1);
 		if (error) {
 			vput(tdp);
 			return (error);
 		}
 		if (ino1 != ino) {
 			vput(tdp);
 			goto restart;
 		}
 		*vpp = tdp;
 	} else if (dp->i_number == ino) {
 		VREF(vdp);	/* we want ourself, ie "." */
 		/*
 		 * When we lookup "." we still can be asked to lock it
 		 * differently.
 		 */
 		ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 		if (ltype != VOP_ISLOCKED(vdp)) {
 			if (ltype == LK_EXCLUSIVE)
 				vn_lock(vdp, LK_UPGRADE | LK_RETRY);
 			else	/* if (ltype == LK_SHARED) */
 				vn_lock(vdp, LK_DOWNGRADE | LK_RETRY);
 		}
 		*vpp = vdp;
 	} else {
 		if ((error = VFS_VGET(vdp->v_mount, ino, cnp->cn_lkflags,
 		    &tdp)) != 0)
 			return (error);
 		*vpp = tdp;
 	}
 
 	/*
 	 * Insert name into cache if appropriate.
 	 */
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(vdp, *vpp, cnp);
 	return (0);
 }
 
 /*
  * Scan one directory block for an in-use entry called `name'.
  *
  *	ip			inode of the directory being searched
  *	data			start of the block; e2fs_bsize bytes are scanned
  *	foundp			set to 1 when a match is found, otherwise untouched
  *	name, namelen		name to match (compared with bcmp)
  *	entryoffsetinblockp	in: offset within the block to start scanning at;
  *				out: offset of the matching entry when one is found
  *	offp			running directory offset of the current entry
  *	prevoffp		directory offset of the entry preceding *offp
  *	endusefulp		directory offset just past the last in-use entry
  *	ssp			free-slot bookkeeping, updated during the scan
  *
  * Both exit paths return 0; callers tell "found" from "not found" via *foundp.
  */
 int
 ext2_search_dirblock(struct inode *ip, void *data, int *foundp,
     const char *name, int namelen, int *entryoffsetinblockp,
     doff_t *offp, doff_t *prevoffp, doff_t *endusefulp,
     struct ext2fs_searchslot *ssp)
 {
 	struct vnode *vdp;
 	struct ext2fs_direct_2 *ep, *top;
 	uint32_t bsize = ip->i_e2fs->e2fs_bsize;
 	int offset = *entryoffsetinblockp;
 	int namlen;
 
 	vdp = ITOV(ip);
 
 	ep = (struct ext2fs_direct_2 *)((char *)data + offset);
 	top = (struct ext2fs_direct_2 *)((char *)data + bsize);
 	while (ep < top) {
 		/*
 		 * Full validation checks are slow, so we only check
 		 * enough to ensure forward progress through the
 		 * directory. Complete checks can be run by setting
 		 * "vfs.e2fs.dirchk" to be true.
 		 */
 		if (ep->e2d_reclen == 0 ||
 		    (dirchk && ext2_dirbadentry(vdp, ep, offset))) {
 			int i;
 
 			ext2_dirbad(ip, *offp, "mangled entry");
 			/* Skip whatever is left of this block. */
 			i = bsize - (offset & (bsize - 1));
 			*offp += i;
 			offset += i;
 			/*
 			 * Re-derive the entry pointer from the new offset.
 			 * Without this the loop condition never changes and
 			 * we would spin forever whenever ext2_dirbad()
 			 * returns (i.e. on a read-only mount).
 			 */
 			ep = (struct ext2fs_direct_2 *)((char *)data + offset);
 			continue;
 		}
 
 		/*
 		 * If an appropriate sized slot has not yet been found,
 		 * check to see if one is available. Also accumulate space
 		 * in the current block so that we can determine if
 		 * compaction is viable.
 		 */
 		if (ssp->slotstatus != FOUND) {
 			int size = ep->e2d_reclen;
 
 			/* Free space is the record length minus what is in use. */
 			if (ep->e2d_ino != 0)
 				size -= EXT2_DIR_REC_LEN(ep->e2d_namlen);
 			else if (ext2_is_dirent_tail(ip, ep))
 				size -= sizeof(struct ext2fs_direct_tail);
 			if (size > 0) {
 				if (size >= ssp->slotneeded) {
 					/* This entry alone has enough room. */
 					ssp->slotstatus = FOUND;
 					ssp->slotoffset = *offp;
 					ssp->slotsize = ep->e2d_reclen;
 				} else if (ssp->slotstatus == NONE) {
 					/* Accumulate fragments for compaction. */
 					ssp->slotfreespace += size;
 					if (ssp->slotoffset == -1)
 						ssp->slotoffset = *offp;
 					if (ssp->slotfreespace >= ssp->slotneeded) {
 						ssp->slotstatus = COMPACT;
 						ssp->slotsize = *offp +
 						    ep->e2d_reclen -
 						    ssp->slotoffset;
 					}
 				}
 			}
 		}
 		/*
 		 * Check for a name match.
 		 */
 		if (ep->e2d_ino) {
 			namlen = ep->e2d_namlen;
 			if (namlen == namelen &&
 			    !bcmp(name, ep->e2d_name, (unsigned)namlen)) {
 				/*
 				 * Match: tell the caller and leave the offset
 				 * outputs pointing at this entry.
 				 */
 				*foundp = 1;
 				return (0);
 			}
 		}
 		/* Advance all running offsets past this entry. */
 		*prevoffp = *offp;
 		*offp += ep->e2d_reclen;
 		offset += ep->e2d_reclen;
 		*entryoffsetinblockp = offset;
 		if (ep->e2d_ino)
 			*endusefulp = *offp;
 		/*
 		 * Get pointer to the next entry.
 		 */
 		ep = (struct ext2fs_direct_2 *)((char *)data + offset);
 	}
 
 	return (0);
 }
 
 /*
  * Report a corrupted directory.  On a read-only mount the problem is only
  * logged; on a writable mount the kernel panics.
  */
 void
 ext2_dirbad(struct inode *ip, doff_t offset, char *how)
 {
 	struct mount *mnt = ITOV(ip)->v_mount;
 
 	if ((mnt->mnt_flag & MNT_RDONLY) != 0) {
 		/* Nothing can be written back, so just complain. */
 		(void)printf("%s: bad dir ino %ju at offset %ld: %s\n",
 		    mnt->mnt_stat.f_mntonname, (uintmax_t)ip->i_number,
 		    (long)offset, how);
 		return;
 	}
 
 	panic("ext2_dirbad: %s: bad dir ino %ju at offset %ld: %s\n",
 	    mnt->mnt_stat.f_mntonname, (uintmax_t)ip->i_number,
 	    (long)offset, how);
 }
 
 /*
  * Do consistency checking on a directory entry.  The checks, in order:
  *	record length is at least that of a one-character name
  *	record length is a multiple of 4
  *	record length is large enough to hold the advertised name
  *	entry does not run past the end of its directory block
  *
  * The first failing check is logged.  Returns 1 for a bad entry and 0 for
  * an acceptable one.
  */
 static int
 ext2_dirbadentry(struct vnode *dp, struct ext2fs_direct_2 *de,
     int entryoffsetinblock)
 {
 	const char *reason;
 	int blksize;
 
 	blksize = VTOI(dp)->i_e2fs->e2fs_bsize;
 
 	if (de->e2d_reclen < EXT2_DIR_REC_LEN(1))
 		reason = "rec_len is smaller than minimal";
 	else if ((de->e2d_reclen % 4) != 0)
 		reason = "rec_len % 4 != 0";
 	else if (de->e2d_reclen < EXT2_DIR_REC_LEN(de->e2d_namlen))
 		reason = "reclen is too small for name_len";
 	else if (entryoffsetinblock + de->e2d_reclen > blksize)
 		reason = "directory entry across blocks";
 	else
 		return (0);	/* every check passed */
 	/* LATER: also reject inode numbers beyond the filesystem's count. */
 
 	printf("bad directory entry: %s\n", reason);
 	printf("offset=%d, inode=%lu, rec_len=%u, name_len=%u\n",
 	    entryoffsetinblock, (unsigned long)de->e2d_ino,
 	    de->e2d_reclen, de->e2d_namlen);
 	return (1);
 }
 
 /*
  * Insert an entry into the fresh directory block.
  * Initialize entry tail if the metadata_csum feature is turned on.
  */
 static int
 ext2_add_first_entry(struct vnode *dvp, struct ext2fs_direct_2 *entry,
     struct componentname *cnp)
 {
 	struct inode *dp;
 	struct iovec aiov;
 	struct uio auio;
 	char* buf = NULL;
 	int dirblksize, error;
 
 	dp = VTOI(dvp);
 	dirblksize = dp->i_e2fs->e2fs_bsize;
 
 	if (dp->i_offset & (dirblksize - 1))
 		panic("ext2_add_first_entry: bad directory offset");
 
 	if (EXT2_HAS_RO_COMPAT_FEATURE(dp->i_e2fs,
 	    EXT2F_ROCOMPAT_METADATA_CKSUM)) {
 		entry->e2d_reclen = dirblksize - sizeof(struct ext2fs_direct_tail);
 		buf = malloc(dirblksize, M_TEMP, M_WAITOK);
 		if (!buf) {
 			error = ENOMEM;
 			goto out;
 		}
 		memcpy(buf, entry, EXT2_DIR_REC_LEN(entry->e2d_namlen));
 		ext2_init_dirent_tail(EXT2_DIRENT_TAIL(buf, dirblksize));
 		ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)buf);
 
 		auio.uio_offset = dp->i_offset;
 		auio.uio_resid = dirblksize;
 		aiov.iov_len = auio.uio_resid;
 		aiov.iov_base = (caddr_t)buf;
 	} else {
 		entry->e2d_reclen = dirblksize;
 		auio.uio_offset = dp->i_offset;
 		auio.uio_resid = EXT2_DIR_REC_LEN(entry->e2d_namlen);
 		aiov.iov_len = auio.uio_resid;
 		aiov.iov_base = (caddr_t)entry;
 	}
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = (struct thread *)0;
 	error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred);
 	if (error)
 		goto out;
 
 	dp->i_size = roundup2(dp->i_size, dirblksize);
 	dp->i_flag |= IN_CHANGE;
 
 out:
 	free(buf, M_TEMP);
 	return (error);
 
 }
 
 /*
  * Write a directory entry after a call to namei, using the parameters
  * that it left in nameidata.  The argument ip is the inode which the new
  * directory entry will refer to.  Dvp is a pointer to the directory to
  * be written, which was left locked by namei. Remaining parameters
  * (dp->i_offset, dp->i_count) indicate how the space for the new
  * entry is to be obtained.
  */
 int
 ext2_direnter(struct inode *ip, struct vnode *dvp, struct componentname *cnp)
 {
 	struct inode *dp;
 	struct ext2fs_direct_2 newdir;
 	int DIRBLKSIZ = ip->i_e2fs->e2fs_bsize;
 	int error;
 
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & SAVENAME) == 0)
 		panic("ext2_direnter: missing name");
 #endif
 	dp = VTOI(dvp);
 	newdir.e2d_ino = ip->i_number;
 	newdir.e2d_namlen = cnp->cn_namelen;
 	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs,
 	    EXT2F_INCOMPAT_FTYPE))
 		newdir.e2d_type = DTTOFT(IFTODT(ip->i_mode));
 	else
 		newdir.e2d_type = EXT2_FT_UNKNOWN;
 	bcopy(cnp->cn_nameptr, newdir.e2d_name, (unsigned)cnp->cn_namelen + 1);
 
 	if (ext2_htree_has_idx(dp)) {
 		error = ext2_htree_add_entry(dvp, &newdir, cnp);
 		if (error) {
 			dp->i_flag &= ~IN_E3INDEX;
 			dp->i_flag |= IN_CHANGE | IN_UPDATE;
 		}
 		return (error);
 	}
 
 	if (EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_DIRHASHINDEX) &&
 	    !ext2_htree_has_idx(dp)) {
 		if ((dp->i_size / DIRBLKSIZ) == 1 &&
 		    dp->i_offset == DIRBLKSIZ) {
 			/*
 			 * Making indexed directory when one block is not
 			 * enough to save all entries.
 			 */
 			return ext2_htree_create_index(dvp, cnp, &newdir);
 		}
 	}
 
 	/*
 	 * If dp->i_count is 0, then namei could find no
 	 * space in the directory. Here, dp->i_offset will
 	 * be on a directory block boundary and we will write the
 	 * new entry into a fresh block.
 	 */
 	if (dp->i_count == 0)
 		return ext2_add_first_entry(dvp, &newdir, cnp);
 
 	error = ext2_add_entry(dvp, &newdir);
 	if (!error && dp->i_endoff && dp->i_endoff < dp->i_size)
 		error = ext2_truncate(dvp, (off_t)dp->i_endoff, IO_SYNC,
 		    cnp->cn_cred, cnp->cn_thread);
 	return (error);
 }
 
 /*
  * Insert an entry into the directory block.
  * Compact the contents.
  *
  * The region to use is described by dp->i_offset and dp->i_count in the
  * directory inode.  The new entry's e2d_reclen is filled in here.
  * Returns the error from ext2_blkatoff() or bwrite(), or 0; panics if the
  * region turns out to be too small for the entry.
  */
 int
 ext2_add_entry(struct vnode *dvp, struct ext2fs_direct_2 *entry)
 {
 	struct ext2fs_direct_2 *ep, *nep;
 	struct inode *dp;
 	struct buf *bp;
 	u_int dsize;
 	int error, loc, newentrysize, spacefree;
 	char *dirbuf;
 
 	dp = VTOI(dvp);
 
 	/*
 	 * If dp->i_count is non-zero, then namei found space
 	 * for the new entry in the range dp->i_offset to
 	 * dp->i_offset + dp->i_count in the directory.
 	 * To use this space, we may have to compact the entries located
 	 * there, by copying them together towards the beginning of the
 	 * block, leaving the free space in one usable chunk at the end.
 	 */
 
 	/*
 	 * Increase size of directory if entry eats into new space.
 	 * This should never push the size past a new multiple of
 	 * DIRBLKSIZE.
 	 *
 	 * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
 	 */
 	if (dp->i_offset + dp->i_count > dp->i_size)
 		dp->i_size = dp->i_offset + dp->i_count;
 	/*
 	 * Get the block containing the space for the new directory entry.
 	 * NOTE(review): dirbuf is presumably positioned at dp->i_offset
 	 * within the buffer -- confirm against ext2_blkatoff().
 	 */
 	if ((error = ext2_blkatoff(dvp, (off_t)dp->i_offset, &dirbuf,
 	    &bp)) != 0)
 		return (error);
 	/*
 	 * Find space for the new entry. In the simple case, the entry at
 	 * offset base will have the space. If it does not, then namei
 	 * arranged that compacting the region dp->i_offset to
 	 * dp->i_offset + dp->i_count would yield the
 	 * space.
 	 */
 	newentrysize = EXT2_DIR_REC_LEN(entry->e2d_namlen);
 	ep = (struct ext2fs_direct_2 *)dirbuf;
 	/* dsize: bytes the current entry really needs; the rest is free. */
 	dsize = EXT2_DIR_REC_LEN(ep->e2d_namlen);
 	spacefree = ep->e2d_reclen - dsize;
 	/*
 	 * Walk the remaining entries of the region.  ep is the destination
 	 * (compacted) position, nep the next source entry; each source
 	 * entry is slid down right behind the previous one.
 	 */
 	for (loc = ep->e2d_reclen; loc < dp->i_count; ) {
 		nep = (struct ext2fs_direct_2 *)(dirbuf + loc);
 		if (ep->e2d_ino) {
 			/* trim the existing slot */
 			ep->e2d_reclen = dsize;
 			ep = (struct ext2fs_direct_2 *)((char *)ep + dsize);
 		} else {
 			/* overwrite; nothing there; header is ours */
 			spacefree += dsize;
 		}
 		dsize = EXT2_DIR_REC_LEN(nep->e2d_namlen);
 		spacefree += nep->e2d_reclen - dsize;
 		loc += nep->e2d_reclen;
 		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
 	}
 	/*
 	 * Update the pointer fields in the previous entry (if any),
 	 * copy in the new entry, and write out the block.
 	 */
 	if (ep->e2d_ino == 0) {
 		/* Last slot is unused: the new entry takes it over entirely. */
 		if (spacefree + dsize < newentrysize)
 			panic("ext2_direnter: compact1");
 		entry->e2d_reclen = spacefree + dsize;
 	} else {
 		/* Last slot is in use: trim it and append the new entry. */
 		if (spacefree < newentrysize)
 			panic("ext2_direnter: compact2");
 		entry->e2d_reclen = spacefree;
 		ep->e2d_reclen = dsize;
 		ep = (struct ext2fs_direct_2 *)((char *)ep + dsize);
 	}
 	bcopy((caddr_t)entry, (caddr_t)ep, (u_int)newentrysize);
 	/* Refresh the block checksum now that the contents changed. */
 	ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)bp->b_data);
 	/* Delayed write on async mounts, synchronous write otherwise. */
 	if (DOINGASYNC(dvp)) {
 		bdwrite(bp);
 		error = 0;
 	} else {
 		error = bwrite(bp);
 	}
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	return (error);
 }
 
 /*
  * Remove a directory entry after a call to namei, using
  * the parameters which it left in the directory inode.
  * dp->i_offset is the directory offset of the entry to be
  * eliminated and dp->i_count is its distance from the start of
  * the previous entry.  If dp->i_count is 0 the entry is the first
  * one in its block, so we need only zero the inode number to mark
  * the entry as free.  Otherwise the space of the removed record is
  * reclaimed by adding its record length to that of the previous
  * entry.
  *
  * Returns the error from ext2_blkatoff() or bwrite(), or 0.
  */
 int
 ext2_dirremove(struct vnode *dvp, struct componentname *cnp)
 {
 	struct inode *dp;
 	struct ext2fs_direct_2 *ep, *rep;
 	struct buf *bp;
 	int error;
 
 	dp = VTOI(dvp);
 	if (dp->i_count == 0) {
 		/*
 		 * First entry in block: set d_ino to zero.
 		 */
 		if ((error =
 		    ext2_blkatoff(dvp, (off_t)dp->i_offset, (char **)&ep,
 		    &bp)) != 0)
 			return (error);
 		ep->e2d_ino = 0;
 		ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)bp->b_data);
 		error = bwrite(bp);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
 		return (error);
 	}
 	/*
 	 * Collapse new free space into previous entry.  From here on
 	 * dp->i_count is known to be non-zero, so ep is the previous
 	 * entry and the one being removed directly follows it.
 	 */
 	if ((error = ext2_blkatoff(dvp, (off_t)(dp->i_offset - dp->i_count),
 	    (char **)&ep, &bp)) != 0)
 		return (error);
 
 	/* Set 'rep' to the entry being removed. */
 	rep = (struct ext2fs_direct_2 *)((char *)ep + ep->e2d_reclen);
 	ep->e2d_reclen += rep->e2d_reclen;
 	ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)bp->b_data);
 	/* Delayed write on async mounts, synchronous write otherwise. */
 	if (DOINGASYNC(dvp)) {
 		bdwrite(bp);
 		error = 0;
 	} else {
 		error = bwrite(bp);
 	}
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	return (error);
 }
 
 /*
  * Point an existing directory entry at a different inode.  The entry is
  * the one at dp->i_offset in directory dp; its inode number and file type
  * are replaced with those of ip, the block checksum is refreshed and the
  * block is written synchronously.
  *
  * Returns the error from ext2_blkatoff() or bwrite(), or 0.
  */
 int
 ext2_dirrewrite(struct inode *dp, struct inode *ip, struct componentname *cnp)
 {
 	struct ext2fs_direct_2 *slot;
 	struct vnode *dirvp;
 	struct buf *blk;
 	int rv;
 
 	dirvp = ITOV(dp);
 	rv = ext2_blkatoff(dirvp, (off_t)dp->i_offset, (char **)&slot, &blk);
 	if (rv != 0)
 		return (rv);
 
 	slot->e2d_ino = ip->i_number;
 	/* Only record a file type when the filesystem supports it. */
 	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs, EXT2F_INCOMPAT_FTYPE))
 		slot->e2d_type = DTTOFT(IFTODT(ip->i_mode));
 	else
 		slot->e2d_type = EXT2_FT_UNKNOWN;
 
 	ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)blk->b_data);
 	rv = bwrite(blk);
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	return (rv);
 }
 
 /*
  * Decide whether a directory holds nothing but "." and "..".
  * Returns 1 when empty, 0 when not (or when it cannot be read cleanly).
  * Inode supplied must be locked.
  *
  * Only the leading MINDIRSIZ bytes of each entry are read, into a
  * struct dirtemplate used as scratch space; that is not precisely what
  * we want, but better than using a full directory entry.
  *
  * NB: does not handle corrupted directories.
  */
 int
 ext2_dirempty(struct inode *ip, ino_t parentino, struct ucred *cred)
 {
 	struct dirtemplate dbuf;
 	struct ext2fs_direct_2 *dp = (struct ext2fs_direct_2 *)&dbuf;
 	off_t off;
 	ssize_t count;
 	int error, namlen;
 #define	MINDIRSIZ (sizeof(struct dirtemplate) / 2)
 
 	off = 0;
 	while (off < ip->i_size) {
 		error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ,
 		    off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred,
 		    NOCRED, &count, (struct thread *)0);
 		/*
 		 * A read error, or a short read (non-zero residual), means
 		 * we cannot vouch for the directory being empty.
 		 */
 		if (error != 0 || count != 0)
 			return (0);
 		/* a zero record length would loop forever */
 		if (dp->e2d_reclen == 0)
 			return (0);
 
 		/* unused entries are skipped; used ones must be "." or ".." */
 		if (dp->e2d_ino != 0) {
 			namlen = dp->e2d_namlen;
 			if (namlen > 2 || dp->e2d_name[0] != '.')
 				return (0);
 			/*
 			 * A one-character name is "."; anything else must be
 			 * ".." and refer to the expected parent.
 			 */
 			if (namlen != 1 &&
 			    (dp->e2d_name[1] != '.' ||
 			    dp->e2d_ino != parentino))
 				return (0);
 		}
 		off += dp->e2d_reclen;
 	}
 	return (1);
 }
 
 /*
  * Check if source directory is in the path of the target directory.
  * Target is supplied locked, source is unlocked.
  * The target is always vput before returning.
  */
 int
 ext2_checkpath(struct inode *source, struct inode *target, struct ucred *cred)
 {
 	struct vnode *vp;
 	int error, namlen;
 	struct dirtemplate dirbuf;
 
 	vp = ITOV(target);
 	if (target->i_number == source->i_number) {
 		error = EEXIST;
 		goto out;
 	}
 	if (target->i_number == EXT2_ROOTINO) {
 		error = 0;
 		goto out;
 	}
 
 	for (;;) {
 		if (vp->v_type != VDIR) {
 			error = ENOTDIR;
 			break;
 		}
 		error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf,
 		    sizeof(struct dirtemplate), (off_t)0, UIO_SYSSPACE,
 		    IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, NULL,
 		    NULL);
 		if (error != 0)
 			break;
 		namlen = dirbuf.dotdot_type;	/* like ufs little-endian */
 		if (namlen != 2 ||
 		    dirbuf.dotdot_name[0] != '.' ||
 		    dirbuf.dotdot_name[1] != '.') {
 			error = ENOTDIR;
 			break;
 		}
 		if (dirbuf.dotdot_ino == source->i_number) {
 			error = EINVAL;
 			break;
 		}
 		if (dirbuf.dotdot_ino == EXT2_ROOTINO)
 			break;
 		vput(vp);
 		if ((error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino,
 		    LK_EXCLUSIVE, &vp)) != 0) {
 			vp = NULL;
 			break;
 		}
 	}
 
 out:
 	if (error == ENOTDIR)
 		printf("checkpath: .. not a directory\n");
 	if (vp != NULL)
 		vput(vp);
 	return (error);
 }
Index: stable/12/sys/fs/fdescfs/fdesc_vnops.c
===================================================================
--- stable/12/sys/fs/fdescfs/fdesc_vnops.c	(revision 340720)
+++ stable/12/sys/fs/fdescfs/fdesc_vnops.c	(revision 340721)
@@ -1,657 +1,659 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fdesc_vnops.c	8.9 (Berkeley) 1/21/94
  *
  * $FreeBSD$
  */
 
 /*
  * /dev/fd Filesystem
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>	/* boottime */
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/file.h>	/* Must come after sys/malloc.h */
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <fs/fdescfs/fdesc.h>
 
 /* Element count handed to hashinit() when the node hash is created. */
 #define	NFDCACHE 4
 /* Hash bucket for node index ix; fdhash is the mask set by hashinit(). */
 #define FD_NHASH(ix) \
 	(&fdhashtbl[(ix) & fdhash])
 static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl;
 static u_long fdhash;
 
 /* Held around every walk or modification of the node hash in this file. */
 struct mtx fdesc_hashmtx;
 
 /* Vnode operations implemented by fdescfs. */
 static vop_getattr_t	fdesc_getattr;
 static vop_lookup_t	fdesc_lookup;
 static vop_open_t	fdesc_open;
 static vop_pathconf_t	fdesc_pathconf;
 static vop_readdir_t	fdesc_readdir;
 static vop_readlink_t	fdesc_readlink;
 static vop_reclaim_t	fdesc_reclaim;
 static vop_setattr_t	fdesc_setattr;
 
 /* Operations not listed here fall through to default_vnodeops. */
 static struct vop_vector fdesc_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		VOP_NULL,
 	.vop_getattr =		fdesc_getattr,
 	.vop_lookup =		fdesc_lookup,
 	.vop_open =		fdesc_open,
 	.vop_pathconf =		fdesc_pathconf,
 	.vop_readdir =		fdesc_readdir,
 	.vop_readlink =		fdesc_readlink,
 	.vop_reclaim =		fdesc_reclaim,
 	.vop_setattr =		fdesc_setattr,
 };
 
 static void fdesc_insmntque_dtr(struct vnode *, void *);
 static void fdesc_remove_entry(struct fdescnode *);
 
 /*
  * Initialise cache headers: set up the hash mutex and allocate the node
  * hash table.  vfsp is unused.  Always returns 0.
  */
 int
 fdesc_init(struct vfsconf *vfsp)
 {
 
 	mtx_init(&fdesc_hashmtx, "fdescfs_hash", NULL, MTX_DEF);
 	/* hashinit() also stores the bucket mask in fdhash. */
 	fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash);
 	return (0);
 }
 
 /*
  * Uninit ready for unload: release the node hash table and destroy the
  * hash mutex.  vfsp is unused.  Always returns 0.
  */
 int
 fdesc_uninit(struct vfsconf *vfsp)
 {
 
 	hashdestroy(fdhashtbl, M_CACHE, fdhash);
 	mtx_destroy(&fdesc_hashmtx);
 	return (0);
 }
 
 /*
  * Destructor passed to insmntque1() by fdesc_allocvp(); disposes of the
  * new vnode when it could not be added to the mount.  arg is unused.
  */
 static void
 fdesc_insmntque_dtr(struct vnode *vp, void *arg)
 {
 
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Unlink fd from its hash bucket, if it is actually on it.
  * The bucket is searched first so that a node which was never
  * inserted is left alone.
  */
 static void
 fdesc_remove_entry(struct fdescnode *fd)
 {
 	struct fdhashhead *bucket;
 	struct fdescnode *cur;
 
 	bucket = FD_NHASH(fd->fd_ix);
 
 	mtx_lock(&fdesc_hashmtx);
 	LIST_FOREACH(cur, bucket, fd_hash) {
 		if (cur != fd)
 			continue;
 		LIST_REMOVE(cur, fd_hash);
 		break;
 	}
 	mtx_unlock(&fdesc_hashmtx);
 }
 
 /*
  * Return, in *vpp, the exclusively locked vnode for node index ix on mount
  * mp, creating it when it is not already in the hash.
  *
  *	ftype	node type stored in a newly created node
  *	fd_fd	file descriptor number stored in a newly created node
  *	ix	node index, also the hash key
  *
  * Returns 0 on success, -1 when a forced unmount is in progress (or the
  * mount data is already gone), or the error from getnewvnode(),
  * insmntque1() or vget().
  */
 int
 fdesc_allocvp(fdntype ftype, unsigned fd_fd, int ix, struct mount *mp,
     struct vnode **vpp)
 {
 	struct fdescmount *fmp;
 	struct fdhashhead *fc;
 	struct fdescnode *fd, *fd2;
 	struct vnode *vp, *vp2;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	fc = FD_NHASH(ix);
 loop:
 	mtx_lock(&fdesc_hashmtx);
 	/*
 	 * If a forced unmount is progressing, we need to drop it. The flags are
 	 * protected by the hashmtx.
 	 */
 	fmp = mp->mnt_data;
 	if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) {
 		mtx_unlock(&fdesc_hashmtx);
 		return (-1);
 	}
 
 	/* First pass: reuse an existing node for this index and mount. */
 	LIST_FOREACH(fd, fc, fd_hash) {
 		if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) {
 			/* Get reference to vnode in case it's being free'd */
 			vp = fd->fd_vnode;
 			VI_LOCK(vp);
 			mtx_unlock(&fdesc_hashmtx);
 			/* vget() failed: start the search over. */
 			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td))
 				goto loop;
 			*vpp = vp;
 			return (0);
 		}
 	}
 	mtx_unlock(&fdesc_hashmtx);
 
 	/* Not found: build a new node and vnode without the hash lock held. */
 	fd = malloc(sizeof(struct fdescnode), M_TEMP, M_WAITOK);
 
 	error = getnewvnode("fdescfs", mp, &fdesc_vnodeops, &vp);
 	if (error) {
 		free(fd, M_TEMP);
 		return (error);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vp->v_data = fd;
 	fd->fd_vnode = vp;
 	fd->fd_type = ftype;
 	fd->fd_fd = fd_fd;
 	fd->fd_ix = ix;
 	if (ftype == Fdesc && fmp->flags & FMNT_LINRDLNKF)
 		vp->v_vflag |= VV_READLINK;
 	/* On failure fdesc_insmntque_dtr() has already disposed of vp. */
 	error = insmntque1(vp, mp, fdesc_insmntque_dtr, NULL);
 	if (error != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 
 	/* Make sure that someone didn't beat us when inserting the vnode. */
 	mtx_lock(&fdesc_hashmtx);
 	/*
 	 * If a forced unmount is progressing, we need to drop it. The flags are
 	 * protected by the hashmtx.
 	 */
 	fmp = mp->mnt_data;
 	if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) {
 		mtx_unlock(&fdesc_hashmtx);
 		vgone(vp);
 		vput(vp);
 		*vpp = NULLVP;
 		return (-1);
 	}
 
 	/* Second pass: the hash lock was dropped above, so look again. */
 	LIST_FOREACH(fd2, fc, fd_hash) {
 		if (fd2->fd_ix == ix && fd2->fd_vnode->v_mount == mp) {
 			/* Get reference to vnode in case it's being free'd */
 			vp2 = fd2->fd_vnode;
 			VI_LOCK(vp2);
 			mtx_unlock(&fdesc_hashmtx);
 			error = vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, td);
 			/* Someone beat us, dec use count and wait for reclaim */
 			vgone(vp);
 			vput(vp);
 			/* If we didn't get it, return no vnode. */
 			if (error)
 				vp2 = NULLVP;
 			*vpp = vp2;
 			return (error);
 		}
 	}
 
 	/* If we came here, we can insert it safely. */
 	LIST_INSERT_HEAD(fc, fd, fd_hash);
 	mtx_unlock(&fdesc_hashmtx);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Arguments that fdesc_lookup() hands to fdesc_get_ino_alloc() through
  * vn_vget_ino_gen().
  */
 struct fdesc_get_ino_args {
 	fdntype ftype;		/* node type passed to fdesc_allocvp() */
 	unsigned fd_fd;		/* file descriptor number */
 	int ix;			/* node index (hash key) */
 	struct file *fp;	/* file reference dropped by the callback */
 	struct thread *td;	/* thread used for the fdrop() */
 };
 
 /*
  * vn_vget_ino_gen() callback: allocate (or find) the vnode described by
  * the fdesc_get_ino_args in arg, then drop the file reference the caller
  * took.  lkflags is not used.  Returns fdesc_allocvp()'s result.
  */
 static int
 fdesc_get_ino_alloc(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 	struct fdesc_get_ino_args *req = arg;
 	int rv;
 
 	rv = fdesc_allocvp(req->ftype, req->fd_fd, req->ix, mp, rvp);
 	fdrop(req->fp, req->td);
 	return (rv);
 }
 
 
 /*
  * Look up a name in an fdescfs directory.
  *
  * ap->a_dvp is the current namei directory, ap->a_cnp the name to locate
  * in that directory, and ap->a_vpp receives the resulting vnode (NULL on
  * error).
  *
  * "." yields the directory itself.  Any other name must be the canonical
  * decimal spelling of a file descriptor number (no leading zeros, and it
  * must fit in a u_int) that fget() accepts for the calling thread.
  *
  * Returns 0 on success, EROFS for DELETE/RENAME of the last component,
  * ENOTDIR when dvp is not the fdescfs root, ENOENT for a malformed name
  * or a doomed directory, or the error from fget()/vn_vget_ino_gen().
  */
 static int
 fdesc_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	char *pname = cnp->cn_nameptr;
 	struct thread *td = cnp->cn_thread;
 	struct file *fp;
 	struct fdesc_get_ino_args arg;
 	int nlen = cnp->cn_namelen;
 	u_int fd, digit;
 	int error;
 	struct vnode *fvp;
 
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EROFS;
 		goto bad;
 	}
 
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (VTOFDESC(dvp)->fd_type != Froot) {
 		error = ENOTDIR;
 		goto bad;
 	}
 
 	fd = 0;
 	/* the only time a leading 0 is acceptable is if it's "0" */
 	if (*pname == '0' && nlen != 1) {
 		error = ENOENT;
 		goto bad;
 	}
 	while (nlen--) {
 		if (*pname < '0' || *pname > '9') {
 			error = ENOENT;
 			goto bad;
 		}
 		digit = *pname++ - '0';
 		/*
 		 * Reject the name before 10 * fd + digit can wrap.  Checking
 		 * the result afterwards is not enough: a wrapped value may
 		 * well be larger than the old one.
 		 */
 		if (fd > (~0U - digit) / 10) {
 			error = ENOENT;
 			goto bad;
 		}
 		fd = 10 * fd + digit;
 	}
 
 	/*
 	 * No rights to check since 'fp' isn't actually used.
 	 */
 	if ((error = fget(td, fd, &cap_no_rights, &fp)) != 0)
 		goto bad;
 
 	/* Check if we're looking up ourselves. */
 	if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) {
 		/*
 		 * In case we're holding the last reference to the file, the dvp
 		 * will be re-acquired.
 		 */
 		vhold(dvp);
 		VOP_UNLOCK(dvp, 0);
 		fdrop(fp, td);
 
 		/* Re-acquire the lock afterwards. */
 		vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE);
 		vdrop(dvp);
 		fvp = dvp;
 		/* The directory may have been reclaimed while unlocked. */
 		if ((dvp->v_iflag & VI_DOOMED) != 0)
 			error = ENOENT;
 	} else {
 		/*
 		 * Unlock our root node (dvp) when doing this, since we might
 		 * deadlock since the vnode might be locked by another thread
 		 * and the root vnode lock will be obtained afterwards (in case
 		 * we're looking up the fd of the root vnode), which will be the
 		 * opposite lock order. Vhold the root vnode first so we don't
 		 * lose it.
 		 */
 		arg.ftype = Fdesc;
 		arg.fd_fd = fd;
 		arg.ix = FD_DESC + fd;
 		arg.fp = fp;
 		arg.td = td;
 		error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg,
 		    LK_EXCLUSIVE, &fvp);
 	}
 
 	if (error)
 		goto bad;
 	*vpp = fvp;
 	return (0);
 
 bad:
 	*vpp = NULL;
 	return (error);
 }
 
 static int
 fdesc_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (VTOFDESC(vp)->fd_type == Froot)
 		return (0);
 
 	/*
 	 * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the file
 	 * descriptor being sought for duplication. The error return ensures
 	 * that the vnode for this device will be released by vn_open. Open
 	 * will detect this special error and take the actions in dupfdopen.
 	 * Other callers of vn_open or VOP_OPEN will simply report the
 	 * error.
 	 */
 	ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd;	/* XXX */
 	return (ENODEV);
 }
 
 /*
  * Answer a pathconf query on an fdescfs vnode.
  *
  * _PC_NAME_MAX and _PC_LINK_MAX are answered locally.  Every other name
  * goes to vop_stdpathconf() for the root node, and to kern_fpathconf()
  * on the descriptor stored in the node otherwise.
  */
 static int
 fdesc_pathconf(struct vop_pathconf_args *ap)
 {
 	struct vnode *vp;
 	int error;
 
 	vp = ap->a_vp;
 
 	if (ap->a_name == _PC_NAME_MAX) {
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	}
 
 	if (ap->a_name == _PC_LINK_MAX) {
 		*ap->a_retval = (VTOFDESC(vp)->fd_type == Froot) ? 2 : 1;
 		return (0);
 	}
 
 	if (VTOFDESC(vp)->fd_type == Froot)
 		return (vop_stdpathconf(ap));
 
 	/*
 	 * Keep a reference on vp while it is unlocked for the query, then
 	 * take the lock back before returning.
 	 */
 	vref(vp);
 	VOP_UNLOCK(vp, 0);
 	error = kern_fpathconf(curthread, VTOFDESC(vp)->fd_fd,
 	    ap->a_name, ap->a_retval);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	vunref(vp);
 	return (error);
 }
 
 static int
 fdesc_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct timeval boottime;
 
 	getboottime(&boottime);
 	vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
 	vap->va_fileid = VTOFDESC(vp)->fd_ix;
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_atime.tv_sec = boottime.tv_sec;
 	vap->va_atime.tv_nsec = 0;
 	vap->va_mtime = vap->va_atime;
 	vap->va_ctime = vap->va_mtime;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_bytes = 0;
 	vap->va_filerev = 0;
 
 	switch (VTOFDESC(vp)->fd_type) {
 	case Froot:
 		vap->va_type = VDIR;
 		vap->va_nlink = 2;
 		vap->va_size = DEV_BSIZE;
 		vap->va_rdev = NODEV;
 		break;
 
 	case Fdesc:
 		vap->va_type = (vp->v_vflag & VV_READLINK) == 0 ? VCHR : VLNK;
 		vap->va_nlink = 1;
 		vap->va_size = 0;
 		vap->va_rdev = makedev(0, vap->va_fileid);
 		break;
 
 	default:
 		panic("fdesc_getattr");
 		break;
 	}
 
 	vp->v_type = vap->va_type;
 	return (0);
 }
 
 static int
 fdesc_setattr(struct vop_setattr_args *ap)
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	struct thread *td = curthread;
 	cap_rights_t rights;
 	unsigned fd;
 	int error;
 
 	/*
 	 * Can't mess with the root vnode
 	 */
 	if (VTOFDESC(ap->a_vp)->fd_type == Froot)
 		return (EACCES);
 
 	fd = VTOFDESC(ap->a_vp)->fd_fd;
 
 	/*
 	 * Allow setattr where there is an underlying vnode.
 	 */
 	error = getvnode(td, fd,
 	    cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
 	if (error) {
 		/*
 		 * getvnode() returns EINVAL if the file descriptor is not
 		 * backed by a vnode.  Silently drop all changes except
 		 * chflags(2) in this case.
 		 */
 		if (error == EINVAL) {
 			if (vap->va_flags != VNOVAL)
 				error = EOPNOTSUPP;
 			else
 				error = 0;
 		}
 		return (error);
 	}
 	vp = fp->f_vnode;
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) == 0) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred);
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
 	}
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Size of every directory record produced by fdesc_readdir(); directory
  * offsets are always a multiple of this value.
  */
 #define UIO_MX _GENERIC_DIRLEN(10) /* number of symbols in INT_MAX printout */
 
 /*
  * Read entries of the fdescfs root directory into ap->a_uio.
  *
  * Record index 0 is ".", index 1 is "..", and index i >= 2 stands for
  * descriptor number i - 2 of the process owning uio->uio_td.  Slots
  * without an open file produce no record.  Returns EINVAL when the
  * offset does not fit in a non-negative int, is not a multiple of
  * UIO_MX, or when the buffer cannot hold a single record; otherwise
  * returns 0 or the error from uiomove().  No cookies are produced.
  */
 static int
 fdesc_readdir(struct vop_readdir_args *ap)
 {
 	struct fdescmount *fmp;
 	struct uio *uio = ap->a_uio;
 	struct filedesc *fdp;
 	struct dirent d;
 	struct dirent *dp = &d;
 	int error, i, off, fcnt;
 
 	if (VTOFDESC(ap->a_vp)->fd_type != Froot)
 		panic("fdesc_readdir: not dir");
 
 	fmp = VFSTOFDESC(ap->a_vp->v_mount);
 	if (ap->a_ncookies != NULL)
 		*ap->a_ncookies = 0;
 
 	/* The first comparison rejects offsets that do not survive the cast. */
 	off = (int)uio->uio_offset;
 	if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 ||
 	    uio->uio_resid < UIO_MX)
 		return (EINVAL);
 	i = (u_int)off / UIO_MX;
 	fdp = uio->uio_td->td_proc->p_fd;
 	error = 0;
 
 	fcnt = i - 2;		/* The first two nodes are `.' and `..' */
 
 	FILEDESC_SLOCK(fdp);
 	while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) {
 		/* A zeroed record has d_namlen == 0, which means "skip". */
 		bzero((caddr_t)dp, UIO_MX);
 		switch (i) {
 		case 0:	/* `.' */
 		case 1: /* `..' */
 			dp->d_fileno = i + FD_ROOT;
 			dp->d_namlen = i + 1;
 			dp->d_reclen = UIO_MX;
 			/* Copies one dot for index 0 and two for index 1. */
 			bcopy("..", dp->d_name, dp->d_namlen);
 			dp->d_name[i + 1] = '\0';
 			dp->d_type = DT_DIR;
 			break;
 		default:
 			/* Unused descriptor slot: leave the record empty. */
 			if (fdp->fd_ofiles[fcnt].fde_file == NULL)
 				break;
 			dp->d_namlen = sprintf(dp->d_name, "%d", fcnt);
 			dp->d_reclen = UIO_MX;
 			dp->d_type = (fmp->flags & FMNT_LINRDLNKF) == 0 ?
 			    DT_CHR : DT_LNK;
 			/*
 			 * NOTE(review): i equals fcnt + 2 here, whereas
 			 * fdesc_lookup() builds its index as FD_DESC + fd;
 			 * confirm that the difference is intended.
 			 */
 			dp->d_fileno = i + FD_DESC;
 			break;
 		}
+		/* NOTE: d_off is the offset of the *next* entry. */
+		dp->d_off = UIO_MX * (i + 1);
 		if (dp->d_namlen != 0) {
 			/*
 			 * And ship to userland
 			 */
 			/* The descriptor table lock is dropped around the copy. */
 			FILEDESC_SUNLOCK(fdp);
 			error = uiomove(dp, UIO_MX, uio);
 			if (error)
 				goto done;
 			FILEDESC_SLOCK(fdp);
 		}
 		i++;
 		fcnt++;
 	}
 	FILEDESC_SUNLOCK(fdp);
 
 done:
 	/* Record where the next call should resume. */
 	uio->uio_offset = i * UIO_MX;
 	return (error);
 }
 
 /*
  * Reclaim an fdescfs vnode: hand its node to fdesc_remove_entry(), then
  * free the private data and clear v_data.  Always returns 0.
  */
 static int
 fdesc_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *node_vp = ap->a_vp;
 
 	fdesc_remove_entry(VTOFDESC(node_vp));
 	free(node_vp->v_data, M_TEMP);
 	node_vp->v_data = NULL;
 	return (0);
 }
 
 /*
  * Read the "link target" of an fdescfs descriptor node into va->a_uio.
  *
  * For a descriptor of type DTYPE_VNODE the target is the path returned
  * by vn_fullpath(); for every other type it is the fixed string
  * "anon_inode:[unknown]".  The path is copied without a terminating NUL.
  * The node is unlocked for the duration of the call and relocked with
  * the lock state it had on entry.  Panics when the node is not of type
  * Fdesc.
  */
 static int
 fdesc_readlink(struct vop_readlink_args *va)
 {
 	struct vnode *vp, *vn;
 	struct thread *td;
 	struct uio *uio;
 	struct file *fp;
 	char *freepath, *fullpath;
 	size_t pathlen;
 	int lockflags, fd_fd;
 	int error;
 
 	freepath = NULL;
 	vn = va->a_vp;
 	if (VTOFDESC(vn)->fd_type != Fdesc)
 		panic("fdesc_readlink: not fdescfs link");
 	fd_fd = ((struct fdescnode *)vn->v_data)->fd_fd;
 	/* Remember the current lock state so it can be restored at "out". */
 	lockflags = VOP_ISLOCKED(vn);
 	VOP_UNLOCK(vn, 0);
 
 	td = curthread;
 	error = fget_cap(td, fd_fd, &cap_no_rights, &fp, NULL);
 	if (error != 0)
 		goto out;
 
 	switch (fp->f_type) {
 	case DTYPE_VNODE:
 		vp = fp->f_vnode;
 		error = vn_fullpath(td, vp, &fullpath, &freepath);
 		break;
 	default:
 		/* error is still 0 from the successful fget_cap() above. */
 		fullpath = "anon_inode:[unknown]";
 		break;
 	}
 	if (error == 0) {
 		uio = va->a_uio;
 		pathlen = strlen(fullpath);
 		error = uiomove(fullpath, pathlen, uio);
 	}
 	/* freepath stays NULL unless vn_fullpath() filled it in. */
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 	fdrop(fp, td);
 
 out:
 	vn_lock(vn, lockflags | LK_RETRY);
 	return (error);
 }
Index: stable/12/sys/fs/msdosfs/msdosfs_vnops.c
===================================================================
--- stable/12/sys/fs/msdosfs/msdosfs_vnops.c	(revision 340720)
+++ stable/12/sys/fs/msdosfs/msdosfs_vnops.c	(revision 340721)
@@ -1,1945 +1,1949 @@
 /* $FreeBSD$ */
 /*	$NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
  * Copyright (C) 1994, 1995, 1997 TooLs GmbH.
  * All rights reserved.
  * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Written by Paul Popelka (paulp@uts.amdahl.com)
  *
  * You can do anything you want with this software, just don't say you wrote
  * it, and don't remove this notice.
  *
  * This software is provided "as is".
  *
  * The author supplies this software to be publicly redistributed on the
  * understanding that the author is not responsible for the correct
  * functioning of this software in any circumstances and is not liable for
  * any damages caused by this software.
  *
  * October 1992
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/clock.h>
 #include <sys/dirent.h>
 #include <sys/lock.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vnode_pager.h>
 
 #include <fs/msdosfs/bpb.h>
 #include <fs/msdosfs/direntry.h>
 #include <fs/msdosfs/denode.h>
 #include <fs/msdosfs/fat.h>
 #include <fs/msdosfs/msdosfsmount.h>
 
 /*
  * Prototypes for MSDOSFS vnode operations.
  *
  * Every function is file-local and is declared through the matching
  * vop_*_t function type, which fixes its signature.
  */
 static vop_create_t	msdosfs_create;
 static vop_mknod_t	msdosfs_mknod;
 static vop_open_t	msdosfs_open;
 static vop_close_t	msdosfs_close;
 static vop_access_t	msdosfs_access;
 static vop_getattr_t	msdosfs_getattr;
 static vop_setattr_t	msdosfs_setattr;
 static vop_read_t	msdosfs_read;
 static vop_write_t	msdosfs_write;
 static vop_fsync_t	msdosfs_fsync;
 static vop_remove_t	msdosfs_remove;
 static vop_link_t	msdosfs_link;
 static vop_rename_t	msdosfs_rename;
 static vop_mkdir_t	msdosfs_mkdir;
 static vop_rmdir_t	msdosfs_rmdir;
 static vop_symlink_t	msdosfs_symlink;
 static vop_readdir_t	msdosfs_readdir;
 static vop_bmap_t	msdosfs_bmap;
 static vop_getpages_t	msdosfs_getpages;
 static vop_strategy_t	msdosfs_strategy;
 static vop_print_t	msdosfs_print;
 static vop_pathconf_t	msdosfs_pathconf;
 static vop_vptofh_t	msdosfs_vptofh;
 
 /*
  * Some general notes:
  *
  * In the ufs filesystem the inodes, superblocks, and indirect blocks are
  * read/written using the vnode for the filesystem. Blocks that represent
  * the contents of a file are read/written using the vnode for the file
  * (including directories when they are read/written as files). This
  * presents problems for the dos filesystem because data that should be in
  * an inode (if dos had them) resides in the directory itself.  Since we
  * must update directory entries without the benefit of having the vnode
  * for the directory we must use the vnode for the filesystem.  This means
  * that when a directory is actually read/written (via read, write, or
  * readdir, or seek) we must use the vnode for the filesystem instead of
  * the vnode for the directory as would happen in ufs. This is to ensure we
  * retrieve the correct block from the buffer cache since the hash value is
  * based upon the vnode address and the desired block number.
  */
 
 /*
  * Create a regular file. On entry the directory to contain the file being
  * created is locked.  We must release before we return. We must also free
  * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or
  * only if the SAVESTART bit in cn_flags is clear on success.
  */
 static int
 msdosfs_create(struct vop_create_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct denode ndirent;
 	struct denode *dep;
 	struct denode *pdep = VTODE(ap->a_dvp);
 	struct timespec ts;
 	int error;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_create(cnp %p, vap %p\n", cnp, ap->a_vap);
 #endif
 
 	/*
 	 * If this is the root directory and there is no space left we
 	 * can't do anything.  This is because the root directory can not
 	 * change size.
 	 */
 	if (pdep->de_StartCluster == MSDOSFSROOT
 	    && pdep->de_fndoffset >= pdep->de_FileSize) {
 		error = ENOSPC;
 		goto bad;
 	}
 
 	/*
 	 * Create a directory entry for the file, then call createde() to
 	 * have it installed. NOTE: DOS files are always executable.  We
 	 * use the absence of the owner write bit to make the file
 	 * readonly.
 	 */
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("msdosfs_create: no name");
 #endif
 	memset(&ndirent, 0, sizeof(ndirent));
 	error = uniqdosname(pdep, cnp, ndirent.de_Name);
 	if (error)
 		goto bad;
 
 	ndirent.de_Attributes = ATTR_ARCHIVE;
 	ndirent.de_LowerCase = 0;
 	ndirent.de_StartCluster = 0;
 	ndirent.de_FileSize = 0;
 	ndirent.de_pmp = pdep->de_pmp;
 	ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE;
 	vfs_timestamp(&ts);
 	DETIMES(&ndirent, &ts, &ts, &ts);
 	error = createde(&ndirent, pdep, &dep, cnp);
 	if (error)
 		goto bad;
 	*ap->a_vpp = DETOV(dep);
 	if ((cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
 	return (0);
 
 bad:
 	return (error);
 }
 
 /*
  * mknod is not supported: every request is rejected with EINVAL.
  */
 static int
 msdosfs_mknod(struct vop_mknod_args *ap)
 {
 
 	return (EINVAL);
 }
 
 static int
 msdosfs_open(struct vop_open_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	vnode_create_vobject(ap->a_vp, dep->de_FileSize, ap->a_td);
 	return 0;
 }
 
 static int
 msdosfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
 	struct timespec ts;
 
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1) {
 		vfs_timestamp(&ts);
 		DETIMES(dep, &ts, &ts, &ts);
 	}
 	VI_UNLOCK(vp);
 	return 0;
 }
 
 static int
 msdosfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	mode_t file_mode;
 	accmode_t accmode = ap->a_accmode;
 
 	file_mode = S_IRWXU|S_IRWXG|S_IRWXO;
 	file_mode &= (vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask);
 
 	/*
 	 * Disallow writing to directories and regular files if the
 	 * filesystem is read-only.
 	 */
 	if (accmode & VWRITE) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 	}
 
 	return (vaccess(vp->v_type, file_mode, pmp->pm_uid, pmp->pm_gid,
 	    ap->a_accmode, ap->a_cred, NULL));
 }
 
 /*
  * Fill in *ap->a_vap from the denode behind ap->a_vp.
  *
  * Ownership comes from the mount (pm_uid/pm_gid) and the link count is
  * always 1.  The mode starts as rwx for everyone, loses the write bits
  * when ATTR_READONLY is set, and is then masked with pm_dirmask or
  * pm_mask depending on the vnode type.  Always returns 0.
  */
 static int
 msdosfs_getattr(struct vop_getattr_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct vattr *vap = ap->a_vap;
 	mode_t mode;
 	struct timespec ts;
 	u_long dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry);
 	uint64_t fileid;
 
 	vfs_timestamp(&ts);
 	DETIMES(dep, &ts, &ts, &ts);
 	vap->va_fsid = dev2udev(pmp->pm_dev);
 	/*
 	 * The following computation of the fileid must be the same as that
 	 * used in msdosfs_readdir() to compute d_fileno. If not, pwd
 	 * doesn't work.
 	 */
 	if (dep->de_Attributes & ATTR_DIRECTORY) {
 		/* Directories: derived from the start cluster; root is 1. */
 		fileid = (uint64_t)cntobn(pmp, dep->de_StartCluster) *
 		    dirsperblk;
 		if (dep->de_StartCluster == MSDOSFSROOT)
 			fileid = 1;
 	} else {
 		/* Files: derived from where the directory entry is stored. */
 		fileid = (uint64_t)cntobn(pmp, dep->de_dirclust) *
 		    dirsperblk;
 		if (dep->de_dirclust == MSDOSFSROOT)
 			fileid = (uint64_t)roottobn(pmp, 0) * dirsperblk;
 		fileid += (uoff_t)dep->de_diroffset / sizeof(struct direntry);
 	}
 	vap->va_fileid = fileid;
 
 	mode = S_IRWXU|S_IRWXG|S_IRWXO;
 	if (dep->de_Attributes & ATTR_READONLY)
 		mode &= ~(S_IWUSR|S_IWGRP|S_IWOTH);
 	vap->va_mode = mode &
 	    (ap->a_vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask);
 	vap->va_uid = pmp->pm_uid;
 	vap->va_gid = pmp->pm_gid;
 	vap->va_nlink = 1;
 	vap->va_rdev = NODEV;
 	vap->va_size = dep->de_FileSize;
 	/* The change time is reported as the modification time. */
 	fattime2timespec(dep->de_MDate, dep->de_MTime, 0, 0, &vap->va_mtime);
 	vap->va_ctime = vap->va_mtime;
 	if (pmp->pm_flags & MSDOSFSMNT_LONGNAME) {
 		/* Access date and creation time are taken from the entry. */
 		fattime2timespec(dep->de_ADate, 0, 0, 0, &vap->va_atime);
 		fattime2timespec(dep->de_CDate, dep->de_CTime, dep->de_CHun,
 		    0, &vap->va_birthtime);
 	} else {
 		/* Otherwise atime mirrors mtime and birthtime is set to -1. */
 		vap->va_atime = vap->va_mtime;
 		vap->va_birthtime.tv_sec = -1;
 		vap->va_birthtime.tv_nsec = 0;
 	}
 	/* Translate DOS attribute bits into the matching UF_* file flags. */
 	vap->va_flags = 0;
 	if (dep->de_Attributes & ATTR_ARCHIVE)
 		vap->va_flags |= UF_ARCHIVE;
 	if (dep->de_Attributes & ATTR_HIDDEN)
 		vap->va_flags |= UF_HIDDEN;
 	if (dep->de_Attributes & ATTR_READONLY)
 		vap->va_flags |= UF_READONLY;
 	if (dep->de_Attributes & ATTR_SYSTEM)
 		vap->va_flags |= UF_SYSTEM;
 	vap->va_gen = 0;
 	vap->va_blocksize = pmp->pm_bpcluster;
 	/* File size rounded up to a multiple of (pm_crbomask + 1). */
 	vap->va_bytes =
 	    (dep->de_FileSize + pmp->pm_crbomask) & ~pmp->pm_crbomask;
 	vap->va_type = ap->a_vp->v_type;
 	vap->va_filerev = dep->de_modrev;
 	return (0);
 }
 
 /*
  * Apply the attribute changes in ap->a_vap to the denode behind
  * ap->a_vp and write the result back with deupdat().
  *
  * The groups are handled in this order: flags, owner/group, size,
  * access/modification times, mode.  Each group is skipped when its
  * fields hold VNOVAL.  A request to change an unsettable attribute, or
  * any request on the root directory, fails with EINVAL; every group
  * fails with EROFS on a read-only mount.  An early error return leaves
  * changes made by earlier groups in the in-core denode.
  */
 static int
 msdosfs_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	int error = 0;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_setattr(): vp %p, vap %p, cred %p\n",
 	    ap->a_vp, vap, cred);
 #endif
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 #ifdef MSDOSFS_DEBUG
 		printf("msdosfs_setattr(): returning EINVAL\n");
 		printf("    va_type %d, va_nlink %llx, va_fsid %llx, va_fileid %llx\n",
 		    vap->va_type, (unsigned long long)vap->va_nlink,
 		    (unsigned long long)vap->va_fsid,
 		    (unsigned long long)vap->va_fileid);
 		printf("    va_blocksize %lx, va_rdev %llx, va_bytes %llx, va_gen %lx\n",
 		    vap->va_blocksize, (unsigned long long)vap->va_rdev,
 		    (unsigned long long)vap->va_bytes, vap->va_gen);
 		printf("    va_uid %x, va_gid %x\n",
 		    vap->va_uid, vap->va_gid);
 #endif
 		return (EINVAL);
 	}
 
 	/*
 	 * We don't allow setting attributes on the root directory.
 	 * The special case for the root directory is because before
 	 * FAT32, the root directory didn't have an entry for itself
 	 * (and was otherwise special).  With FAT32, the root
 	 * directory is not so special, but still doesn't have an
 	 * entry for itself.
 	 */
 	if (vp->v_vflag & VV_ROOT)
 		return (EINVAL);
 
 	/* File flags: only the four DOS-backed UF_* flags are accepted. */
 	if (vap->va_flags != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if (cred->cr_uid != pmp->pm_uid) {
 			error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0);
 			if (error)
 				return (error);
 		}
 		/*
 		 * We are very inconsistent about handling unsupported
 		 * attributes.  We ignored the access time and the
 		 * read and execute bits.  We were strict for the other
 		 * attributes.
 		 */
 		if (vap->va_flags & ~(UF_ARCHIVE | UF_HIDDEN | UF_READONLY |
 		    UF_SYSTEM))
 			return EOPNOTSUPP;
 		if (vap->va_flags & UF_ARCHIVE)
 			dep->de_Attributes |= ATTR_ARCHIVE;
 		else
 			dep->de_Attributes &= ~ATTR_ARCHIVE;
 		if (vap->va_flags & UF_HIDDEN)
 			dep->de_Attributes |= ATTR_HIDDEN;
 		else
 			dep->de_Attributes &= ~ATTR_HIDDEN;
 		/* We don't allow changing the readonly bit on directories. */
 		if (vp->v_type != VDIR) {
 			if (vap->va_flags & UF_READONLY)
 				dep->de_Attributes |= ATTR_READONLY;
 			else
 				dep->de_Attributes &= ~ATTR_READONLY;
 		}
 		if (vap->va_flags & UF_SYSTEM)
 			dep->de_Attributes |= ATTR_SYSTEM;
 		else
 			dep->de_Attributes &= ~ATTR_SYSTEM;
 		dep->de_flag |= DE_MODIFIED;
 	}
 
 	/*
 	 * Owner and group: nothing is stored.  The request only succeeds
 	 * when the resulting uid and gid equal the mount's pm_uid and
 	 * pm_gid.
 	 */
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		uid_t uid;
 		gid_t gid;
 
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/* An id left at VNOVAL defaults to the mount's value. */
 		uid = vap->va_uid;
 		if (uid == (uid_t)VNOVAL)
 			uid = pmp->pm_uid;
 		gid = vap->va_gid;
 		if (gid == (gid_t)VNOVAL)
 			gid = pmp->pm_gid;
 		if (cred->cr_uid != pmp->pm_uid || uid != pmp->pm_uid ||
 		    (gid != pmp->pm_gid && !groupmember(gid, cred))) {
 			error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
 			if (error)
 				return (error);
 		}
 		if (uid != pmp->pm_uid || gid != pmp->pm_gid)
 			return EINVAL;
 	}
 
 	/* Size: truncate or extend through detrunc(). */
 	if (vap->va_size != VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VREG:
 			/*
 			 * Truncation is only supported for regular files,
 			 * Disallow it if the filesystem is read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			/*
 			 * According to POSIX, the result is unspecified
 			 * for file types other than regular files,
 			 * directories and shared memory objects.  We
 			 * don't support any file types except regular
 			 * files and directories in this file system, so
 			 * this (default) case is unreachable and can do
 			 * anything.  Keep falling through to detrunc()
 			 * for now.
 			 */
 			break;
 		}
 		error = detrunc(dep, vap->va_size, 0, cred);
 		if (error)
 			return error;
 	}
 	/* Access and modification times. */
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		error = vn_utimes_perm(vp, vap, cred, td);
 		if (error != 0)
 			return (error);
 		/* The access date is only stored when NOWIN95 is not set. */
 		if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) == 0 &&
 		    vap->va_atime.tv_sec != VNOVAL) {
 			dep->de_flag &= ~DE_ACCESS;
 			timespec2fattime(&vap->va_atime, 0,
 			    &dep->de_ADate, NULL, NULL);
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			dep->de_flag &= ~DE_UPDATE;
 			timespec2fattime(&vap->va_mtime, 0,
 			    &dep->de_MDate, &dep->de_MTime, NULL);
 		}
 		/*
 		 * We don't set the archive bit when modifying the time of
 		 * a directory to emulate the Windows/DOS behavior.
 		 */
 		if (vp->v_type != VDIR)
 			dep->de_Attributes |= ATTR_ARCHIVE;
 		dep->de_flag |= DE_MODIFIED;
 	}
 	/*
 	 * DOS files only have the ability to have their writability
 	 * attribute set, so we use the owner write bit to set the readonly
 	 * attribute.
 	 */
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if (cred->cr_uid != pmp->pm_uid) {
 			error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0);
 			if (error)
 				return (error);
 		}
 		/* A mode change on a directory is accepted but ignored. */
 		if (vp->v_type != VDIR) {
 			/* We ignore the read and execute bits. */
 			if (vap->va_mode & S_IWUSR)
 				dep->de_Attributes &= ~ATTR_READONLY;
 			else
 				dep->de_Attributes |= ATTR_READONLY;
 			dep->de_Attributes |= ATTR_ARCHIVE;
 			dep->de_flag |= DE_MODIFIED;
 		}
 	}
 	return (deupdat(dep, 0));
 }
 
 /*
  * Read data from a file or directory into ap->a_uio.
  *
  * Each pass of the loop reads at most one pm_bpcluster-sized block and
  * copies the requested part to the caller.  Directories are read
  * through the mount's device vnode (pm_devvp) after mapping the block
  * with pcbmap(); regular files are read through the file's own vnode,
  * with read-ahead chosen from the sequential-access hint in a_ioflag.
  * Reading stops at end of file, when the request is satisfied, or on
  * the first error.  Returns 0 or that error.
  */
 static int
 msdosfs_read(struct vop_read_args *ap)
 {
 	int error = 0;
 	int blsize;
 	int isadir;
 	ssize_t orig_resid;
 	u_int n;
 	u_long diff;
 	u_long on;
 	daddr_t lbn;
 	daddr_t rablock;
 	int rasize;
 	int seqcount;
 	struct buf *bp;
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct uio *uio = ap->a_uio;
 
 	/*
 	 * If they didn't ask for any data, then we are done.
 	 */
 	orig_resid = uio->uio_resid;
 	if (orig_resid == 0)
 		return (0);
 
 	/*
 	 * The caller is supposed to ensure that
 	 * uio->uio_offset >= 0 and uio->uio_resid >= 0.
 	 * We don't need to check for large offsets as in ffs because
 	 * dep->de_FileSize <= MSDOSFS_FILESIZE_MAX < OFF_MAX, so large
 	 * offsets cannot cause overflow even in theory.
 	 */
 
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 
 	isadir = dep->de_Attributes & ATTR_DIRECTORY;
 	do {
 		if (uio->uio_offset >= dep->de_FileSize)
 			break;
 		/* lbn: block holding the offset; on: byte offset within it. */
 		lbn = de_cluster(pmp, uio->uio_offset);
 		rablock = lbn + 1;
 		blsize = pmp->pm_bpcluster;
 		on = uio->uio_offset & pmp->pm_crbomask;
 		/*
 		 * If we are operating on a directory file then be sure to
 		 * do i/o with the vnode for the filesystem instead of the
 		 * vnode for the directory.
 		 */
 		if (isadir) {
 			/* convert cluster # to block # */
 			error = pcbmap(dep, lbn, &lbn, 0, &blsize);
 			/* E2BIG from pcbmap() is reported to the caller as EINVAL. */
 			if (error == E2BIG) {
 				error = EINVAL;
 				break;
 			} else if (error)
 				break;
 			error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp);
 		} else if (de_cn2off(pmp, rablock) >= dep->de_FileSize) {
 			/* The next block starts at or past EOF: no read-ahead. */
 			error = bread(vp, lbn, blsize, NOCRED, &bp);
 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			error = cluster_read(vp, dep->de_FileSize, lbn, blsize,
 			    NOCRED, on + uio->uio_resid, seqcount, 0, &bp);
 		} else if (seqcount > 1) {
 			/* Clustering is disabled: read ahead a single block. */
 			rasize = blsize;
 			error = breadn(vp, lbn,
 			    blsize, &rablock, &rasize, 1, NOCRED, &bp);
 		} else {
 			error = bread(vp, lbn, blsize, NOCRED, &bp);
 		}
 		if (error) {
 			brelse(bp);
 			break;
 		}
 		/*
 		 * n is the smallest of: bytes left in this block, bytes
 		 * still requested, bytes left before EOF, and bytes the
 		 * buffer actually holds.
 		 */
 		diff = pmp->pm_bpcluster - on;
 		n = diff > uio->uio_resid ? uio->uio_resid : diff;
 		diff = dep->de_FileSize - uio->uio_offset;
 		if (diff < n)
 			n = diff;
 		diff = blsize - bp->b_resid;
 		if (diff < n)
 			n = diff;
 		error = vn_io_fault_uiomove(bp->b_data + on, (int) n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	/*
 	 * Flag the access time for update when a regular file was read
 	 * without error or with some data transferred, unless the mount
 	 * is noatime or read-only.
 	 */
 	if (!isadir && (error == 0 || uio->uio_resid != orig_resid) &&
 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
 		dep->de_flag |= DE_ACCESS;
 	return (error);
 }
 
 /*
  * Write data from ap->a_uio to a regular file.
  *
  * Directories are rejected with EISDIR and any other vnode type panics.
  * With IO_APPEND the write starts at the current end of file.  A write
  * that would pass MSDOSFS_FILESIZE_MAX or the caller's file size limit
  * fails with EFBIG.  A start offset past the end of file first extends
  * the file with deextend().  The file is then extended to its final
  * size in one step and written one pm_bpcluster-sized block at a time.
  *
  * On error with IO_UNIT the file is truncated back to its original size
  * and the uio is rewound; without IO_UNIT a partial write is reported
  * as success.
  */
 static int
 msdosfs_write(struct vop_write_args *ap)
 {
 	int n;
 	int croffset;
 	ssize_t resid;
 	u_long osize;
 	int error = 0;
 	u_long count;
 	int seqcount;
 	daddr_t bn, lastcn;
 	struct buf *bp;
 	int ioflag = ap->a_ioflag;
 	struct uio *uio = ap->a_uio;
 	struct vnode *vp = ap->a_vp;
 	struct vnode *thisvp;
 	struct denode *dep = VTODE(vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct ucred *cred = ap->a_cred;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_write(vp %p, uio %p, ioflag %x, cred %p\n",
 	    vp, uio, ioflag, cred);
 	printf("msdosfs_write(): diroff %lu, dirclust %lu, startcluster %lu\n",
 	    dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster);
 #endif
 
 	switch (vp->v_type) {
 	case VREG:
 		if (ioflag & IO_APPEND)
 			uio->uio_offset = dep->de_FileSize;
 		thisvp = vp;
 		break;
 	case VDIR:
 		return EISDIR;
 	default:
 		panic("msdosfs_write(): bad file type");
 	}
 
 	/*
 	 * This is needed (unlike in ffs_write()) because we extend the
 	 * file outside of the loop but we don't want to extend the file
 	 * for writes of 0 bytes.
 	 */
 	if (uio->uio_resid == 0)
 		return (0);
 
 	/*
 	 * The caller is supposed to ensure that
 	 * uio->uio_offset >= 0 and uio->uio_resid >= 0.
 	 */
 	if ((uoff_t)uio->uio_offset + uio->uio_resid > MSDOSFS_FILESIZE_MAX)
 		return (EFBIG);
 
 	/*
 	 * If they've exceeded their filesize limit, tell them about it.
 	 */
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
 		return (EFBIG);
 
 	/*
 	 * If the offset we are starting the write at is beyond the end of
 	 * the file, then they've done a seek.  Unix filesystems allow
 	 * files with holes in them, DOS doesn't so we must fill the hole
 	 * with zeroed blocks.
 	 */
 	if (uio->uio_offset > dep->de_FileSize) {
 		error = deextend(dep, uio->uio_offset, cred);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Remember some values in case the write fails.
 	 */
 	resid = uio->uio_resid;
 	osize = dep->de_FileSize;
 
 	/*
 	 * If we write beyond the end of the file, extend it to its ultimate
 	 * size ahead of the time to hopefully get a contiguous area.
 	 */
 	if (uio->uio_offset + resid > osize) {
 		count = de_clcount(pmp, uio->uio_offset + resid) -
 			de_clcount(pmp, osize);
 		error = extendfile(dep, count, NULL, NULL, 0);
 		/*
 		 * ENOSPC without IO_UNIT is tolerated here: the loop below
 		 * then writes as far as lastcn allows.
 		 */
 		if (error &&  (error != ENOSPC || (ioflag & IO_UNIT)))
 			goto errexit;
 		lastcn = dep->de_fc[FC_LASTFC].fc_frcn;
 	} else
 		lastcn = de_clcount(pmp, osize) - 1;
 
 	seqcount = ioflag >> IO_SEQSHIFT;
 	do {
 		/* Ran past the last block the file owns. */
 		if (de_cluster(pmp, uio->uio_offset) > lastcn) {
 			error = ENOSPC;
 			break;
 		}
 
 		/* croffset: offset within the block; n: bytes for this pass. */
 		croffset = uio->uio_offset & pmp->pm_crbomask;
 		n = min(uio->uio_resid, pmp->pm_bpcluster - croffset);
 		if (uio->uio_offset + n > dep->de_FileSize) {
 			dep->de_FileSize = uio->uio_offset + n;
 			/* The object size needs to be set before buffer is allocated */
 			vnode_pager_setsize(vp, dep->de_FileSize);
 		}
 
 		bn = de_cluster(pmp, uio->uio_offset);
 		if ((uio->uio_offset & pmp->pm_crbomask) == 0
 		    && (de_cluster(pmp, uio->uio_offset + uio->uio_resid)
 			> de_cluster(pmp, uio->uio_offset)
 			|| uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) {
 			/*
 			 * If either the whole cluster gets written,
 			 * or we write the cluster from its start beyond EOF,
 			 * then no need to read data from disk.
 			 */
 			bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0, 0);
 			/*
 			 * This call to vfs_bio_clrbuf() ensures that
 			 * even if vn_io_fault_uiomove() below faults,
 			 * garbage from the newly instantiated buffer
 			 * is not exposed to the userspace via mmap().
 			 */
 			vfs_bio_clrbuf(bp);
 			/*
 			 * Do the bmap now, since pcbmap needs buffers
 			 * for the FAT table. (see msdosfs_strategy)
 			 */
 			if (bp->b_blkno == bp->b_lblkno) {
 				error = pcbmap(dep, bp->b_lblkno, &bn, 0, 0);
 				if (error)
 					bp->b_blkno = -1;
 				else
 					bp->b_blkno = bn;
 			}
 			if (bp->b_blkno == -1) {
 				brelse(bp);
 				if (!error)
 					error = EIO;		/* XXX */
 				break;
 			}
 		} else {
 			/*
 			 * The block we need to write into exists, so read it in.
 			 */
 			error = bread(thisvp, bn, pmp->pm_bpcluster, cred, &bp);
 			if (error) {
 				brelse(bp);
 				break;
 			}
 		}
 
 		/*
 		 * Should these vnode_pager_* functions be done on dir
 		 * files?
 		 */
 
 		/*
 		 * Copy the data from user space into the buf header.
 		 */
 		error = vn_io_fault_uiomove(bp->b_data + croffset, n, uio);
 		if (error) {
 			brelse(bp);
 			break;
 		}
 
 		/* Prepare for clustered writes in some else clauses. */
 		if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0)
 			bp->b_flags |= B_CLUSTEROK;
 
 		/*
 		 * If IO_SYNC, then each buffer is written synchronously.
 		 * Otherwise, if we have a severe page deficiency then
 		 * write the buffer asynchronously.  Otherwise, if on a
 		 * cluster boundary then write the buffer asynchronously,
 		 * combining it with contiguous clusters if permitted and
 		 * possible, since we don't expect more writes into this
 		 * buffer soon.  Otherwise, do a delayed write because we
 		 * expect more writes into this buffer soon.
 		 */
 		if (ioflag & IO_SYNC)
 			(void)bwrite(bp);
 		else if (vm_page_count_severe() || buf_dirty_count_severe())
 			bawrite(bp);
 		else if (n + croffset == pmp->pm_bpcluster) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0)
 				cluster_write(vp, bp, dep->de_FileSize,
 				    seqcount, 0);
 			else
 				bawrite(bp);
 		} else
 			bdwrite(bp);
 		dep->de_flag |= DE_UPDATE;
 	} while (error == 0 && uio->uio_resid > 0);
 
 	/*
 	 * If the write failed and they want us to, truncate the file back
 	 * to the size it was before the write was attempted.
 	 */
 errexit:
 	if (error) {
 		if (ioflag & IO_UNIT) {
 			/* All-or-nothing: undo the size change and rewind the uio. */
 			detrunc(dep, osize, ioflag & IO_SYNC, NOCRED);
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		} else {
 			/* Keep what was written; a partial write is a success. */
 			detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED);
 			if (uio->uio_resid != resid)
 				error = 0;
 		}
 	} else if (ioflag & IO_SYNC)
 		error = deupdat(dep, 1);
 	return (error);
 }
 
 /*
  * VOP_FSYNC for msdosfs: flush the file's buffers, optionally flush the
  * underlying device vnode, then write back the file's directory entry.
  *
  * Returns the error from syncing the device vnode if there was one,
  * otherwise the result of deupdat().
  */
 static int
 msdosfs_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *devvp;
 	int deverr, upderr, waiting;
 
 	vop_stdfsync(ap);
 
 	waiting = (ap->a_waitfor == MNT_WAIT);
 
 	/*
 	 * A synchronous request also syncs everything on devvp.  This is
 	 * mainly for the FAT, which is written sloppily; syncing all of it
 	 * is the best that can easily be done to get the directory entries
 	 * associated with the file fully on disk.  Other metadata on devvp
 	 * is only critical in the MNT_ASYNC case, and non-critical metadata
 	 * of associated directory entries gets synced merely by accident,
 	 * as in most file systems.
 	 */
 	deverr = 0;
 	if (waiting) {
 		devvp = VTODE(ap->a_vp)->de_pmp->pm_devvp;
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 		deverr = VOP_FSYNC(devvp, MNT_WAIT, ap->a_td);
 		VOP_UNLOCK(devvp, 0);
 	}
 
 	/* Finally sync the metadata held in the file's own directory entry. */
 	upderr = deupdat(VTODE(ap->a_vp), waiting);
 	return (deverr != 0 ? deverr : upderr);
 }
 
 static int
 msdosfs_remove(struct vop_remove_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	struct denode *ddep = VTODE(ap->a_dvp);
 	int error;
 
 	if (ap->a_vp->v_type == VDIR)
 		error = EPERM;
 	else
 		error = removede(ddep, dep);
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_remove(), dep %p, v_usecount %d\n", dep, ap->a_vp->v_usecount);
 #endif
 	return (error);
 }
 
 /*
  * VOP_LINK for msdosfs.  DOS filesystems have no notion of links, so the
  * request is always rejected with EOPNOTSUPP.
  */
 static int
 msdosfs_link(struct vop_link_args *ap)
 {
 
 	(void)ap;	/* the arguments play no part in the answer */
 	return (EOPNOTSUPP);
 }
 
 /*
  * Renames on files require moving the denode to a new hash queue since the
  * denode's location is used to compute which hash queue to put the file
  * in. Unless it is a rename in place.  For example "mv a b".
  *
  * What follows is the basic algorithm:
  *
  * if (file move) {
  *	if (dest file exists) {
  *		remove dest file
  *	}
  *	if (dest and src in same directory) {
  *		rewrite name in existing directory slot
  *	} else {
  *		write new entry in dest directory
  *		update offset and dirclust in denode
  *		move denode to new hash chain
  *		clear old directory entry
  *	}
  * } else {
  *	directory move
  *	if (dest directory exists) {
  *		if (dest is not empty) {
  *			return ENOTEMPTY
  *		}
  *		remove dest directory
  *	}
  *	if (dest and src in same directory) {
  *		rewrite name in existing entry
  *	} else {
  *		be sure dest is not a child of src directory
  *		write entry in dest directory
  *		update "." and ".." in moved directory
  *		clear old directory entry for moved directory
  *	}
  * }
  *
  * On entry:
  *	source's parent directory is unlocked
  *	source file or directory is unlocked
  *	destination's parent directory is locked
  *	destination file or directory is locked if it exists
  *
  * On exit:
  *	all denodes should be released
  */
 static int
 msdosfs_rename(struct vop_rename_args *ap)
 {
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	/*
 	 * ip: denode of the source (fvp).
 	 * dp: first the source parent, later the destination parent.
 	 * xp: denode of an existing target; reused further down for the
 	 *     source denode found by the second relookup.
 	 * zp: denode of the source parent after that relookup.
 	 */
 	struct denode *ip, *xp, *dp, *zp;
 	u_char toname[12], oldname[11];
 	u_long from_diroffset, to_diroffset;
 	u_char to_count;
 	int doingdirectory = 0, newparent = 0;
 	int error;
 	u_long cn, pcl;
 	daddr_t bn;
 	struct msdosfsmount *pmp;
 	struct direntry *dotdotp;
 	struct buf *bp;
 
 	pmp = VFSTOMSDOSFS(fdvp->v_mount);
 
 #ifdef DIAGNOSTIC
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("msdosfs_rename: no name");
 #endif
 	/*
 	 * Check for cross-device rename.
 	 */
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp && fvp->v_mount != tvp->v_mount)) {
 		error = EXDEV;
 		/*
 		 * Early-exit path, also entered by goto from below: releases
 		 * tdvp and tvp (vput, or vrele when they are the same vnode)
 		 * and drops the references on fdvp and fvp.
 		 */
 abortit:
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fdvp);
 		vrele(fvp);
 		return (error);
 	}
 
 	/*
 	 * If source and dest are the same, do nothing.
 	 */
 	if (tvp == fvp) {
 		error = 0;
 		goto abortit;
 	}
 
 	error = vn_lock(fvp, LK_EXCLUSIVE);
 	if (error)
 		goto abortit;
 	dp = VTODE(fdvp);
 	ip = VTODE(fvp);
 
 	/*
 	 * Be sure we are not renaming ".", "..", or an alias of ".". This
 	 * leads to a crippled directory tree.  It's pretty tough to do a
 	 * "ls" or "pwd" with the "." directory entry missing, and "cd .."
 	 * doesn't work if the ".." entry is missing.
 	 */
 	if (ip->de_Attributes & ATTR_DIRECTORY) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    dp == ip ||
 		    (fcnp->cn_flags & ISDOTDOT) ||
 		    (tcnp->cn_flags & ISDOTDOT) ||
 		    (ip->de_flag & DE_RENAME)) {
 			VOP_UNLOCK(fvp, 0);
 			error = EINVAL;
 			goto abortit;
 		}
 		/* Mark the directory as being renamed; cleared at "out". */
 		ip->de_flag |= DE_RENAME;
 		doingdirectory++;
 	}
 
 	/*
 	 * When the target exists, both the directory
 	 * and target vnodes are returned locked.
 	 */
 	dp = VTODE(tdvp);
 	xp = tvp ? VTODE(tvp) : NULL;
 	/*
 	 * Remember direntry place to use for destination
 	 */
 	to_diroffset = dp->de_fndoffset;
 	to_count = dp->de_fndcnt;
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..". We must repeat the call
 	 * to namei, as the parent directory is unlocked by the
 	 * call to doscheckpath().
 	 */
 	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0);
 	/* Differing start clusters mean the two parent directories differ. */
 	if (VTODE(fdvp)->de_StartCluster != VTODE(tdvp)->de_StartCluster)
 		newparent = 1;
 	if (doingdirectory && newparent) {
 		if (error)	/* write access check above */
 			goto bad;
 		if (xp != NULL)
 			vput(tvp);
 		/*
 		 * doscheckpath() vput()'s dp,
 		 * so we have to do a relookup afterwards
 		 */
 		error = doscheckpath(ip, dp);
 		if (error)
 			goto out;
 		if ((tcnp->cn_flags & SAVESTART) == 0)
 			panic("msdosfs_rename: lost to startdir");
 		error = relookup(tdvp, &tvp, tcnp);
 		if (error)
 			goto out;
 		dp = VTODE(tdvp);
 		xp = tvp ? VTODE(tvp) : NULL;
 	}
 
 	if (xp != NULL) {
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if (xp->de_Attributes & ATTR_DIRECTORY) {
 			if (!dosdirempty(xp)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		error = removede(dp, xp);
 		if (error)
 			goto bad;
 		/* The old target is gone; xp == NULL keeps "bad" from releasing it again. */
 		vput(tvp);
 		xp = NULL;
 	}
 
 	/*
 	 * Convert the filename in tcnp into a dos filename. We copy this
 	 * into the denode and directory entry for the destination
 	 * file/directory.
 	 */
 	error = uniqdosname(VTODE(tdvp), tcnp, toname);
 	/*
 	 * NOTE(review): "abortit" vput()s tvp whenever tvp is non-NULL and
 	 * does not clear DE_RENAME.  If an existing target was already
 	 * vput() just above, or the source is a directory, this path looks
 	 * like it releases tvp twice and leaves DE_RENAME set -- confirm
 	 * whether "bad" is the intended label here.
 	 */
 	if (error)
 		goto abortit;
 
 	/*
 	 * Since from wasn't locked at various places above,
 	 * have to do a relookup here.
 	 */
 	fcnp->cn_flags &= ~MODMASK;
 	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 	if ((fcnp->cn_flags & SAVESTART) == 0)
 		panic("msdosfs_rename: lost from startdir");
 	if (!newparent)
 		VOP_UNLOCK(tdvp, 0);
 	if (relookup(fdvp, &fvp, fcnp) == 0)
 		vrele(fdvp);
 	if (fvp == NULL) {
 		/*
 		 * From name has disappeared.
 		 */
 		if (doingdirectory)
 			panic("rename: lost dir entry");
 		if (newparent)
 			VOP_UNLOCK(tdvp, 0);
 		vrele(tdvp);
 		vrele(ap->a_fvp);
 		return 0;
 	}
 	xp = VTODE(fvp);
 	zp = VTODE(fdvp);
 	from_diroffset = zp->de_fndoffset;
 
 	/*
 	 * Ensure that the directory entry still exists and has not
 	 * changed till now. If the source is a file the entry may
 	 * have been unlinked or renamed. In either case there is
 	 * no further work to be done. If the source is a directory
 	 * then it cannot have been rmdir'ed or renamed; this is
 	 * prohibited by the DE_RENAME flag.
 	 */
 	if (xp != ip) {
 		if (doingdirectory)
 			panic("rename: lost dir entry");
 		/*
 		 * NOTE(review): fvp is unlocked here and again on the common
 		 * exit path before "bad" -- confirm this is intended.
 		 */
 		VOP_UNLOCK(fvp, 0);
 		if (newparent)
 			VOP_UNLOCK(fdvp, 0);
 		vrele(ap->a_fvp);
 		xp = NULL;
 	} else {
 		vrele(fvp);
 		xp = NULL;
 
 		/*
 		 * First write a new entry in the destination
 		 * directory and mark the entry in the source directory
 		 * as deleted.  Then move the denode to the correct hash
 		 * chain for its new location in the filesystem.  And, if
 		 * we moved a directory, then update its .. entry to point
 		 * to the new parent directory.
 		 */
 		memcpy(oldname, ip->de_Name, 11);
 		memcpy(ip->de_Name, toname, 11);	/* update denode */
 		/* Restore the destination slot remembered before the relookups. */
 		dp->de_fndoffset = to_diroffset;
 		dp->de_fndcnt = to_count;
 		error = createde(ip, dp, (struct denode **)0, tcnp);
 		if (error) {
 			memcpy(ip->de_Name, oldname, 11);
 			if (newparent)
 				VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			goto bad;
 		}
 		/*
 		 * If ip is for a directory, then its name should always
 		 * be "." since it is for the directory entry in the
 		 * directory itself (msdosfs_lookup() always translates
 		 * to the "." entry so as to get a unique denode, except
 		 * for the root directory there are different
 		 * complications).  However, we just corrupted its name
 		 * to pass the correct name to createde().  Undo this.
 		 */
 		if ((ip->de_Attributes & ATTR_DIRECTORY) != 0)
 			memcpy(ip->de_Name, oldname, 11);
 		ip->de_refcnt++;
 		zp->de_fndoffset = from_diroffset;
 		error = removede(zp, ip);
 		if (error) {
 			/* XXX should downgrade to ro here, fs is corrupt */
 			if (newparent)
 				VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			goto bad;
 		}
 		if (!doingdirectory) {
 			/* Record the new directory cluster and offset in the denode. */
 			error = pcbmap(dp, de_cluster(pmp, to_diroffset), 0,
 				       &ip->de_dirclust, 0);
 			if (error) {
 				/* XXX should downgrade to ro here, fs is corrupt */
 				if (newparent)
 					VOP_UNLOCK(fdvp, 0);
 				VOP_UNLOCK(fvp, 0);
 				goto bad;
 			}
 			if (ip->de_dirclust == MSDOSFSROOT)
 				ip->de_diroffset = to_diroffset;
 			else
 				ip->de_diroffset = to_diroffset & pmp->pm_crbomask;
 		}
 		reinsert(ip);
 		if (newparent)
 			VOP_UNLOCK(fdvp, 0);
 	}
 
 	/*
 	 * If we moved a directory to a new parent directory, then we must
 	 * fixup the ".." entry in the moved directory.
 	 */
 	if (doingdirectory && newparent) {
 		cn = ip->de_StartCluster;
 		if (cn == MSDOSFSROOT) {
 			/* this should never happen */
 			panic("msdosfs_rename(): updating .. in root directory?");
 		} else
 			bn = cntobn(pmp, cn);
 		error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster,
 			      NOCRED, &bp);
 		if (error) {
 			/* XXX should downgrade to ro here, fs is corrupt */
 			brelse(bp);
 			VOP_UNLOCK(fvp, 0);
 			goto bad;
 		}
 		/* The second entry of the directory's first cluster is used as "..". */
 		dotdotp = (struct direntry *)bp->b_data + 1;
 		pcl = dp->de_StartCluster;
 		/* On FAT32 a parent that is the root directory is stored as MSDOSFSROOT. */
 		if (FAT32(pmp) && pcl == pmp->pm_rootdirblk)
 			pcl = MSDOSFSROOT;
 		putushort(dotdotp->deStartCluster, pcl);
 		if (FAT32(pmp))
 			putushort(dotdotp->deHighClust, pcl >> 16);
 		if (DOINGASYNC(fvp))
 			bdwrite(bp);
 		else if ((error = bwrite(bp)) != 0) {
 			/* XXX should downgrade to ro here, fs is corrupt */
 			VOP_UNLOCK(fvp, 0);
 			goto bad;
 		}
 	}
 
 	/*
 	 * The msdosfs lookup is case insensitive. Several aliases may
 	 * be inserted for a single directory entry. As a consequence,
 	 * name cache purge done by lookup for fvp when DELETE op for
 	 * namei is specified, might be not enough to expunge all
 	 * namecache entries that were installed for this direntry.
 	 */
 	cache_purge(fvp);
 	VOP_UNLOCK(fvp, 0);
 	/* Error exit once the target side is held: release tvp (if still held) and tdvp. */
 bad:
 	if (xp)
 		vput(tvp);
 	vput(tdvp);
 	/* Final cleanup: clear the rename marker and drop the source references. */
 out:
 	ip->de_flag &= ~DE_RENAME;
 	vrele(fdvp);
 	vrele(fvp);
 	return (error);
 
 }
 
 /*
  * Template for the "." and ".." entries of a newly created directory.
  * msdosfs_mkdir() copies it into the new cluster and then overwrites the
  * start-cluster and the create/access/modify date and time fields, so the
  * time and cluster values below only act as placeholders.
  */
 static struct {
 	struct direntry dot;
 	struct direntry dotdot;
 } dosdirtemplate = {
 	{	".          ",				/* the . entry */
 		ATTR_DIRECTORY,				/* file attribute */
 		0,					/* reserved */
 		0, { 0, 0 }, { 0, 0 },			/* create time & date */
 		{ 0, 0 },				/* access date */
 		{ 0, 0 },				/* high bits of start cluster */
 		{ 210, 4 }, { 210, 4 },			/* modify time & date */
 		{ 0, 0 },				/* startcluster */
 		{ 0, 0, 0, 0 }				/* filesize */
 	},
 	{	"..         ",				/* the .. entry */
 		ATTR_DIRECTORY,				/* file attribute */
 		0,					/* reserved */
 		0, { 0, 0 }, { 0, 0 },			/* create time & date */
 		{ 0, 0 },				/* access date */
 		{ 0, 0 },				/* high bits of start cluster */
 		{ 210, 4 }, { 210, 4 },			/* modify time & date */
 		{ 0, 0 },				/* startcluster */
 		{ 0, 0, 0, 0 }				/* filesize */
 	}
 };
 
 static int
 msdosfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct denode *dep;
 	struct denode *pdep = VTODE(ap->a_dvp);
 	struct direntry *denp;
 	struct msdosfsmount *pmp = pdep->de_pmp;
 	struct buf *bp;
 	u_long newcluster, pcl;
 	int bn;
 	int error;
 	struct denode ndirent;
 	struct timespec ts;
 
 	/*
 	 * If this is the root directory and there is no space left we
 	 * can't do anything.  This is because the root directory can not
 	 * change size.
 	 */
 	if (pdep->de_StartCluster == MSDOSFSROOT
 	    && pdep->de_fndoffset >= pdep->de_FileSize) {
 		error = ENOSPC;
 		goto bad2;
 	}
 
 	/*
 	 * Allocate a cluster to hold the about to be created directory.
 	 */
 	error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL);
 	if (error)
 		goto bad2;
 
 	memset(&ndirent, 0, sizeof(ndirent));
 	ndirent.de_pmp = pmp;
 	ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE;
 	vfs_timestamp(&ts);
 	DETIMES(&ndirent, &ts, &ts, &ts);
 
 	/*
 	 * Now fill the cluster with the "." and ".." entries. And write
 	 * the cluster to disk.  This way it is there for the parent
 	 * directory to be pointing at if there were a crash.
 	 */
 	bn = cntobn(pmp, newcluster);
 	/* always succeeds */
 	bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0, 0);
 	memset(bp->b_data, 0, pmp->pm_bpcluster);
 	memcpy(bp->b_data, &dosdirtemplate, sizeof dosdirtemplate);
 	denp = (struct direntry *)bp->b_data;
 	putushort(denp[0].deStartCluster, newcluster);
 	putushort(denp[0].deCDate, ndirent.de_CDate);
 	putushort(denp[0].deCTime, ndirent.de_CTime);
 	denp[0].deCHundredth = ndirent.de_CHun;
 	putushort(denp[0].deADate, ndirent.de_ADate);
 	putushort(denp[0].deMDate, ndirent.de_MDate);
 	putushort(denp[0].deMTime, ndirent.de_MTime);
 	pcl = pdep->de_StartCluster;
 	/*
 	 * Although the root directory has a non-magic starting cluster
 	 * number for FAT32, chkdsk and fsck_msdosfs still require
 	 * references to it in dotdot entries to be magic.
 	 */
 	if (FAT32(pmp) && pcl == pmp->pm_rootdirblk)
 		pcl = MSDOSFSROOT;
 	putushort(denp[1].deStartCluster, pcl);
 	putushort(denp[1].deCDate, ndirent.de_CDate);
 	putushort(denp[1].deCTime, ndirent.de_CTime);
 	denp[1].deCHundredth = ndirent.de_CHun;
 	putushort(denp[1].deADate, ndirent.de_ADate);
 	putushort(denp[1].deMDate, ndirent.de_MDate);
 	putushort(denp[1].deMTime, ndirent.de_MTime);
 	if (FAT32(pmp)) {
 		putushort(denp[0].deHighClust, newcluster >> 16);
 		putushort(denp[1].deHighClust, pcl >> 16);
 	}
 
 	if (DOINGASYNC(ap->a_dvp))
 		bdwrite(bp);
 	else if ((error = bwrite(bp)) != 0)
 		goto bad;
 
 	/*
 	 * Now build up a directory entry pointing to the newly allocated
 	 * cluster.  This will be written to an empty slot in the parent
 	 * directory.
 	 */
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("msdosfs_mkdir: no name");
 #endif
 	error = uniqdosname(pdep, cnp, ndirent.de_Name);
 	if (error)
 		goto bad;
 
 	ndirent.de_Attributes = ATTR_DIRECTORY;
 	ndirent.de_LowerCase = 0;
 	ndirent.de_StartCluster = newcluster;
 	ndirent.de_FileSize = 0;
 	error = createde(&ndirent, pdep, &dep, cnp);
 	if (error)
 		goto bad;
 	*ap->a_vpp = DETOV(dep);
 	return (0);
 
 bad:
 	clusterfree(pmp, newcluster, NULL);
 bad2:
 	return (error);
 }
 
 static int
 msdosfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct denode *ip, *dp;
 	int error;
 
 	ip = VTODE(vp);
 	dp = VTODE(dvp);
 
 	/*
 	 * Verify the directory is empty (and valid).
 	 * (Rmdir ".." won't be valid since
 	 *  ".." will contain a reference to
 	 *  the current directory and thus be
 	 *  non-empty.)
 	 */
 	error = 0;
 	if (!dosdirempty(ip) || ip->de_flag & DE_RENAME) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	/*
 	 * Delete the entry from the directory.  For dos filesystems this
 	 * gets rid of the directory entry on disk, the in memory copy
 	 * still exists but the de_refcnt is <= 0.  This prevents it from
 	 * being found by deget().  When the vput() on dep is done we give
 	 * up access and eventually msdosfs_reclaim() will be called which
 	 * will remove it from the denode cache.
 	 */
 	error = removede(dp, ip);
 	if (error)
 		goto out;
 	/*
 	 * This is where we decrement the link count in the parent
 	 * directory.  Since dos filesystems don't do this we just purge
 	 * the name cache.
 	 */
 	cache_purge(dvp);
 	/*
 	 * Truncate the directory that is being deleted.
 	 */
 	error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred);
 	cache_purge(vp);
 
 out:
 	return (error);
 }
 
 /*
  * VOP_SYMLINK for msdosfs.  DOS filesystems have no notion of symbolic
  * links, so the request is always rejected with EOPNOTSUPP.
  */
 static int
 msdosfs_symlink(struct vop_symlink_args *ap)
 {
 
 	(void)ap;	/* the arguments play no part in the answer */
 	return (EOPNOTSUPP);
 }
 
 /*
  * VOP_READDIR for msdosfs: convert DOS directory entries starting at
  * uio->uio_offset into struct dirent records and copy them to the caller.
  *
  * uio_offset is a byte offset in units of struct direntry; for the root
  * directory the first two slots are synthesized "." and ".." entries.
  * If ap->a_ncookies is non-NULL a cookie array (the offset following each
  * returned entry) is allocated and handed back through ap->a_cookies.
  * If ap->a_eofflag is non-NULL it is set when the end of the directory
  * has been reached.  Returns 0 or an errno value.
  */
 static int
 msdosfs_readdir(struct vop_readdir_args *ap)
 {
 	struct mbnambuf nb;
 	int error = 0;
 	int diff;
 	long n;
 	int blsize;
 	long on;
 	u_long cn;
 	u_long dirsperblk;
 	long bias = 0;
 	daddr_t bn, lbn;
 	struct buf *bp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct direntry *dentp;
 	struct dirent dirbuf;
 	struct uio *uio = ap->a_uio;
 	u_long *cookies = NULL;
 	int ncookies = 0;
 	off_t offset, off;
 	int chksum = -1;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_readdir(): vp %p, uio %p, cred %p, eofflagp %p\n",
 	    ap->a_vp, uio, ap->a_cred, ap->a_eofflag);
 #endif
 
 	/*
 	 * msdosfs_readdir() won't operate properly on regular files since
 	 * it does i/o only with the filesystem vnode, and hence can
 	 * retrieve the wrong block from the buffer cache for a plain file.
 	 * So, fail attempts to readdir() on a plain file.
 	 */
 	if ((dep->de_Attributes & ATTR_DIRECTORY) == 0)
 		return (ENOTDIR);
 
 	/*
 	 * Zero the whole of dirbuf, not just d_name: the structure lives on
 	 * the kernel stack and is copied out with uiomove(), so any member
 	 * or padding that is never assigned below must not carry stale
 	 * stack contents to the caller.
 	 */
 	memset(&dirbuf, 0, sizeof(dirbuf));
 
 	/*
 	 * If the user buffer is smaller than the size of one dos directory
 	 * entry or the file offset is not a multiple of the size of a
 	 * directory entry, then we fail the read.
 	 */
 	off = offset = uio->uio_offset;
 	if (uio->uio_resid < sizeof(struct direntry) ||
 	    (offset & (sizeof(struct direntry) - 1)))
 		return (EINVAL);
 
 	if (ap->a_ncookies) {
 		/* Upper bound on the number of entries: one per 16 bytes requested. */
 		ncookies = uio->uio_resid / 16;
 		cookies = malloc(ncookies * sizeof(u_long), M_TEMP,
 		       M_WAITOK);
 		*ap->a_cookies = cookies;
 		*ap->a_ncookies = ncookies;
 	}
 
 	dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry);
 
 	/*
 	 * If they are reading from the root directory then, we simulate
 	 * the . and .. entries since these don't exist in the root
 	 * directory.  We also set the offset bias to make up for having to
 	 * simulate these entries. By this I mean that at file offset 64 we
 	 * read the first entry in the root directory that lives on disk.
 	 */
 	if (dep->de_StartCluster == MSDOSFSROOT
 	    || (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)) {
 #if 0
 		printf("msdosfs_readdir(): going after . or .. in root dir, offset %d\n",
 		    offset);
 #endif
 		bias = 2 * sizeof(struct direntry);
 		if (offset < bias) {
 			for (n = (int)offset / sizeof(struct direntry);
 			     n < 2; n++) {
 				dirbuf.d_fileno = FAT32(pmp) ?
 				    (uint64_t)cntobn(pmp, pmp->pm_rootdirblk) *
 				    dirsperblk : 1;
 				dirbuf.d_type = DT_DIR;
 				switch (n) {
 				case 0:
 					dirbuf.d_namlen = 1;
 					strcpy(dirbuf.d_name, ".");
 					break;
 				case 1:
 					dirbuf.d_namlen = 2;
 					strcpy(dirbuf.d_name, "..");
 					break;
 				}
 				dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf);
+				/* NOTE: d_off is the offset of the *next* entry. */
+				dirbuf.d_off = offset + sizeof(struct direntry);
 				if (uio->uio_resid < dirbuf.d_reclen)
 					goto out;
 				error = uiomove(&dirbuf, dirbuf.d_reclen, uio);
 				if (error)
 					goto out;
 				offset += sizeof(struct direntry);
 				off = offset;
 				if (cookies) {
 					*cookies++ = offset;
 					if (--ncookies <= 0)
 						goto out;
 				}
 			}
 		}
 	}
 
 	mbnambuf_init(&nb);
 	off = offset;
 	while (uio->uio_resid > 0) {
 		/* Locate the cluster and the in-cluster offset of the next entry. */
 		lbn = de_cluster(pmp, offset - bias);
 		on = (offset - bias) & pmp->pm_crbomask;
 		n = min(pmp->pm_bpcluster - on, uio->uio_resid);
 		diff = dep->de_FileSize - (offset - bias);
 		if (diff <= 0)
 			break;
 		n = min(n, diff);
 		error = pcbmap(dep, lbn, &bn, &cn, &blsize);
 		if (error)
 			break;
 		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return (error);
 		}
 		n = min(n, blsize - bp->b_resid);
 		if (n == 0) {
 			brelse(bp);
 			return (EIO);
 		}
 
 		/*
 		 * Convert from dos directory entries to fs-independent
 		 * directory entries.
 		 */
 		for (dentp = (struct direntry *)(bp->b_data + on);
 		     (char *)dentp < bp->b_data + on + n;
 		     dentp++, offset += sizeof(struct direntry)) {
 #if 0
 			printf("rd: dentp %08x prev %08x crnt %08x deName %02x attr %02x\n",
 			    dentp, prev, crnt, dentp->deName[0], dentp->deAttributes);
 #endif
 			/*
 			 * If this is an unused entry, we can stop.
 			 */
 			if (dentp->deName[0] == SLOT_EMPTY) {
 				brelse(bp);
 				goto out;
 			}
 			/*
 			 * Skip deleted entries.
 			 */
 			if (dentp->deName[0] == SLOT_DELETED) {
 				chksum = -1;
 				mbnambuf_init(&nb);
 				continue;
 			}
 
 			/*
 			 * Handle Win95 long directory entries
 			 */
 			if (dentp->deAttributes == ATTR_WIN95) {
 				if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME)
 					continue;
 				chksum = win2unixfn(&nb,
 				    (struct winentry *)dentp, chksum, pmp);
 				continue;
 			}
 
 			/*
 			 * Skip volume labels
 			 */
 			if (dentp->deAttributes & ATTR_VOLUME) {
 				chksum = -1;
 				mbnambuf_init(&nb);
 				continue;
 			}
 			/*
 			 * This computation of d_fileno must match
 			 * the computation of va_fileid in
 			 * msdosfs_getattr.
 			 */
 			if (dentp->deAttributes & ATTR_DIRECTORY) {
 				cn = getushort(dentp->deStartCluster);
 				if (FAT32(pmp)) {
 					cn |= getushort(dentp->deHighClust) <<
 					    16;
 					if (cn == MSDOSFSROOT)
 						cn = pmp->pm_rootdirblk;
 				}
 				if (cn == MSDOSFSROOT && !FAT32(pmp))
 					dirbuf.d_fileno = 1;
 				else
 					dirbuf.d_fileno = cntobn(pmp, cn) *
 					    dirsperblk;
 				dirbuf.d_type = DT_DIR;
 			} else {
 				dirbuf.d_fileno = (uoff_t)offset /
 				    sizeof(struct direntry);
 				dirbuf.d_type = DT_REG;
 			}
 
 			/*
 			 * Use the accumulated long name only if its checksum
 			 * matches this short entry; otherwise fall back to
 			 * the converted short name.
 			 */
 			if (chksum != winChksum(dentp->deName)) {
 				dirbuf.d_namlen = dos2unixfn(dentp->deName,
 				    (u_char *)dirbuf.d_name,
 				    dentp->deLowerCase |
 					((pmp->pm_flags & MSDOSFSMNT_SHORTNAME) ?
 					(LCASE_BASE | LCASE_EXT) : 0),
 				    pmp);
 				mbnambuf_init(&nb);
 			} else
 				mbnambuf_flush(&nb, &dirbuf);
 			chksum = -1;
 			dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf);
+			/* NOTE: d_off is the offset of the *next* entry. */
+			dirbuf.d_off = offset + sizeof(struct direntry);
 			if (uio->uio_resid < dirbuf.d_reclen) {
 				brelse(bp);
 				goto out;
 			}
 			error = uiomove(&dirbuf, dirbuf.d_reclen, uio);
 			if (error) {
 				brelse(bp);
 				goto out;
 			}
 			if (cookies) {
 				*cookies++ = offset + sizeof(struct direntry);
 				if (--ncookies <= 0) {
 					brelse(bp);
 					goto out;
 				}
 			}
 			/* Offset to resume from after the last entry handed out. */
 			off = offset + sizeof(struct direntry);
 		}
 		brelse(bp);
 	}
 out:
 	/* Subtract unused cookies */
 	if (ap->a_ncookies)
 		*ap->a_ncookies -= ncookies;
 
 	uio->uio_offset = off;
 
 	/*
 	 * Set the eofflag (NFS uses it)
 	 */
 	if (ap->a_eofflag) {
 		if (dep->de_FileSize - (offset - bias) <= 0)
 			*ap->a_eofflag = 1;
 		else
 			*ap->a_eofflag = 0;
 	}
 	return (error);
 }
 
 /*-
  * VOP_BMAP for msdosfs: translate a logical block (cluster) number of a
  * file into a block number on the underlying device.
  *
  * a_vp   - vnode of the file
  * a_bn   - logical block number within the file (a cluster number here)
  * a_bop  - if non-NULL, receives the bufobj of the special file holding
  *          the filesystem
  * a_bnp  - if non-NULL, receives the "physical" block number for a_bn
  *          (relative to the special file, in DEV_BSIZE units); if NULL
  *          nothing beyond a_bop is computed
  * a_runp - if non-NULL, receives the "run past" a_bn: the number of
  *          following logical blocks whose physical blocks are contiguous
  *          with a_bn's
  * a_runb - if non-NULL, receives the "run before" a_bn
  */
 static int
 msdosfs_bmap(struct vop_bmap_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct mount *mp;
 	daddr_t probe;
 	u_long cn;
 	int blks_per_cl, error, io_clusters, k, limit;
 
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &pmp->pm_devvp->v_bufobj;
 	if (ap->a_bnp == NULL)
 		return (0);
 
 	/* Default both run lengths to "none" before anything can fail. */
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 
 	/* Refuse block numbers that do not survive conversion to u_long. */
 	cn = ap->a_bn;
 	if (cn != ap->a_bn)
 		return (EFBIG);
 
 	error = pcbmap(dep, cn, ap->a_bnp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (ap->a_runp == NULL && ap->a_runb == NULL)
 		return (0);
 
 	mp = vp->v_mount;
 	io_clusters = mp->mnt_iosize_max / mp->mnt_stat.f_iosize;
 	blks_per_cl = de_cn2bn(pmp, 1);
 
 	if (ap->a_runp != NULL) {
 		/* Walk forward while each cluster maps right behind the last. */
 		limit = ulmin(io_clusters - 1, pmp->pm_maxcluster - cn);
 		k = 1;
 		while (k <= limit &&
 		    pcbmap(dep, cn + k, &probe, NULL, NULL) == 0 &&
 		    probe == *ap->a_bnp + k * blks_per_cl)
 			k++;
 		*ap->a_runp = k - 1;
 	}
 	if (ap->a_runb != NULL) {
 		/* Same idea walking backward from a_bn. */
 		limit = ulmin(io_clusters - 1, cn);
 		k = 1;
 		while (k < limit &&
 		    pcbmap(dep, cn - k, &probe, NULL, NULL) == 0 &&
 		    probe == *ap->a_bnp - k * blks_per_cl)
 			k++;
 		*ap->a_runb = k - 1;
 	}
 	return (0);
 }
 
 /*
  * vfs.msdosfs sysctl node and its use_buf_pager knob.  The knob is read by
  * msdosfs_getpages(): non-zero selects vfs_bio_getpages(), zero selects
  * vnode_pager_generic_getpages().  It defaults to 1.
  */
 SYSCTL_NODE(_vfs, OID_AUTO, msdosfs, CTLFLAG_RW, 0, "msdos filesystem");
 static int use_buf_pager = 1;
 SYSCTL_INT(_vfs_msdosfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN,
     &use_buf_pager, 0,
     "Use buffer pager instead of bmap");
 
 /*
  * Callback for vfs_bio_getpages(): convert a byte offset within the file
  * into a logical block number using de_cluster().
  */
 static daddr_t
 msdosfs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 {
 	struct msdosfsmount *pmp = VTODE(vp)->de_pmp;
 
 	return (de_cluster(pmp, off));
 }
 
 /*
  * Callback for vfs_bio_getpages(): report the block size, which is the
  * mount's bytes-per-cluster value regardless of lbn.
  */
 static int
 msdosfs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 {
 	struct msdosfsmount *pmp = VTODE(vp)->de_pmp;
 
 	return (pmp->pm_bpcluster);
 }
 
 /*
  * VOP_GETPAGES for msdosfs.  Dispatches to the generic vnode pager when
  * the use_buf_pager knob is zero, and to the buffer-cache pager (with the
  * msdosfs block number/size callbacks) otherwise.
  */
 static int
 msdosfs_getpages(struct vop_getpages_args *ap)
 {
 
 	if (!use_buf_pager)
 		return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 		    ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL));
 
 	return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, msdosfs_gbp_getblkno,
 	    msdosfs_gbp_getblksz));
 }
 
 static int
 msdosfs_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bp = ap->a_bp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct bufobj *bo;
 	int error = 0;
 	daddr_t blkno;
 
 	/*
 	 * If we don't already know the filesystem relative block number
 	 * then get it using pcbmap().  If pcbmap() returns the block
 	 * number as -1 then we've got a hole in the file.  DOS filesystems
 	 * don't allow files with holes, so we shouldn't ever see this.
 	 */
 	if (bp->b_blkno == bp->b_lblkno) {
 		error = pcbmap(dep, bp->b_lblkno, &blkno, 0, 0);
 		bp->b_blkno = blkno;
 		if (error) {
 			bp->b_error = error;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (0);
 		}
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 	if (bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 	/*
 	 * Read/write the block from/to the disk that contains the desired
 	 * file block.
 	 */
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bo = dep->de_pmp->pm_bo;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 static int
 msdosfs_print(struct vop_print_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 
 	printf("\tstartcluster %lu, dircluster %lu, diroffset %lu, ",
 	       dep->de_StartCluster, dep->de_dirclust, dep->de_diroffset);
 	printf("on dev %s\n", devtoname(dep->de_pmp->pm_dev));
 	return (0);
 }
 
 /*
  * VOP_PATHCONF for msdosfs: answer the pathconf variables this filesystem
  * knows about and defer everything else to vop_stdpathconf().
  */
 static int
 msdosfs_pathconf(struct vop_pathconf_args *ap)
 {
 	struct msdosfsmount *pmp = VTODE(ap->a_vp)->de_pmp;
 	int value;
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		value = 32;
 		break;
 	case _PC_LINK_MAX:
 	case _PC_CHOWN_RESTRICTED:
 		value = 1;
 		break;
 	case _PC_NAME_MAX:
 		/* Long names when the mount has them enabled, else 12. */
 		if ((pmp->pm_flags & MSDOSFSMNT_LONGNAME) != 0)
 			value = WIN_MAXLEN;
 		else
 			value = 12;
 		break;
 	case _PC_NO_TRUNC:
 		value = 0;
 		break;
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	*ap->a_retval = value;
 	return (0);
 }
 
 /*
  * VOP_VPTOFH for msdosfs: fill the caller's file handle with the denode's
  * directory cluster and directory offset.  Always returns 0.
  */
 static int
 msdosfs_vptofh(struct vop_vptofh_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	struct defid *fid = (struct defid *)ap->a_fhp;
 
 	fid->defid_len = sizeof(struct defid);
 	fid->defid_dirclust = dep->de_dirclust;
 	fid->defid_dirofs = dep->de_diroffset;
 	/* fid->defid_gen = dep->de_gen; */
 	return (0);
 }
 
 /*
  * Global vfs data structures for msdosfs.
  *
  * Vnode operations vector: operations not listed here fall through to
  * default_vnodeops.  Name lookups enter through vfs_cache_lookup(), with
  * msdosfs_lookup() installed as the cached-lookup handler.
  */
 struct vop_vector msdosfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		msdosfs_access,
 	.vop_bmap =		msdosfs_bmap,
 	.vop_getpages =		msdosfs_getpages,
 	.vop_cachedlookup =	msdosfs_lookup,
 	.vop_open =		msdosfs_open,
 	.vop_close =		msdosfs_close,
 	.vop_create =		msdosfs_create,
 	.vop_fsync =		msdosfs_fsync,
 	.vop_fdatasync =	vop_stdfdatasync_buf,
 	.vop_getattr =		msdosfs_getattr,
 	.vop_inactive =		msdosfs_inactive,
 	.vop_link =		msdosfs_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		msdosfs_mkdir,
 	.vop_mknod =		msdosfs_mknod,
 	.vop_pathconf =		msdosfs_pathconf,
 	.vop_print =		msdosfs_print,
 	.vop_read =		msdosfs_read,
 	.vop_readdir =		msdosfs_readdir,
 	.vop_reclaim =		msdosfs_reclaim,
 	.vop_remove =		msdosfs_remove,
 	.vop_rename =		msdosfs_rename,
 	.vop_rmdir =		msdosfs_rmdir,
 	.vop_setattr =		msdosfs_setattr,
 	.vop_strategy =		msdosfs_strategy,
 	.vop_symlink =		msdosfs_symlink,
 	.vop_write =		msdosfs_write,
 	.vop_vptofh =		msdosfs_vptofh,
 };
Index: stable/12/sys/fs/nandfs/nandfs_vnops.c
===================================================================
--- stable/12/sys/fs/nandfs/nandfs_vnops.c	(revision 340720)
+++ stable/12/sys/fs/nandfs/nandfs_vnops.c	(revision 340721)
@@ -1,2454 +1,2456 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010-2012 Semihalf
  * Copyright (c) 2008, 2009 Reinoud Zandijk
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * From: NetBSD: nilfs_vnops.c,v 1.2 2009/08/26 03:40:48 elad
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/buf.h>
 #include <sys/bio.h>
 #include <sys/fcntl.h>
 #include <sys/dirent.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/priv.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vnode_pager.h>
 
 #include <machine/_inttypes.h>
 
 #include <fs/nandfs/nandfs_mount.h>
 #include <fs/nandfs/nandfs.h>
 #include <fs/nandfs/nandfs_subr.h>
 
 extern uma_zone_t nandfs_node_zone;
 static void nandfs_read_filebuf(struct nandfs_node *, struct buf *);
 static void nandfs_itimes_locked(struct vnode *);
 static int nandfs_truncate(struct vnode *, uint64_t);
 
 static vop_pathconf_t	nandfs_pathconf;
 
 #define UPDATE_CLOSE 0
 #define UPDATE_WAIT 0
 
 /*
  * Inactive vnode operation.
  *
  * When the node still has a mode but no links left, its data is truncated
  * to zero, the node is destroyed and the vnode is handed to vrecycle().
  * Returns the result of nandfs_node_destroy(), or 0 when nothing was done.
  */
 static int
 nandfs_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp;
 	struct nandfs_node *node;
 	int error;
 
 	vp = ap->a_vp;
 	node = VTON(vp);
 
 	DPRINTF(VNCALL, ("%s: vp:%p node:%p\n", __func__, vp, node));
 
 	if (node == NULL) {
 		DPRINTF(NODE, ("%s: inactive NULL node\n", __func__));
 		return (0);
 	}
 
 	/* Only nodes that are in use but unlinked need any work. */
 	if (node->nn_inode.i_mode == 0 || node->nn_inode.i_links_count != 0)
 		return (0);
 
 	/* The result of the truncation is deliberately not examined. */
 	nandfs_truncate(vp, 0);
 
 	error = nandfs_node_destroy(node);
 	if (error != 0)
 		nandfs_error("%s: destroy node: %p\n", __func__, node);
 
 	node->nn_flags = 0;
 	vrecycle(vp);
 
 	return (error);
 }
 
 /*
  * Reclaim vnode operation: detach the nandfs node from its vnode.
  *
  * The inode number and device are sampled up front because the node is
  * handed to nandfs_dispose_node() before they are needed for the last time.
  * Always returns 0.
  */
 static int
 nandfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct nandfs_node *node;
 	struct nandfs_device *nandfsdev;
 	uint64_t ino;
 
 	vp = ap->a_vp;
 	node = VTON(vp);
 	nandfsdev = node->nn_nandfsdev;
 	ino = node->nn_ino;
 
 	DPRINTF(VNCALL, ("%s: vp:%p node:%p\n", __func__, vp, node));
 
 	/* Drop every name cache entry that refers to this vnode. */
 	cache_purge(vp);
 
 	/* Tear down the VM object together with its pages. */
 	vnode_destroy_vobject(vp);
 
 	/* System nodes are not kept in the vfs hash. */
 	if (!NANDFS_SYS_NODE(ino))
 		vfs_hash_remove(vp);
 
 	/* Forget everything we know about the node. */
 	nandfs_dispose_node(&node);
 
 	/*
 	 * NOTE(review): the matching write lock is not taken in this
 	 * function; presumably it is acquired on the dispose path - confirm.
 	 */
 	if (!NANDFS_SYS_NODE(ino))
 		NANDFS_WRITEUNLOCK(nandfsdev);
 
 	return (0);
 }
 
 /*
  * Read vnode operation.
  *
  * Copies data block by block from the file into the uio, never going past
  * the size recorded in the inode.  Returns 0 on success (including reads of
  * zero bytes or reads starting at or beyond end of file) or the error
  * reported by nandfs_bread() / uiomove().
  */
 static int
 nandfs_read(struct vop_read_args *ap)
 {
 	struct nandfs_node *node = VTON(ap->a_vp);
 	struct uio *uio = ap->a_uio;
 	struct buf *bp;
 	uint64_t size;
 	uint32_t blocksize;
 	off_t bytesinfile;
 	ssize_t toread, off;
 	daddr_t lbn;
 	ssize_t resid;
 	int error;
 
 	if (uio->uio_resid == 0)
 		return (0);
 
 	size = node->nn_inode.i_size;
 	if (uio->uio_offset >= size)
 		return (0);
 
 	blocksize = node->nn_nandfsdev->nd_blocksize;
 	bytesinfile = size - uio->uio_offset;
 
 	/* Never hand out more than what is left in the file. */
 	resid = omin(uio->uio_resid, bytesinfile);
 
 	error = 0;
 	while (resid != 0) {
 		/* Logical block and offset within it for the current position. */
 		lbn = uio->uio_offset / blocksize;
 		off = uio->uio_offset & (blocksize - 1);
 
 		/* Stay inside the current block. */
 		toread = omin(resid, blocksize - off);
 
 		DPRINTF(READ, ("nandfs_read bn: 0x%jx toread: 0x%zx (0x%x)\n",
 		    (uintmax_t)lbn, toread, blocksize));
 
 		error = nandfs_bread(node, lbn, NOCRED, 0, &bp);
 		if (error != 0) {
 			brelse(bp);
 			break;
 		}
 
 		/* The buffer is released whether or not the copy worked. */
 		error = uiomove(bp->b_data + off, toread, uio);
 		brelse(bp);
 		if (error != 0)
 			break;
 
 		resid -= toread;
 	}
 
 	return (error);
 }
 
 static int
 nandfs_write(struct vop_write_args *ap)
 {
 	struct nandfs_device *fsdev;
 	struct nandfs_node *node;
 	struct vnode *vp;
 	struct uio *uio;
 	struct buf *bp;
 	uint64_t file_size, vblk;
 	uint32_t blocksize;
 	ssize_t towrite, off;
 	daddr_t lbn;
 	ssize_t resid;
 	int error, ioflag, modified;
 
 	vp = ap->a_vp;
 	uio = ap->a_uio;
 	ioflag = ap->a_ioflag;
 	node = VTON(vp);
 	fsdev = node->nn_nandfsdev;
 
 	if (nandfs_fs_full(fsdev))
 		return (ENOSPC);
 
 	DPRINTF(WRITE, ("nandfs_write called %#zx at %#jx\n",
 	    uio->uio_resid, (uintmax_t)uio->uio_offset));
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	if (uio->uio_resid == 0)
 		return (0);
 
 	blocksize = fsdev->nd_blocksize;
 	file_size = node->nn_inode.i_size;
 
 	switch (vp->v_type) {
 	case VREG:
 		if (ioflag & IO_APPEND)
 			uio->uio_offset = file_size;
 		break;
 	case VDIR:
 		return (EISDIR);
 	case VLNK:
 		break;
 	default:
 		panic("%s: bad file type vp: %p", __func__, vp);
 	}
 
 	/* If explicitly asked to append, uio_offset can be wrong? */
 	if (ioflag & IO_APPEND)
 		uio->uio_offset = file_size;
 
 	resid = uio->uio_resid;
 	modified = error = 0;
 
 	while (uio->uio_resid) {
 		lbn = uio->uio_offset / blocksize;
 		off = uio->uio_offset & (blocksize - 1);
 
 		towrite = omin(uio->uio_resid, blocksize - off);
 
 		DPRINTF(WRITE, ("%s: lbn: 0x%jd toread: 0x%zx (0x%x)\n",
 		    __func__, (uintmax_t)lbn, towrite, blocksize));
 
 		error = nandfs_bmap_lookup(node, lbn, &vblk);
 		if (error)
 			break;
 
 		DPRINTF(WRITE, ("%s: lbn: 0x%jd toread: 0x%zx (0x%x) "
 		    "vblk=%jx\n", __func__, (uintmax_t)lbn, towrite, blocksize,
 		    vblk));
 
 		if (vblk != 0)
 			error = nandfs_bread(node, lbn, NOCRED, 0, &bp);
 		else
 			error = nandfs_bcreate(node, lbn, NOCRED, 0, &bp);
 
 		DPRINTF(WRITE, ("%s: vp %p bread bp %p lbn %#jx\n", __func__,
 		    vp, bp, (uintmax_t)lbn));
 		if (error) {
 			if (bp)
 				brelse(bp);
 			break;
 		}
 
 		error = uiomove((char *)bp->b_data + off, (int)towrite, uio);
 		if (error)
 			break;
 
 		error = nandfs_dirty_buf(bp, 0);
 		if (error)
 			break;
 
 		modified++;
 	}
 
 	/* XXX proper handling when only part of file was properly written */
 	if (modified) {
 		if (resid > uio->uio_resid && ap->a_cred &&
 		    ap->a_cred->cr_uid != 0)
 			node->nn_inode.i_mode &= ~(ISUID | ISGID);
 
 		if (file_size < uio->uio_offset + uio->uio_resid) {
 			node->nn_inode.i_size = uio->uio_offset +
 			    uio->uio_resid;
 			node->nn_flags |= IN_CHANGE | IN_UPDATE;
 			vnode_pager_setsize(vp, uio->uio_offset +
 			    uio->uio_resid);
 			nandfs_itimes(vp);
 		}
 	}
 
 	DPRINTF(WRITE, ("%s: return:%d\n", __func__, error));
 
 	return (error);
 }
 
 static int
 nandfs_lookup(struct vop_cachedlookup_args *ap)
 {
 	struct vnode *dvp, **vpp;
 	struct componentname *cnp;
 	struct ucred *cred;
 	struct thread *td;
 	struct nandfs_node *dir_node, *node;
 	struct nandfsmount *nmp;
 	uint64_t ino, off;
 	const char *name;
 	int namelen, nameiop, islastcn, mounted_ro;
 	int error, found;
 
 	DPRINTF(VNCALL, ("%s\n", __func__));
 
 	dvp = ap->a_dvp;
 	vpp = ap->a_vpp;
 	*vpp = NULL;
 
 	cnp = ap->a_cnp;
 	cred = cnp->cn_cred;
 	td = cnp->cn_thread;
 
 	dir_node = VTON(dvp);
 	nmp = dir_node->nn_nmp;
 
 	/* Simplify/clarification flags */
 	nameiop = cnp->cn_nameiop;
 	islastcn = cnp->cn_flags & ISLASTCN;
 	mounted_ro = dvp->v_mount->mnt_flag & MNT_RDONLY;
 
 	/*
 	 * If requesting a modify on the last path element on a read-only
 	 * filingsystem, reject lookup;
 	 */
 	if (islastcn && mounted_ro && (nameiop == DELETE || nameiop == RENAME))
 		return (EROFS);
 
 	if (dir_node->nn_inode.i_links_count == 0)
 		return (ENOENT);
 
 	/*
 	 * Obviously, the file is not (anymore) in the namecache, we have to
 	 * search for it. There are three basic cases: '.', '..' and others.
 	 *
 	 * Following the guidelines of VOP_LOOKUP manpage and tmpfs.
 	 */
 	error = 0;
 	if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
 		DPRINTF(LOOKUP, ("\tlookup '.'\n"));
 		/* Special case 1 '.' */
 		VREF(dvp);
 		*vpp = dvp;
 		/* Done */
 	} else if (cnp->cn_flags & ISDOTDOT) {
 		/* Special case 2 '..' */
 		DPRINTF(LOOKUP, ("\tlookup '..'\n"));
 
 		/* Get our node */
 		name = "..";
 		namelen = 2;
 		error = nandfs_lookup_name_in_dir(dvp, name, namelen, &ino,
 		    &found, &off);
 		if (error)
 			goto out;
 		if (!found)
 			error = ENOENT;
 
 		/* First unlock parent */
 		VOP_UNLOCK(dvp, 0);
 
 		if (error == 0) {
 			DPRINTF(LOOKUP, ("\tfound '..'\n"));
 			/* Try to create/reuse the node */
 			error = nandfs_get_node(nmp, ino, &node);
 
 			if (!error) {
 				DPRINTF(LOOKUP,
 				    ("\tnode retrieved/created OK\n"));
 				*vpp = NTOV(node);
 			}
 		}
 
 		/* Try to relock parent */
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 	} else {
 		DPRINTF(LOOKUP, ("\tlookup file\n"));
 		/* All other files */
 		/* Look up filename in the directory returning its inode */
 		name = cnp->cn_nameptr;
 		namelen = cnp->cn_namelen;
 		error = nandfs_lookup_name_in_dir(dvp, name, namelen,
 		    &ino, &found, &off);
 		if (error)
 			goto out;
 		if (!found) {
 			DPRINTF(LOOKUP, ("\tNOT found\n"));
 			/*
 			 * UGH, didn't find name. If we're creating or
 			 * renaming on the last name this is OK and we ought
 			 * to return EJUSTRETURN if its allowed to be created.
 			 */
 			error = ENOENT;
 			if ((nameiop == CREATE || nameiop == RENAME) &&
 			    islastcn) {
 				error = VOP_ACCESS(dvp, VWRITE, cred, td);
 				if (!error) {
 					/* keep the component name */
 					cnp->cn_flags |= SAVENAME;
 					error = EJUSTRETURN;
 				}
 			}
 			/* Done */
 		} else {
 			if (ino == NANDFS_WHT_INO)
 				cnp->cn_flags |= ISWHITEOUT;
 
 			if ((cnp->cn_flags & ISWHITEOUT) &&
 			    (nameiop == LOOKUP))
 				return (ENOENT);
 
 			if ((nameiop == DELETE) && islastcn) {
 				if ((cnp->cn_flags & ISWHITEOUT) &&
 				    (cnp->cn_flags & DOWHITEOUT)) {
 					cnp->cn_flags |= SAVENAME;
 					dir_node->nn_diroff = off;
 					return (EJUSTRETURN);
 				}
 
 				error = VOP_ACCESS(dvp, VWRITE, cred,
 				    cnp->cn_thread);
 				if (error)
 					return (error);
 
 				/* Try to create/reuse the node */
 				error = nandfs_get_node(nmp, ino, &node);
 				if (!error) {
 					*vpp = NTOV(node);
 					node->nn_diroff = off;
 				}
 
 				if ((dir_node->nn_inode.i_mode & ISVTX) &&
 				    cred->cr_uid != 0 &&
 				    cred->cr_uid != dir_node->nn_inode.i_uid &&
 				    node->nn_inode.i_uid != cred->cr_uid) {
 					vput(*vpp);
 					*vpp = NULL;
 					return (EPERM);
 				}
 			} else if ((nameiop == RENAME) && islastcn) {
 				error = VOP_ACCESS(dvp, VWRITE, cred,
 				    cnp->cn_thread);
 				if (error)
 					return (error);
 
 				/* Try to create/reuse the node */
 				error = nandfs_get_node(nmp, ino, &node);
 				if (!error) {
 					*vpp = NTOV(node);
 					node->nn_diroff = off;
 				}
 			} else {
 				/* Try to create/reuse the node */
 				error = nandfs_get_node(nmp, ino, &node);
 				if (!error) {
 					*vpp = NTOV(node);
 					node->nn_diroff = off;
 				}
 			}
 		}
 	}
 
 out:
 	/*
 	 * Store result in the cache if requested. If we are creating a file,
 	 * the file might not be found and thus putting it into the namecache
 	 * might be seen as negative caching.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(dvp, *vpp, cnp);
 
 	return (error);
 
 }
 
 /*
  * Getattr vnode operation: fill a struct vattr from the in-core inode.
  *
  * Pending time stamp updates are applied first.  The access time is
  * reported from the modification time fields of the inode.  Always
  * returns 0.
  */
 static int
 nandfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp;
 	struct vattr *vap;
 	struct nandfs_node *node;
 	struct nandfs_inode *ip;
 
 	vp = ap->a_vp;
 	vap = ap->a_vap;
 	node = VTON(vp);
 	ip = &node->nn_inode;
 
 	DPRINTF(VNCALL, ("%s: vp: %p\n", __func__, vp));
 	nandfs_itimes(vp);
 
 	VATTR_NULL(vap);
 
 	/* Identity and ownership */
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = node->nn_ino;
 	vap->va_type = IFTOVT(ip->i_mode);
 	vap->va_mode = ip->i_mode & ~S_IFMT;
 	vap->va_nlink = ip->i_links_count;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
 	vap->va_rdev = ip->i_special;
 	vap->va_flags = ip->i_flags;
 	vap->va_gen = 0;
 
 	/* Sizes */
 	vap->va_size = ip->i_size;
 	vap->va_blocksize = node->nn_nandfsdev->nd_blocksize;
 	vap->va_bytes = ip->i_blocks * vap->va_blocksize;
 
 	/* Time stamps */
 	vap->va_atime.tv_sec = ip->i_mtime;
 	vap->va_atime.tv_nsec = ip->i_mtime_nsec;
 	vap->va_mtime.tv_sec = ip->i_mtime;
 	vap->va_mtime.tv_nsec = ip->i_mtime_nsec;
 	vap->va_ctime.tv_sec = ip->i_ctime;
 	vap->va_ctime.tv_nsec = ip->i_ctime_nsec;
 
 	vap->va_filerev = 0;
 	vap->va_vaflags = 0;
 
 	return (0);
 }
 
 /*
  * Throw away all buffers of a vnode at or beyond logical block 'nblks'.
  *
  * Both the clean and the dirty buffer list of the vnode's buffer object are
  * walked; every buffer whose logical block number is not below 'nblks' is
  * marked B_INVAL | B_RELBUF and released.  For each dirty buffer dropped
  * the device's dirty buffer accounting is decremented.
  *
  * The vnode must be locked by the caller.  Always returns 0.
  */
 static int
 nandfs_vtruncbuf(struct vnode *vp, uint64_t nblks)
 {
 	struct nandfs_device *nffsdev;
 	struct bufobj *bo;
 	struct buf *bp, *nbp;
 
 	bo = &vp->v_bufobj;
 	nffsdev = VTON(vp)->nn_nandfsdev;
 
 	ASSERT_VOP_LOCKED(vp, "nandfs_truncate");
 restart:
 	BO_LOCK(bo);
 restart_locked:
 	/* Pass 1: clean buffers. */
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 		if (bp->b_lblkno < nblks)
 			continue;
 		/* Could not get the buffer without waiting: rescan the list. */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
 			goto restart_locked;
 
 		bremfree(bp);
 		bp->b_flags |= (B_INVAL | B_RELBUF);
 		bp->b_flags &= ~(B_ASYNC | B_MANAGED);
 		/* The buffer object lock is dropped around the release. */
 		BO_UNLOCK(bo);
 		brelse(bp);
 		BO_LOCK(bo);
 	}
 
 	/* Pass 2: dirty buffers. */
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		if (bp->b_lblkno < nblks)
 			continue;
 		/*
 		 * The buffer object lock is passed as interlock here and is
 		 * taken again at the bottom of the loop body; on ENOLCK the
 		 * whole scan starts over from 'restart', which re-locks.
 		 */
 		if (BUF_LOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 		    BO_LOCKPTR(bo)) == ENOLCK)
 			goto restart;
 		bp->b_flags |= (B_INVAL | B_RELBUF);
 		bp->b_flags &= ~(B_ASYNC | B_MANAGED);
 		brelse(bp);
 		nandfs_dirty_bufs_decrement(nffsdev);
 		BO_LOCK(bo);
 	}
 
 	BO_UNLOCK(bo);
 
 	return (0);
 }
 
 /*
  * Set the size of a file to 'newsize'.
  *
  * Growing (or keeping) the size only updates the inode and the pager.
  * Shrinking zeroes the tail of the new last block when 'newsize' is not
  * block aligned, drops the block mappings beyond the new end and discards
  * the buffers that covered them.
  *
  * Returns 0 on success or the first error reported by a helper.
  */
 static int
 nandfs_truncate(struct vnode *vp, uint64_t newsize)
 {
 	struct nandfs_node *node = VTON(vp);
 	struct nandfs_device *nffsdev = node->nn_nandfsdev;
 	struct nandfs_inode *inode = &node->nn_inode;
 	struct buf *bp = NULL;
 	uint64_t oblks, nblks, vblk, size, rest;
 	int error;
 
 	/* Current end of file */
 	size = inode->i_size;
 
 	if (newsize >= size) {
 		/* Nothing has to be released; at most the size grows. */
 		if (newsize > size) {
 			inode->i_size = newsize;
 			vnode_pager_setsize(vp, newsize);
 		}
 		node->nn_flags |= IN_CHANGE | IN_UPDATE;
 		nandfs_itimes(vp);
 		return (0);
 	}
 
 	nblks = howmany(newsize, nffsdev->nd_blocksize);
 	oblks = howmany(size, nffsdev->nd_blocksize);
 	rest = newsize % nffsdev->nd_blocksize;
 
 	if (rest != 0) {
 		/* Clear the part of the new last block beyond 'newsize'. */
 		error = nandfs_bmap_lookup(node, nblks - 1, &vblk);
 		if (error != 0)
 			return (error);
 
 		error = (vblk != 0) ?
 		    nandfs_bread(node, nblks - 1, NOCRED, 0, &bp) :
 		    nandfs_bcreate(node, nblks - 1, NOCRED, 0, &bp);
 		if (error != 0) {
 			if (bp != NULL)
 				brelse(bp);
 			return (error);
 		}
 
 		bzero((char *)bp->b_data + rest,
 		    (u_int)(nffsdev->nd_blocksize - rest));
 		error = nandfs_dirty_buf(bp, 0);
 		if (error != 0)
 			return (error);
 	}
 
 	DPRINTF(VNCALL, ("%s: vp %p oblks %jx nblks %jx\n", __func__, vp, oblks,
 	    nblks));
 
 	/* Drop the mappings first, then the buffers that covered them. */
 	error = nandfs_bmap_truncate_mapping(node, oblks - 1, nblks - 1);
 	if (error == 0)
 		error = nandfs_vtruncbuf(vp, nblks);
 	if (error != 0) {
 		if (bp != NULL)
 			nandfs_undirty_buf(bp);
 		return (error);
 	}
 
 	inode->i_size = newsize;
 	vnode_pager_setsize(vp, newsize);
 	node->nn_flags |= IN_CHANGE | IN_UPDATE;
 	nandfs_itimes(vp);
 
 	return (0);
 }
 
 static void
 nandfs_itimes_locked(struct vnode *vp)
 {
 	struct nandfs_node *node;
 	struct nandfs_inode *inode;
 	struct timespec ts;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 
 	node = VTON(vp);
 	inode = &node->nn_inode;
 
 	if ((node->nn_flags & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
 		return;
 
 	if (((vp->v_mount->mnt_kern_flag &
 	    (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) ||
 	    (node->nn_flags & (IN_CHANGE | IN_UPDATE)))
 		node->nn_flags |= IN_MODIFIED;
 
 	vfs_timestamp(&ts);
 	if (node->nn_flags & IN_UPDATE) {
 		inode->i_mtime = ts.tv_sec;
 		inode->i_mtime_nsec = ts.tv_nsec;
 	}
 	if (node->nn_flags & IN_CHANGE) {
 		inode->i_ctime = ts.tv_sec;
 		inode->i_ctime_nsec = ts.tv_nsec;
 	}
 
 	node->nn_flags &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
 }
 
 /*
  * Apply pending time stamp updates, taking the vnode interlock around the
  * call to nandfs_itimes_locked().
  */
 void
 nandfs_itimes(struct vnode *vp)
 {
 	VI_LOCK(vp);
 	nandfs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 static int
 nandfs_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td)
 {
 	struct nandfs_node *node = VTON(vp);
 	struct nandfs_inode *inode = &node->nn_inode;
 	uint16_t nmode;
 	int error = 0;
 
 	DPRINTF(VNCALL, ("%s: vp %p, mode %x, cred %p, td %p\n", __func__, vp,
 	    mode, cred, td));
 	/*
 	 * To modify the permissions on a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 		return (error);
 
 	/*
 	 * Privileged processes may set the sticky bit on non-directories,
 	 * as well as set the setgid bit on a file with a group that the
 	 * process is not a member of. Both of these are allowed in
 	 * jail(8).
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
 		if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0))
 			return (EFTYPE);
 	}
 	if (!groupmember(inode->i_gid, cred) && (mode & ISGID)) {
 		error = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Deny setting setuid if we are not the file owner.
 	 */
 	if ((mode & ISUID) && inode->i_uid != cred->cr_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0);
 		if (error)
 			return (error);
 	}
 
 	nmode = inode->i_mode;
 	nmode &= ~ALLPERMS;
 	nmode |= (mode & ALLPERMS);
 	inode->i_mode = nmode;
 	node->nn_flags |= IN_CHANGE;
 
 	DPRINTF(VNCALL, ("%s: to mode %x\n", __func__, nmode));
 
 	return (error);
 }
 
 /*
  * Change owner and/or group of a file.
  *
  * A uid or gid of VNOVAL leaves the respective id as it is.  When the
  * ownership really changes and the caller lacks PRIV_VFS_RETAINSUGID, the
  * set-id bits are cleared.  Returns 0 on success or the error of a failed
  * access or privilege check.
  */
 static int
 nandfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
     struct thread *td)
 {
 	struct nandfs_node *node;
 	struct nandfs_inode *ip;
 	uid_t prev_uid;
 	gid_t prev_gid;
 	int error;
 
 	node = VTON(vp);
 	ip = &node->nn_inode;
 
 	if (uid == (uid_t)VNOVAL)
 		uid = ip->i_uid;
 	if (gid == (gid_t)VNOVAL)
 		gid = ip->i_gid;
 
 	/* Changing the ownership requires VWRITE_OWNER on the file. */
 	error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * To change the owner of a file, or change the group of a file to a
 	 * group of which we are not a member, the caller must have
 	 * privilege.
 	 */
 	if ((uid != ip->i_uid && uid != cred->cr_uid) ||
 	    (gid != ip->i_gid && !groupmember(gid, cred))) {
 		error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
 		if (error != 0)
 			return (error);
 	}
 
 	prev_gid = ip->i_gid;
 	prev_uid = ip->i_uid;
 
 	ip->i_gid = gid;
 	ip->i_uid = uid;
 	node->nn_flags |= IN_CHANGE;
 
 	if ((ip->i_mode & (ISUID | ISGID)) != 0 &&
 	    (prev_uid != uid || prev_gid != gid) &&
 	    priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0) != 0)
 		ip->i_mode &= ~(ISUID | ISGID);
 
 	DPRINTF(VNCALL, ("%s: vp %p, cred %p, td %p - ret OK\n", __func__, vp,
 	    cred, td));
 	return (0);
 }
 
 static int
 nandfs_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nandfs_node *node = VTON(vp);
 	struct nandfs_inode *inode = &node->nn_inode;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	uint32_t flags;
 	int error = 0;
 
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 		DPRINTF(VNCALL, ("%s: unsettable attribute\n", __func__));
 		return (EINVAL);
 	}
 
 	if (vap->va_flags != VNOVAL) {
 		DPRINTF(VNCALL, ("%s: vp:%p td:%p flags:%lx\n", __func__, vp,
 		    td, vap->va_flags));
 
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * Callers may only modify the file flags on objects they
 		 * have VADMIN rights for.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 			return (error);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
 		 * otherwise, they behave like unprivileged processes.
 		 */
 
 		flags = inode->i_flags;
 		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
 			if (flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 				error = securelevel_gt(cred, 0);
 				if (error)
 					return (error);
 			}
 			/* Snapshot flag cannot be set or cleared */
 			if (((vap->va_flags & SF_SNAPSHOT) != 0 &&
 			    (flags & SF_SNAPSHOT) == 0) ||
 			    ((vap->va_flags & SF_SNAPSHOT) == 0 &&
 			    (flags & SF_SNAPSHOT) != 0))
 				return (EPERM);
 
 			inode->i_flags = vap->va_flags;
 		} else {
 			if (flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
 			    (vap->va_flags & UF_SETTABLE) != vap->va_flags)
 				return (EPERM);
 
 			flags &= SF_SETTABLE;
 			flags |= (vap->va_flags & UF_SETTABLE);
 			inode->i_flags = flags;
 		}
 		node->nn_flags |= IN_CHANGE;
 		if (vap->va_flags & (IMMUTABLE | APPEND))
 			return (0);
 	}
 	if (inode->i_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 
 	if (vap->va_size != (u_quad_t)VNOVAL) {
 		DPRINTF(VNCALL, ("%s: vp:%p td:%p size:%jx\n", __func__, vp, td,
 		    (uintmax_t)vap->va_size));
 
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			if ((inode->i_flags & SF_SNAPSHOT) != 0)
 				return (EPERM);
 			break;
 		default:
 			return (0);
 		}
 
 		if (vap->va_size > node->nn_nandfsdev->nd_maxfilesize)
 			return (EFBIG);
 
 		KASSERT((vp->v_type == VREG), ("Set size %d", vp->v_type));
 		nandfs_truncate(vp, vap->va_size);
 		node->nn_flags |= IN_CHANGE;
 
 		return (0);
 	}
 
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		DPRINTF(VNCALL, ("%s: vp:%p td:%p uid/gid %x/%x\n", __func__,
 		    vp, td, vap->va_uid, vap->va_gid));
 		error = nandfs_chown(vp, vap->va_uid, vap->va_gid, cred, td);
 		if (error)
 			return (error);
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		DPRINTF(VNCALL, ("%s: vp:%p td:%p mode %x\n", __func__, vp, td,
 		    vap->va_mode));
 
 		error = nandfs_chmod(vp, (int)vap->va_mode, cred, td);
 		if (error)
 			return (error);
 	}
 	if (vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL ||
 	    vap->va_birthtime.tv_sec != VNOVAL) {
 		DPRINTF(VNCALL, ("%s: vp:%p td:%p time a/m/b %jx/%jx/%jx\n",
 		    __func__, vp, td, (uintmax_t)vap->va_atime.tv_sec,
 		    (uintmax_t)vap->va_mtime.tv_sec,
 		    (uintmax_t)vap->va_birthtime.tv_sec));
 
 		if (vap->va_atime.tv_sec != VNOVAL)
 			node->nn_flags |= IN_ACCESS;
 		if (vap->va_mtime.tv_sec != VNOVAL)
 			node->nn_flags |= IN_CHANGE | IN_UPDATE;
 		if (vap->va_birthtime.tv_sec != VNOVAL)
 			node->nn_flags |= IN_MODIFIED;
 		nandfs_itimes(vp);
 		return (0);
 	}
 
 	return (0);
 }
 
 static int
 nandfs_open(struct vop_open_args *ap)
 {
 	struct nandfs_node *node = VTON(ap->a_vp);
 	uint64_t filesize;
 
 	DPRINTF(VNCALL, ("nandfs_open called ap->a_mode %x\n", ap->a_mode));
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	if ((node->nn_inode.i_flags & APPEND) &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
 
 	filesize = node->nn_inode.i_size;
 	vnode_create_vobject(ap->a_vp, filesize, ap->a_td);
 
 	return (0);
 }
 
 /*
  * Close vnode operation.
  *
  * Pending time stamp updates are applied while the vnode is still in use
  * by somebody else (use count above one).  Always returns 0.
  */
 static int
 nandfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp;
 	struct nandfs_node *node;
 
 	vp = ap->a_vp;
 	node = VTON(vp);
 
 	DPRINTF(VNCALL, ("%s: vp %p node %p\n", __func__, vp, node));
 
 	/* nandfs_itimes_locked() expects the vnode interlock to be held. */
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1)
 		nandfs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 
 	return (0);
 }
 
 /*
  * Check whether the requested access can be granted at all, independent of
  * who is asking.
  *
  * Returns EROFS for modifying access to a regular file, directory or
  * symbolic link on a read-only mount, EINVAL for an unknown vnode type,
  * EPERM for write access to an immutable file and 0 otherwise.
  */
 static int
 nandfs_check_possible(struct vnode *vp, struct vattr *vap, mode_t mode)
 {
 
 	switch (vap->va_type) {
 	case VBLK:
 	case VCHR:
 	case VSOCK:
 	case VFIFO:
 		/*
 		 * Special nodes: even on read-only mounted file systems
 		 * these are allowed to be written to if permissions allow.
 		 */
 		break;
 	case VDIR:
 	case VLNK:
 	case VREG:
 		/*
 		 * Normal nodes: modifying access is refused when the file
 		 * system is mounted read-only.
 		 */
 		if ((mode & VMODIFY_PERMS) != 0 &&
 		    (vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
 			return (EROFS);
 		break;
 	default:
 		/* No idea what this is */
 		return (EINVAL);
 	}
 
 	/* Immutable files cannot be written by anybody. */
 	if ((mode & VWRITE) != 0 &&
 	    (VTON(vp)->nn_inode.i_flags & IMMUTABLE) != 0)
 		return (EPERM);
 
 	return (0);
 }
 
 /*
  * Check the requested access against mode, owner and group taken from
  * 'vap' by means of vaccess().  Returns whatever vaccess() returns.
  */
 static int
 nandfs_check_permitted(struct vnode *vp, struct vattr *vap, mode_t mode,
     struct ucred *cred)
 {
 	int error;
 
 	error = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
 	    mode, cred, NULL);
 
 	return (error);
 }
 
 /*
  * Advisory locking vnode operation: hand the request to lf_advlock()
  * together with the lock list of the node and the current file size.
  */
 static int
 nandfs_advlock(struct vop_advlock_args *ap)
 {
 	struct nandfs_node *node = VTON(ap->a_vp);
 	quad_t size = node->nn_inode.i_size;
 
 	return (lf_advlock(ap, &node->nn_lockf, size));
 }
 
 static int
 nandfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	accmode_t accmode = ap->a_accmode;
 	struct ucred *cred = ap->a_cred;
 	struct vattr vap;
 	int error;
 
 	DPRINTF(VNCALL, ("%s: vp:%p mode: %x\n", __func__, vp, accmode));
 
 	error = VOP_GETATTR(vp, &vap, NULL);
 	if (error)
 		return (error);
 
 	error = nandfs_check_possible(vp, &vap, accmode);
 	if (error)
 		return (error);
 
 	error = nandfs_check_permitted(vp, &vap, accmode, cred);
 
 	return (error);
 }
 
 static int
 nandfs_print(struct vop_print_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nandfs_node *nvp = VTON(vp);
 
 	printf("\tvp=%p, nandfs_node=%p\n", vp, nvp);
 	printf("nandfs inode %#jx\n", (uintmax_t)nvp->nn_ino);
 	printf("flags = 0x%b\n", (u_int)nvp->nn_flags, PRINT_NODE_FLAGS);
 
 	return (0);
 }
 
 /*
  * Start the read of one file block.
  *
  * The logical block number found in the buffer is translated to a virtual
  * and then to a physical block number.  A failed translation completes the
  * buffer with EINVAL; an unmapped block (virtual block 0) completes it
  * zero-filled.  Otherwise the buffer is passed to the strategy routine of
  * the underlying device vnode.
  *
  * The buffer must describe exactly one file system block.
  */
 static void
 nandfs_read_filebuf(struct nandfs_node *node, struct buf *bp)
 {
 	struct nandfs_device *nandfsdev = node->nn_nandfsdev;
 	nandfs_daddr_t vblk, pblk;
 	nandfs_lbn_t from;
 	uint32_t blocksize;
 	int blk2dev;
 
 	/* Number of device blocks per file system block. */
 	blk2dev = nandfsdev->nd_blocksize / DEV_BSIZE;
 
 	blocksize = nandfsdev->nd_blocksize;
 	if (bp->b_bcount / blocksize != 1)
 		panic("invalid b_count in bp %p\n", bp);
 
 	from = bp->b_blkno;
 
 	DPRINTF(READ, ("\tread in from inode %#jx blkno %#jx"
 	    " count %#lx\n", (uintmax_t)node->nn_ino, from,
 	    bp->b_bcount));
 
 	/*
 	 * Logical -> virtual -> physical.  The second lookup only runs when
 	 * the first one succeeded; either failure ends the request.
 	 */
 	if (nandfs_bmap_lookup(node, from, &vblk) != 0 ||
 	    nandfs_vtop(node, vblk, &pblk) != 0) {
 		bp->b_error = EINVAL;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
 
 	/* Nothing has been transferred yet. */
 	bp->b_resid = bp->b_bcount;
 
 	/* Virtual block 0 marks a block that is not mapped. */
 	if (vblk == 0) {
 		vfs_bio_clrbuf(bp);
 		bufdone(bp);
 		return;
 	}
 
 	/* Issue the translated block to the device. */
 	bp->b_blkno = pblk * blk2dev;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	MPASS(bp->b_iooffset >= 0);
 	BO_STRATEGY(&nandfsdev->nd_devvp->v_bufobj, bp);
 	/*
 	 * NOTE(review): the buffer is touched after it was handed to the
 	 * strategy routine - confirm this cannot race with I/O completion.
 	 */
 	nandfs_vblk_set(bp, vblk);
 	DPRINTF(READ, ("read_filebuf : ino %#jx blk %#jx -> "
 	    "%#jx -> %#jx [bp %p]\n", (uintmax_t)node->nn_ino,
 	    (uintmax_t)(from), (uintmax_t)vblk,
 	    (uintmax_t)pblk, bp));
 }
 
 /*
  * Start the write of one buffer: derive the byte offset from the block
  * number already stored in the buffer and pass the buffer to the strategy
  * routine of the underlying device vnode.
  */
 static void
 nandfs_write_filebuf(struct nandfs_node *node, struct buf *bp)
 {
 	struct bufobj *devbo;
 
 	devbo = &node->nn_nandfsdev->nd_devvp->v_bufobj;
 
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	MPASS(bp->b_iooffset >= 0);
 	BO_STRATEGY(devbo, bp);
 }
 
 /*
  * Strategy vnode operation: dispatch a buffer to the read or the write
  * path.  Must not be called on block or character device vnodes.  Always
  * returns 0; errors are reported through the buffer.
  */
 static int
 nandfs_strategy(struct vop_strategy_args *ap)
 {
 	struct vnode *vp;
 	struct buf *bp;
 	struct nandfs_node *node;
 
 	vp = ap->a_vp;
 	bp = ap->a_bp;
 	node = VTON(vp);
 
 	/* check if we ought to be here */
 	KASSERT((vp->v_type != VBLK && vp->v_type != VCHR),
 	    ("nandfs_strategy on type %d", vp->v_type));
 
 	if (bp->b_iocmd == BIO_READ) {
 		/* Reads need their block number translated first. */
 		nandfs_read_filebuf(node, bp);
 	} else {
 		/* Everything else goes down the write path. */
 		nandfs_write_filebuf(node, bp);
 	}
 
 	return (0);
 }
 
 /*
  * Readdir vnode operation.
  *
  * Converts on-disk directory entries, starting at uio_offset, into struct
  * dirent records and copies them to the uio until either the end of the
  * directory is reached or the next record does not fit any more.  Entries
  * with inode number 0 are skipped.  On return uio_offset is the offset of
  * the first entry that was not transferred and *a_eofflag, when supplied,
  * tells whether the end of the directory was reached.
  *
  * Returns ENOTDIR for non-directories, EIO when a directory block cannot be
  * read or holds a record of length zero, or the error of a failed copy.
  */
 static int
 nandfs_readdir(struct vop_readdir_args *ap)
 {
 	struct uio *uio = ap->a_uio;
 	struct vnode *vp = ap->a_vp;
 	struct nandfs_node *node = VTON(vp);
 	struct nandfs_dir_entry *ndirent;
 	struct dirent dirent;
 	struct buf *bp;
 	uint64_t file_size, diroffset, transoffset, blkoff;
 	uint64_t blocknr;
 	uint32_t blocksize = node->nn_nandfsdev->nd_blocksize;
 	uint8_t *pos, name_len;
 	int error;
 
 	DPRINTF(READDIR, ("nandfs_readdir called\n"));
 
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	file_size = node->nn_inode.i_size;
 	DPRINTF(READDIR, ("nandfs_readdir filesize %jd resid %zd\n",
 	    (uintmax_t)file_size, uio->uio_resid ));
 
 	/* We are called just as long as we keep on pushing data in */
 	error = 0;
 	if ((uio->uio_offset < file_size) &&
 	    (uio->uio_resid >= sizeof(struct dirent))) {
 		diroffset = uio->uio_offset;
 		transoffset = diroffset;
 
 		blocknr = diroffset / blocksize;
 		blkoff = diroffset % blocksize;
 		error = nandfs_bread(node, blocknr, NOCRED, 0, &bp);
 		if (error) {
 			brelse(bp);
 			return (EIO);
 		}
 		while (diroffset < file_size) {
 			DPRINTF(READDIR, ("readdir : offset = %"PRIu64"\n",
 			    diroffset));
 			/* Move on to the next block once this one is used up. */
 			if (blkoff >= blocksize) {
 				blkoff = 0; blocknr++;
 				brelse(bp);
 				error = nandfs_bread(node, blocknr, NOCRED, 0,
 				    &bp);
 				if (error) {
 					brelse(bp);
 					return (EIO);
 				}
 			}
 
 			/* Read in one dirent */
 			pos = (uint8_t *)bp->b_data + blkoff;
 			ndirent = (struct nandfs_dir_entry *)pos;
 
 			/*
 			 * A record of length zero would never let the scan
 			 * advance; treat it as a damaged directory.
 			 */
 			if (ndirent->rec_len == 0) {
 				error = EIO;
 				break;
 			}
 
 			name_len = ndirent->name_len;
 			/* Zeroing first also terminates the copied name. */
 			memset(&dirent, 0, sizeof(struct dirent));
 			dirent.d_fileno = ndirent->inode;
 			if (dirent.d_fileno) {
 				dirent.d_type = ndirent->file_type;
 				dirent.d_namlen = name_len;
 				strncpy(dirent.d_name, ndirent->name, name_len);
 				dirent.d_reclen = GENERIC_DIRSIZ(&dirent);
+				/* NOTE: d_off is the offset of the *next* entry. */
+				dirent.d_off = diroffset + ndirent->rec_len;
 				DPRINTF(READDIR, ("copying `%*.*s`\n", name_len,
 				    name_len, dirent.d_name));
 			}
 
 			/*
 			 * If there isn't enough space in the uio to return a
 			 * whole dirent, break off read
 			 */
 			if (uio->uio_resid < GENERIC_DIRSIZ(&dirent))
 				break;
 
 			/* Transfer; stop at the first entry that fails. */
 			if (dirent.d_fileno) {
 				error = uiomove(&dirent,
 				    GENERIC_DIRSIZ(&dirent), uio);
 				if (error)
 					break;
 			}
 
 			/* Advance */
 			diroffset += ndirent->rec_len;
 			blkoff += ndirent->rec_len;
 
 			/* Remember the last entry we transferred */
 			transoffset = diroffset;
 		}
 		brelse(bp);
 
 		/* Pass on last transferred offset */
 		uio->uio_offset = transoffset;
 	}
 
 	if (ap->a_eofflag)
 		*ap->a_eofflag = (uio->uio_offset >= file_size);
 
 	return (error);
 }
 
 /*
  * Report whether the directory dvp holds no entries other than "." and "..".
  *
  * Each directory block is read in turn and its records are walked using
  * rec_len.  Records whose inode is 0 are skipped.  Returns 1 when only "."
  * and ".." were seen, and 0 when any other name was found or when a block
  * could not be read.  parentino is only used to sanity-check ".." under
  * KASSERT; cred is only used in the debug trace.
  */
 static int
 nandfs_dirempty(struct vnode *dvp, uint64_t parentino, struct ucred *cred)
 {
 	struct nandfs_node *dnode = VTON(dvp);
 	struct nandfs_dir_entry *dirent;
 	uint64_t file_size = dnode->nn_inode.i_size;
 	uint64_t blockcount = dnode->nn_inode.i_blocks;
 	uint64_t blocknr;
 	uint32_t blocksize = dnode->nn_nandfsdev->nd_blocksize;
 	uint32_t limit;
 	uint32_t off;
 	uint8_t	*pos;
 	struct buf *bp;
 	int error;
 
 	DPRINTF(LOOKUP, ("%s: dvp %p parentino %#jx cred %p\n", __func__, dvp,
 	    (uintmax_t)parentino, cred));
 
 	KASSERT((file_size != 0), ("nandfs_dirempty for NULL dir %p", dvp));
 
 	blocknr = 0;
 	while (blocknr < blockcount) {
 		error = nandfs_bread(dnode, blocknr, NOCRED, 0, &bp);
 		if (error) {
 			/* An unreadable directory is reported as "not empty". */
 			brelse(bp);
 			return (0);
 		}
 
 		pos = (uint8_t *)bp->b_data;
 		off = 0;
 
 		/*
 		 * NOTE(review): for the last block the limit is
 		 * file_size % blocksize, which is 0 when the directory size
 		 * is an exact multiple of the block size -- confirm that this
 		 * is intended.
 		 */
 		if (blocknr == (blockcount - 1))
 			limit = file_size % blocksize;
 		else
 			limit = blocksize;
 
 		while (off < limit) {
 			dirent = (struct nandfs_dir_entry *)(pos + off);
 			off += dirent->rec_len;
 
 			/* Unused slot. */
 			if (dirent->inode == 0)
 				continue;
 
 			switch (dirent->name_len) {
 			case 0:
 				break;
 			case 1:
 				if (dirent->name[0] != '.')
 					goto notempty;
 
 				KASSERT(dirent->inode == dnode->nn_ino,
 				    (".'s inode does not match dir"));
 				break;
 			case 2:
 				/*
 				 * Only the exact name ".." is allowed; any
 				 * other two-character name (".a", "a.", ...)
 				 * means the directory is not empty.
 				 */
 				if (dirent->name[0] != '.' ||
 				    dirent->name[1] != '.')
 					goto notempty;
 
 				KASSERT(dirent->inode == parentino,
 				    ("..'s inode does not match parent"));
 				break;
 			default:
 				goto notempty;
 			}
 		}
 
 		brelse(bp);
 		blocknr++;
 	}
 
 	return (1);
 notempty:
 	brelse(bp);
 	return (0);
 }
 
 /*
  * VOP_LINK: add a new name (ap->a_cnp) in directory ap->a_tdvp that refers
  * to the existing vnode ap->a_vp.
  *
  * Returns EMLINK when the link count is already at NANDFS_LINK_MAX, EPERM
  * for immutable or append-only inodes, the error from nandfs_add_dirent()
  * when the directory entry could not be added, and 0 on success.
  */
 static int
 nandfs_link(struct vop_link_args *ap)
 {
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *vp = ap->a_vp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nandfs_node *node = VTON(vp);
 	struct nandfs_inode *inode = &node->nn_inode;
 	int error;
 
 	if (inode->i_links_count >= NANDFS_LINK_MAX)
 		return (EMLINK);
 
 	if (inode->i_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 
 	/* Update link count */
 	inode->i_links_count++;
 
 	/* Add dir entry */
 	error = nandfs_add_dirent(tdvp, node->nn_ino, cnp->cn_nameptr,
 	    cnp->cn_namelen, IFTODT(inode->i_mode));
 	if (error) {
 		/* Undo the speculative bump and report the failure. */
 		inode->i_links_count--;
 		return (error);
 	}
 
 	node->nn_flags |= IN_CHANGE;
 	nandfs_itimes(vp);
 	DPRINTF(VNCALL, ("%s: tdvp %p vp %p cnp %p\n",
 	    __func__, tdvp, vp, cnp));
 
 	return (0);
 }
 
 static int
 nandfs_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	uint16_t mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
 	struct nandfs_node *dir_node = VTON(dvp);
 	struct nandfsmount *nmp = dir_node->nn_nmp;
 	struct nandfs_node *node;
 	int error;
 
 	DPRINTF(VNCALL, ("%s: dvp %p\n", __func__, dvp));
 
 	if (nandfs_fs_full(dir_node->nn_nandfsdev))
 		return (ENOSPC);
 
 	/* Create new vnode/inode */
 	error = nandfs_node_create(nmp, &node, mode);
 	if (error)
 		return (error);
 	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
 	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
 
 	/* Add new dir entry */
 	error = nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
 	    cnp->cn_namelen, IFTODT(mode));
 	if (error) {
 		if (nandfs_node_destroy(node)) {
 			nandfs_error("%s: error destroying node %p\n",
 			    __func__, node);
 		}
 		return (error);
 	}
 	*vpp = NTOV(node);
 	if ((cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(dvp, *vpp, cnp);
 
 	DPRINTF(VNCALL, ("created file vp %p nandnode %p ino %jx\n", *vpp, node,
 	    (uintmax_t)node->nn_ino));
 	return (0);
 }
 
 static int
 nandfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct nandfs_node *node = VTON(vp);
 	struct nandfs_node *dnode = VTON(dvp);
 	struct componentname *cnp = ap->a_cnp;
 
 	DPRINTF(VNCALL, ("%s: dvp %p vp %p nandnode %p ino %#jx link %d\n",
 	    __func__, dvp, vp, node, (uintmax_t)node->nn_ino,
 	    node->nn_inode.i_links_count));
 
 	if (vp->v_type == VDIR)
 		return (EISDIR);
 
 	/* Files marked as immutable or append-only cannot be deleted. */
 	if ((node->nn_inode.i_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
 	    (dnode->nn_inode.i_flags & APPEND))
 		return (EPERM);
 
 	nandfs_remove_dirent(dvp, node, cnp);
 	node->nn_inode.i_links_count--;
 	node->nn_flags |= IN_CHANGE;
 
 	return (0);
 }
 
 /*
  * Check if source directory is in the path of the target directory.
  * Target is supplied locked, source is unlocked.
  * The target is always vput before returning.
  */
 static int
 nandfs_checkpath(struct nandfs_node *src, struct nandfs_node *dest,
     struct ucred *cred)
 {
 	struct vnode *vp;
 	int error, rootino;
 	struct nandfs_dir_entry dirent;
 
 	vp = NTOV(dest);
 	if (src->nn_ino == dest->nn_ino) {
 		error = EEXIST;
 		goto out;
 	}
 	rootino = NANDFS_ROOT_INO;
 	error = 0;
 	if (dest->nn_ino == rootino)
 		goto out;
 
 	for (;;) {
 		if (vp->v_type != VDIR) {
 			error = ENOTDIR;
 			break;
 		}
 
 		error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirent,
 		    NANDFS_DIR_REC_LEN(2), (off_t)0, UIO_SYSSPACE,
 		    IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED,
 		    NULL, NULL);
 		if (error != 0)
 			break;
 		if (dirent.name_len != 2 ||
 		    dirent.name[0] != '.' ||
 		    dirent.name[1] != '.') {
 			error = ENOTDIR;
 			break;
 		}
 		if (dirent.inode == src->nn_ino) {
 			error = EINVAL;
 			break;
 		}
 		if (dirent.inode == rootino)
 			break;
 		vput(vp);
 		if ((error = VFS_VGET(vp->v_mount, dirent.inode,
 		    LK_EXCLUSIVE, &vp)) != 0) {
 			vp = NULL;
 			break;
 		}
 	}
 
 out:
 	if (error == ENOTDIR)
 		printf("checkpath: .. not a directory\n");
 	if (vp != NULL)
 		vput(vp);
 	return (error);
 }
 
 /*
  * VOP_RENAME: move the entry fcnp in directory fdvp (vnode fvp) to the
  * name tcnp in directory tdvp, replacing the existing target tvp if there
  * is one.
  *
  * The function has three exit regimes:
  *  - "abortit": used before any state was modified; tdvp/tvp are released
  *    with vput (vrele for tdvp when tdvp == tvp) and fdvp/fvp with vrele.
  *  - "bad"/"out": used after the source link count was bumped; the bump
  *    and the IN_RENAME flag are undone and fvp is released.
  *  - normal completion after the source entry has been unlinked.
  */
 static int
 nandfs_rename(struct vop_rename_args *ap)
 {
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	int doingdirectory = 0, oldparent = 0, newparent = 0;
 	int error = 0;
 
 	struct nandfs_node *fdnode, *fnode, *fnode1;
 	struct nandfs_node *tdnode = VTON(tdvp);
 	struct nandfs_node *tnode;
 
 	uint32_t tdflags, fflags, fdflags;
 	uint16_t mode;
 
 	DPRINTF(VNCALL, ("%s: fdvp:%p fvp:%p tdvp:%p tdp:%p\n", __func__, fdvp,
 	    fvp, tdvp, tvp));
 
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		/* Common early-failure exit; also reached via goto below. */
 abortit:
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fdvp);
 		vrele(fvp);
 		return (error);
 	}
 
 	/* Refuse to replace a protected target or write to an append-only dir. */
 	tdflags = tdnode->nn_inode.i_flags;
 	if (tvp &&
 	    ((VTON(tvp)->nn_inode.i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (tdflags & APPEND))) {
 		error = EPERM;
 		goto abortit;
 	}
 
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
 	 * not call us in that case.  Temporarily just warn if they do.
 	 */
 	if (fvp == tvp) {
 		printf("nandfs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto abortit;
 	}
 
 	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 		goto abortit;
 
 	fdnode = VTON(fdvp);
 	fnode = VTON(fvp);
 
 	/* The link count is bumped temporarily below, so it must have room. */
 	if (fnode->nn_inode.i_links_count >= NANDFS_LINK_MAX) {
 		VOP_UNLOCK(fvp, 0);
 		error = EMLINK;
 		goto abortit;
 	}
 
 	fflags = fnode->nn_inode.i_flags;
 	fdflags = fdnode->nn_inode.i_flags;
 
 	if ((fflags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (fdflags & APPEND)) {
 		VOP_UNLOCK(fvp, 0);
 		error = EPERM;
 		goto abortit;
 	}
 
 	mode = fnode->nn_inode.i_mode;
 	if ((mode & S_IFMT) == S_IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    (fdvp == fvp) ||
 		    ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) ||
 		    (fnode->nn_flags & IN_RENAME)) {
 			VOP_UNLOCK(fvp, 0);
 			error = EINVAL;
 			goto abortit;
 		}
 		fnode->nn_flags |= IN_RENAME;
 		doingdirectory = 1;
 		DPRINTF(VNCALL, ("%s: doingdirectory dvp %p\n", __func__,
 		    tdvp));
 		oldparent = fdnode->nn_ino;
 	}
 
 	/* From here on the abortit path must no longer be used. */
 	vrele(fdvp);
 
 	tnode = NULL;
 	if (tvp)
 		tnode = VTON(tvp);
 
 	/*
 	 * Bump link count on fvp while we are moving stuff around. If we
 	 * crash before completing the work, the link count may be wrong
 	 * but correctable.
 	 */
 	fnode->nn_inode.i_links_count++;
 
 	/* Check for in path moving XXX */
 	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0);
 	/*
 	 * NOTE(review): oldparent is only set for directories, so for plain
 	 * files newparent is set whenever tdnode->nn_ino is non-zero; it is
 	 * only consulted together with doingdirectory below.
 	 */
 	if (oldparent != tdnode->nn_ino)
 		newparent = tdnode->nn_ino;
 	if (doingdirectory && newparent) {
 		if (error)	/* write access check above */
 			goto bad;
 		if (tnode != NULL)
 			vput(tvp);
 
 		/* nandfs_checkpath() always vputs tdvp (via tdnode). */
 		error = nandfs_checkpath(fnode, tdnode, tcnp->cn_cred);
 		if (error)
 			goto out;
 
 		VREF(tdvp);
 		error = relookup(tdvp, &tvp, tcnp);
 		if (error)
 			goto out;
 		vrele(tdvp);
 		tdnode = VTON(tdvp);
 		tnode = NULL;
 		if (tvp)
 			tnode = VTON(tvp);
 	}
 
 	/*
 	 * If the target doesn't exist, link the target to the source and
 	 * unlink the source. Otherwise, rewrite the target directory to
 	 * reference the source and remove the original entry.
 	 */
 
 	if (tvp == NULL) {
 		/*
 		 * Account for ".." in new directory.
 		 */
 		if (doingdirectory && fdvp != tdvp)
 			tdnode->nn_inode.i_links_count++;
 
 		DPRINTF(VNCALL, ("%s: new entry in dvp:%p\n", __func__, tdvp));
 		/*
 		 * Add name in new directory.
 		 */
 		error = nandfs_add_dirent(tdvp, fnode->nn_ino, tcnp->cn_nameptr,
 		    tcnp->cn_namelen, IFTODT(fnode->nn_inode.i_mode));
 		if (error) {
 			/* Undo the ".." accounting done just above. */
 			if (doingdirectory && fdvp != tdvp)
 				tdnode->nn_inode.i_links_count--;
 			goto bad;
 		}
 
 		vput(tdvp);
 	} else {
 		/*
 		 * If the parent directory is "sticky", then the user must
 		 * own the parent directory, or the destination of the rename,
 		 * otherwise the destination may not be changed (except by
 		 * root). This implements append-only directories.
 		 */
 		if ((tdnode->nn_inode.i_mode & S_ISTXT) &&
 		    tcnp->cn_cred->cr_uid != 0 &&
 		    tcnp->cn_cred->cr_uid != tdnode->nn_inode.i_uid &&
 		    tnode->nn_inode.i_uid != tcnp->cn_cred->cr_uid) {
 			error = EPERM;
 			goto bad;
 		}
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		mode = tnode->nn_inode.i_mode;
 		if ((mode & S_IFMT) == S_IFDIR) {
 			if (!nandfs_dirempty(tvp, tdnode->nn_ino,
 			    tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			/*
 			 * Update name cache since directory is going away.
 			 */
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 
 		DPRINTF(VNCALL, ("%s: update entry dvp:%p\n", __func__, tdvp));
 		/*
 		 * Change name tcnp in tdvp to point at fvp.
 		 */
 		error = nandfs_update_dirent(tdvp, fnode, tnode);
 		if (error)
 			goto bad;
 
 		if (doingdirectory && !newparent)
 			tdnode->nn_inode.i_links_count--;
 
 		vput(tdvp);
 
 		/* The replaced target loses the name that now refers to fvp. */
 		tnode->nn_inode.i_links_count--;
 		vput(tvp);
 		tnode = NULL;
 	}
 
 	/*
 	 * Unlink the source.
 	 */
 	fcnp->cn_flags &= ~MODMASK;
 	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 	VREF(fdvp);
 	error = relookup(fdvp, &fvp, fcnp);
 	if (error == 0)
 		vrele(fdvp);
 	if (fvp != NULL) {
 		fnode1 = VTON(fvp);
 		fdnode = VTON(fdvp);
 	} else {
 		/*
 		 * From name has disappeared.
 		 */
 		if (doingdirectory)
 			panic("nandfs_rename: lost dir entry");
 		vrele(ap->a_fvp);
 		return (0);
 	}
 
 	DPRINTF(VNCALL, ("%s: unlink source fnode:%p\n", __func__, fnode));
 
 	/*
 	 * Ensure that the directory entry still exists and has not
 	 * changed while the new name has been entered. If the source is
 	 * a file then the entry may have been unlinked or renamed. In
 	 * either case there is no further work to be done. If the source
 	 * is a directory then it cannot have been rmdir'ed; its link
 	 * count of three would cause a rmdir to fail with ENOTEMPTY.
 	 * The IN_RENAME flag ensures that it cannot be moved by another
 	 * rename.
 	 */
 	if (fnode != fnode1) {
 		if (doingdirectory)
 			panic("nandfs: lost dir entry");
 	} else {
 		/*
 		 * If the source is a directory with a
 		 * new parent, the link count of the old
 		 * parent directory must be decremented
 		 * and ".." set to point to the new parent.
 		 */
 		if (doingdirectory && newparent) {
 			DPRINTF(VNCALL, ("%s: new parent %#jx -> %#jx\n",
 			    __func__, (uintmax_t) oldparent,
 			    (uintmax_t) newparent));
 			error = nandfs_update_parent_dir(fvp, newparent);
 			if (!error) {
 				fdnode->nn_inode.i_links_count--;
 				fdnode->nn_flags |= IN_CHANGE;
 			}
 		}
 		/*
 		 * NOTE(review): an error from nandfs_update_parent_dir() is
 		 * overwritten by the assignment below.
 		 */
 		error = nandfs_remove_dirent(fdvp, fnode, fcnp);
 		if (!error) {
 			/* Drop the temporary link taken before the move. */
 			fnode->nn_inode.i_links_count--;
 			fnode->nn_flags |= IN_CHANGE;
 		}
 		fnode->nn_flags &= ~IN_RENAME;
 	}
 	if (fdnode)
 		vput(fdvp);
 	if (fnode)
 		vput(fvp);
 	vrele(ap->a_fvp);
 	return (error);
 
 	/* Failure after the link bump, with tdvp (and maybe tvp) still held. */
 bad:
 	DPRINTF(VNCALL, ("%s: error:%d\n", __func__, error));
 	if (tnode)
 		vput(NTOV(tnode));
 	vput(NTOV(tdnode));
 	/* Failure after the link bump, with the target vnodes already released. */
 out:
 	if (doingdirectory)
 		fnode->nn_flags &= ~IN_RENAME;
 	if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
 		fnode->nn_inode.i_links_count--;
 		fnode->nn_flags |= IN_CHANGE;
 		fnode->nn_flags &= ~IN_RENAME;
 		vput(fvp);
 	} else
 		vrele(fvp);
 	return (error);
 }
 
 static int
 nandfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nandfs_node *dir_node = VTON(dvp);
 	struct nandfs_inode *dir_inode = &dir_node->nn_inode;
 	struct nandfs_node *node;
 	struct nandfsmount *nmp = dir_node->nn_nmp;
 	uint16_t mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
 	int error;
 
 	DPRINTF(VNCALL, ("%s: dvp %p\n", __func__, dvp));
 
 	if (nandfs_fs_full(dir_node->nn_nandfsdev))
 		return (ENOSPC);
 
 	if (dir_inode->i_links_count >= NANDFS_LINK_MAX)
 		return (EMLINK);
 
 	error = nandfs_node_create(nmp, &node, mode);
 	if (error)
 		return (error);
 
 	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
 	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
 
 	*vpp = NTOV(node);
 
 	error = nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
 	    cnp->cn_namelen, IFTODT(mode));
 	if (error) {
 		vput(*vpp);
 		return (error);
 	}
 
 	dir_node->nn_inode.i_links_count++;
 	dir_node->nn_flags |= IN_CHANGE;
 
 	error = nandfs_init_dir(NTOV(node), node->nn_ino, dir_node->nn_ino);
 	if (error) {
 		vput(NTOV(node));
 		return (error);
 	}
 
 	DPRINTF(VNCALL, ("created dir vp %p nandnode %p ino %jx\n", *vpp, node,
 	    (uintmax_t)node->nn_ino));
 	return (0);
 }
 
 static int
 nandfs_mknod(struct vop_mknod_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct vattr *vap = ap->a_vap;
 	uint16_t mode = MAKEIMODE(vap->va_type, vap->va_mode);
 	struct componentname *cnp = ap->a_cnp;
 	struct nandfs_node *dir_node = VTON(dvp);
 	struct nandfsmount *nmp = dir_node->nn_nmp;
 	struct nandfs_node *node;
 	int error;
 
 	if (nandfs_fs_full(dir_node->nn_nandfsdev))
 		return (ENOSPC);
 
 	error = nandfs_node_create(nmp, &node, mode);
 	if (error)
 		return (error);
 	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
 	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
 	if (vap->va_rdev != VNOVAL)
 		node->nn_inode.i_special = vap->va_rdev;
 
 	*vpp = NTOV(node);
 
 	if (nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
 	    cnp->cn_namelen, IFTODT(mode))) {
 		vput(*vpp);
 		return (ENOTDIR);
 	}
 
 	node->nn_flags |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 
 	return (0);
 }
 
 static int
 nandfs_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *dvp = ap->a_dvp;
 	uint16_t mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
 	struct componentname *cnp = ap->a_cnp;
 	struct nandfs_node *dir_node = VTON(dvp);
 	struct nandfsmount *nmp = dir_node->nn_nmp;
 	struct nandfs_node *node;
 	int len, error;
 
 	if (nandfs_fs_full(dir_node->nn_nandfsdev))
 		return (ENOSPC);
 
 	error = nandfs_node_create(nmp, &node, S_IFLNK | mode);
 	if (error)
 		return (error);
 	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
 	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
 
 	*vpp = NTOV(node);
 
 	if (nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
 	    cnp->cn_namelen, IFTODT(mode))) {
 		vput(*vpp);
 		return (ENOTDIR);
 	}
 
 
 	len = strlen(ap->a_target);
 	error = vn_rdwr(UIO_WRITE, *vpp, ap->a_target, len, (off_t)0,
 	    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 	    cnp->cn_cred, NOCRED, NULL, NULL);
 	if (error)
 		vput(*vpp);
 
 	return (error);
 }
 
 /*
  * VOP_READLINK: return the link contents by reading the vnode's data with
  * VOP_READ using the caller's uio and credentials.
  */
 static int
 nandfs_readlink(struct vop_readlink_args *ap)
 {
 
 	return (VOP_READ(ap->a_vp, ap->a_uio, 0, ap->a_cred));
 }
 
 static int
 nandfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nandfs_node *node, *dnode;
 	uint32_t dflag, flag;
 	int error = 0;
 
 	node = VTON(vp);
 	dnode = VTON(dvp);
 
 	/* Files marked as immutable or append-only cannot be deleted. */
 	if ((node->nn_inode.i_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
 	    (dnode->nn_inode.i_flags & APPEND))
 		return (EPERM);
 
 	DPRINTF(VNCALL, ("%s: dvp %p vp %p nandnode %p ino %#jx\n", __func__,
 	    dvp, vp, node, (uintmax_t)node->nn_ino));
 
 	if (node->nn_inode.i_links_count < 2)
 		return (EINVAL);
 
 	if (!nandfs_dirempty(vp, dnode->nn_ino, cnp->cn_cred))
 		return (ENOTEMPTY);
 
 	/* Files marked as immutable or append-only cannot be deleted. */
 	dflag = dnode->nn_inode.i_flags;
 	flag = node->nn_inode.i_flags;
 	if ((dflag & APPEND) ||
 	    (flag & (NOUNLINK | IMMUTABLE | APPEND))) {
 		return (EPERM);
 	}
 
 	if (vp->v_mountedhere != 0)
 		return (EINVAL);
 
 	nandfs_remove_dirent(dvp, node, cnp);
 	dnode->nn_inode.i_links_count -= 1;
 	dnode->nn_flags |= IN_CHANGE;
 
 	cache_purge(dvp);
 
 	error = nandfs_truncate(vp, (uint64_t)0);
 	if (error)
 		return (error);
 
 	node->nn_inode.i_links_count -= 2;
 	node->nn_flags |= IN_CHANGE;
 
 	cache_purge(vp);
 
 	return (error);
 }
 
 /*
  * VOP_FSYNC: request a sync from the nandfs syncer when the node has been
  * modified or the vnode has dirty buffers.  Always returns 0.
  */
 static int
 nandfs_fsync(struct vop_fsync_args *ap)
 {
 	struct nandfs_node *node;
 	struct vnode *vp;
 	int lkstate;
 
 	vp = ap->a_vp;
 	node = VTON(vp);
 
 	DPRINTF(VNCALL, ("%s: vp %p nandnode %p ino %#jx\n", __func__, vp,
 	    node, (uintmax_t)node->nn_ino));
 
 	/* Nothing to do for a clean inode without dirty buffers. */
 	if (!((node->nn_flags & IN_MODIFIED) ||
 	    vp->v_bufobj.bo_dirty.bv_cnt))
 		return (0);
 
 	/*
 	 * Release the vnode lock while waiting for the syncer, then take
 	 * it back in the mode it was held in before.
 	 */
 	lkstate = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 	nandfs_wakeup_wait_sync(node->nn_nandfsdev, SYNCER_FSYNC);
 	VOP_LOCK(vp, lkstate | LK_RETRY);
 
 	return (0);
 }
 
 /*
  * VOP_BMAP: translate logical block ap->a_bn of the vnode into a block
  * number on the underlying device, in DEV_BSIZE units.
  *
  * *ap->a_bop (if requested) is set to the device vnode's buffer object.
  * No read-ahead/read-behind is advertised: *a_runp and *a_runb are set
  * to 0.  *ap->a_bnp is set to -1 when the logical block is not mapped.
  * Returns 0 on success or the errno from the lookup helpers; it must not
  * return -1, which is ERESTART inside the kernel.
  */
 static int
 nandfs_bmap(struct vop_bmap_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nandfs_node *nnode = VTON(vp);
 	struct nandfs_device *nandfsdev = nnode->nn_nandfsdev;
 	nandfs_daddr_t l2vmap, v2pmap;
 	int error;
 	int blk2dev = nandfsdev->nd_blocksize / DEV_BSIZE;
 
 	DPRINTF(VNCALL, ("%s: vp %p nandnode %p ino %#jx\n", __func__, vp,
 	    nnode, (uintmax_t)nnode->nn_ino));
 
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &nandfsdev->nd_devvp->v_bufobj;
 	if (ap->a_bnp == NULL)
 		return (0);
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 
 	/*
 	 * Translate all the block sectors into a series of buffers to read
 	 * asynchronously from the nandfs device. Note that this lookup may
 	 * induce readin's too.
 	 */
 
 	/* Get virtual block numbers for the vnode's buffer span */
 	error = nandfs_bmap_lookup(nnode, ap->a_bn, &l2vmap);
 	if (error)
 		return (error);
 
 	/* Translate virtual block numbers to physical block numbers */
 	error = nandfs_vtop(nnode, l2vmap, &v2pmap);
 	if (error)
 		return (error);
 
 	/* Note virtual block 0 marks not mapped */
 	if (l2vmap == 0)
 		*ap->a_bnp = -1;
 	else
 		*ap->a_bnp = v2pmap * blk2dev;	/* in DEV_BSIZE */
 
 	DPRINTF(VNCALL, ("%s: vp %p nandnode %p ino %#jx lblk %jx -> blk %jx\n",
 	    __func__, vp, nnode, (uintmax_t)nnode->nn_ino, (uintmax_t)ap->a_bn,
 	    (uintmax_t)*ap->a_bnp ));
 
 	return (0);
 }
 
 /*
  * Flag the mount with NANDFS_FORCE_SYNCER and call
  * nandfs_wakeup_wait_sync() on its device with SYNCER_FFORCE.
  */
 static void
 nandfs_force_syncer(struct nandfsmount *nmp)
 {
 	struct nandfs_device *fsdev = nmp->nm_nandfsdev;
 
 	nmp->nm_flags |= NANDFS_FORCE_SYNCER;
 	nandfs_wakeup_wait_sync(fsdev, SYNCER_FFORCE);
 }
 
 static int
 nandfs_ioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	u_long command = ap->a_command;
 	caddr_t data = ap->a_data;
 	struct nandfs_node *node = VTON(vp);
 	struct nandfs_device *nandfsdev = node->nn_nandfsdev;
 	struct nandfsmount *nmp = node->nn_nmp;
 	uint64_t *tab, *cno;
 	struct nandfs_seg_stat *nss;
 	struct nandfs_cpmode *ncpm;
 	struct nandfs_argv *nargv;
 	struct nandfs_cpstat *ncp;
 	int error;
 
 	DPRINTF(VNCALL, ("%s: %x\n", __func__, (uint32_t)command));
 
 	error = priv_check(ap->a_td, PRIV_VFS_MOUNT);
 	if (error)
 		return (error);
 
 	if (nmp->nm_ronly) {
 		switch (command) {
 		case NANDFS_IOCTL_GET_FSINFO:
 		case NANDFS_IOCTL_GET_SUSTAT:
 		case NANDFS_IOCTL_GET_CPINFO:
 		case NANDFS_IOCTL_GET_CPSTAT:
 		case NANDFS_IOCTL_GET_SUINFO:
 		case NANDFS_IOCTL_GET_VINFO:
 		case NANDFS_IOCTL_GET_BDESCS:
 			break;
 		default:
 			return (EROFS);
 		}
 	}
 
 	switch (command) {
 	case NANDFS_IOCTL_GET_FSINFO:
 		error = nandfs_get_fsinfo(nmp, (struct nandfs_fsinfo *)data);
 		break;
 	case NANDFS_IOCTL_GET_SUSTAT:
 		nss = (struct nandfs_seg_stat *)data;
 		error = nandfs_get_seg_stat(nandfsdev, nss);
 		break;
 	case NANDFS_IOCTL_CHANGE_CPMODE:
 		ncpm = (struct nandfs_cpmode *)data;
 		error = nandfs_chng_cpmode(nandfsdev->nd_cp_node, ncpm);
 		nandfs_force_syncer(nmp);
 		break;
 	case NANDFS_IOCTL_GET_CPINFO:
 		nargv = (struct nandfs_argv *)data;
 		error = nandfs_get_cpinfo_ioctl(nandfsdev->nd_cp_node, nargv);
 		break;
 	case NANDFS_IOCTL_DELETE_CP:
 		tab = (uint64_t *)data;
 		error = nandfs_delete_cp(nandfsdev->nd_cp_node, tab[0], tab[1]);
 		nandfs_force_syncer(nmp);
 		break;
 	case NANDFS_IOCTL_GET_CPSTAT:
 		ncp = (struct nandfs_cpstat *)data;
 		error = nandfs_get_cpstat(nandfsdev->nd_cp_node, ncp);
 		break;
 	case NANDFS_IOCTL_GET_SUINFO:
 		nargv = (struct nandfs_argv *)data;
 		error = nandfs_get_segment_info_ioctl(nandfsdev, nargv);
 		break;
 	case NANDFS_IOCTL_GET_VINFO:
 		nargv = (struct nandfs_argv *)data;
 		error = nandfs_get_dat_vinfo_ioctl(nandfsdev, nargv);
 		break;
 	case NANDFS_IOCTL_GET_BDESCS:
 		nargv = (struct nandfs_argv *)data;
 		error = nandfs_get_dat_bdescs_ioctl(nandfsdev, nargv);
 		break;
 	case NANDFS_IOCTL_SYNC:
 		cno = (uint64_t *)data;
 		nandfs_force_syncer(nmp);
 		*cno = nandfsdev->nd_last_cno;
 		error = 0;
 		break;
 	case NANDFS_IOCTL_MAKE_SNAP:
 		cno = (uint64_t *)data;
 		error = nandfs_make_snap(nandfsdev, cno);
 		nandfs_force_syncer(nmp);
 		break;
 	case NANDFS_IOCTL_DELETE_SNAP:
 		cno = (uint64_t *)data;
 		error = nandfs_delete_snap(nandfsdev, *cno);
 		nandfs_force_syncer(nmp);
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Whiteout vnode call
  *
  * LOOKUP simply returns 0.  CREATE adds a directory entry of type DT_WHT
  * with inode NANDFS_WHT_INO under the given name.  DELETE clears DOWHITEOUT
  * in the component name and removes the entry.  Any other operation
  * panics.
  */
 static int
 nandfs_whiteout(struct vop_whiteout_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp;
 	int op;
 
 	dvp = ap->a_dvp;
 	cnp = ap->a_cnp;
 	op = ap->a_flags;
 
 	if (op == LOOKUP)
 		return (0);
 
 	if (op == CREATE) {
 		/* Create a new directory whiteout */
 #ifdef INVARIANTS
 		if ((cnp->cn_flags & SAVENAME) == 0)
 			panic("nandfs_whiteout: missing name");
 #endif
 		return (nandfs_add_dirent(dvp, NANDFS_WHT_INO,
 		    cnp->cn_nameptr, cnp->cn_namelen, DT_WHT));
 	}
 
 	if (op == DELETE) {
 		/* Remove an existing directory whiteout */
 		cnp->cn_flags &= ~DOWHITEOUT;
 		return (nandfs_remove_dirent(dvp, NULL, cnp));
 	}
 
 	panic("nandf_whiteout: unknown op: %d", ap->a_flags);
 
 	return (0);
 }
 
 static int
 nandfs_pathconf(struct vop_pathconf_args *ap)
 {
 	int error;
 
 	error = 0;
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = NANDFS_LINK_MAX;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = NANDFS_NAME_LEN;
 		break;
 	case _PC_PIPE_BUF:
 		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
 			*ap->a_retval = PIPE_BUF;
 		else
 			error = EINVAL;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		break;
 	case _PC_ACL_EXTENDED:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1; /* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	default:
 		error = vop_stdpathconf(ap);
 		break;
 	}
 	return (error);
 }
 
 /*
  * VOP_LOCK1: take the lock acquired by NANDFS_WRITELOCKFLAGS() on the
  * node's nandfs device first, then the regular vnode lock through
  * vop_stdlock().
  *
  * If the vnode interlock is owned on entry it is dropped around the first
  * acquisition and re-taken only when that acquisition succeeded.  When
  * vop_stdlock() fails, the lock taken in the first step is released again.
  * Returns 0 on success or the error from either step.
  */
 static int
 nandfs_vnlock1(struct vop_lock1_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nandfs_node *node = VTON(vp);
 	int error, vi_locked;
 
 	/*
 	 * XXX can vnode go away while we are sleeping?
 	 */
 	vi_locked = mtx_owned(&vp->v_interlock);
 	if (vi_locked)
 		VI_UNLOCK(vp);
 	/* Only LK_NOWAIT is forwarded from the caller's lock flags. */
 	error = NANDFS_WRITELOCKFLAGS(node->nn_nandfsdev,
 	    ap->a_flags & LK_NOWAIT);
 	/* On failure the interlock stays dropped. */
 	if (vi_locked && !error)
 		VI_LOCK(vp);
 	if (error)
 		return (error);
 
 	error = vop_stdlock(ap);
 	if (error) {
 		/* Undo the first step so no lock is left held. */
 		NANDFS_WRITEUNLOCK(node->nn_nandfsdev);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * VOP_UNLOCK: release the vnode lock through vop_stdunlock() and, when
  * that succeeded, call NANDFS_WRITEUNLOCK() on the node's nandfs device.
  * Returns the result of vop_stdunlock().
  */
 static int
 nandfs_vnunlock(struct vop_unlock_args *ap)
 {
 	struct nandfs_node *node;
 	int error;
 
 	/* Look the node up before the vnode lock is given away. */
 	node = VTON(ap->a_vp);
 
 	error = vop_stdunlock(ap);
 	if (error == 0) {
 		NANDFS_WRITEUNLOCK(node->nn_nandfsdev);
 	}
 
 	return (error);
 }
 
 /*
  * Global vfs data structures
  */
 struct vop_vector nandfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		nandfs_access,
 	.vop_advlock =		nandfs_advlock,
 	.vop_bmap =		nandfs_bmap,
 	.vop_close =		nandfs_close,
 	.vop_create =		nandfs_create,
 	.vop_fsync =		nandfs_fsync,
 	.vop_getattr =		nandfs_getattr,
 	.vop_inactive =		nandfs_inactive,
 	.vop_cachedlookup =	nandfs_lookup,
 	.vop_ioctl =		nandfs_ioctl,
 	.vop_link =		nandfs_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		nandfs_mkdir,
 	.vop_mknod =		nandfs_mknod,
 	.vop_open =		nandfs_open,
 	.vop_pathconf =		nandfs_pathconf,
 	.vop_print =		nandfs_print,
 	.vop_read =		nandfs_read,
 	.vop_readdir =		nandfs_readdir,
 	.vop_readlink =		nandfs_readlink,
 	.vop_reclaim =		nandfs_reclaim,
 	.vop_remove =		nandfs_remove,
 	.vop_rename =		nandfs_rename,
 	.vop_rmdir =		nandfs_rmdir,
 	.vop_whiteout =		nandfs_whiteout,
 	.vop_write =		nandfs_write,
 	.vop_setattr =		nandfs_setattr,
 	.vop_strategy =		nandfs_strategy,
 	.vop_symlink =		nandfs_symlink,
 	.vop_lock1 =		nandfs_vnlock1,
 	.vop_unlock =		nandfs_vnunlock,
 };
 
 /*
  * Vnode operations vector with only a handful of operations implemented;
  * the listed name space and file I/O entry points are wired to VOP_PANIC.
  */
 struct vop_vector nandfs_system_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	/* Implemented operations. */
 	.vop_bmap =		nandfs_bmap,
 	.vop_close =		nandfs_close,
 	.vop_fsync =		nandfs_fsync,
 	.vop_inactive =		nandfs_inactive,
 	.vop_reclaim =		nandfs_reclaim,
 	.vop_strategy =		nandfs_strategy,
 
 	/* Operations that must never be invoked on these vnodes. */
 	.vop_access =		VOP_PANIC,
 	.vop_advlock =		VOP_PANIC,
 	.vop_cachedlookup =	VOP_PANIC,
 	.vop_create =		VOP_PANIC,
 	.vop_getattr =		VOP_PANIC,
 	.vop_ioctl =		VOP_PANIC,
 	.vop_link =		VOP_PANIC,
 	.vop_lookup =		VOP_PANIC,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		VOP_PANIC,
 	.vop_pathconf =		VOP_PANIC,
 	.vop_print =		VOP_PANIC,
 	.vop_read =		VOP_PANIC,
 	.vop_readdir =		VOP_PANIC,
 	.vop_readlink =		VOP_PANIC,
 	.vop_remove =		VOP_PANIC,
 	.vop_rename =		VOP_PANIC,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		VOP_PANIC,
 	.vop_symlink =		VOP_PANIC,
 	.vop_whiteout =		VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 };
 
 /*
  * VOP_CLOSE for FIFOs: while holding the vnode interlock, call
  * nandfs_itimes_locked() if the vnode has more than one user, then hand
  * the close over to fifo_specops.
  */
 static int
 nandfsfifo_close(struct vop_close_args *ap)
 {
 	struct nandfs_node *node;
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	node = VTON(vp);
 
 	DPRINTF(VNCALL, ("%s: vp %p node %p\n", __func__, vp, node));
 
 	mtx_lock(&vp->v_interlock);
 	if (vp->v_usecount > 1) {
 		nandfs_itimes_locked(vp);
 	}
 	mtx_unlock(&vp->v_interlock);
 
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Vnode operations vector for FIFOs stored on nandfs; operations not
  * listed fall back to fifo_specops.
  */
 struct vop_vector nandfs_fifoops = {
 	.vop_default =		&fifo_specops,
 
 	/* Handled by nandfs. */
 	.vop_access =		nandfs_access,
 	.vop_close =		nandfsfifo_close,
 	.vop_getattr =		nandfs_getattr,
 	.vop_inactive =		nandfs_inactive,
 	.vop_pathconf =		nandfs_pathconf,
 	.vop_print =		nandfs_print,
 	.vop_reclaim =		nandfs_reclaim,
 	.vop_setattr =		nandfs_setattr,
 	.vop_lock1 =		nandfs_vnlock1,
 	.vop_unlock =		nandfs_vnunlock,
 
 	/* Must never be invoked on a FIFO vnode. */
 	.vop_fsync =		VOP_PANIC,
 	.vop_read =		VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 };
 
 /*
  * Finish setting up a locked vnode for inode number ino: mark the root
  * vnode with VV_ROOT, derive v_type from the inode mode (the GC inode is
  * always VREG) and switch FIFOs over to nandfs_fifoops.  Always returns 0.
  */
 int
 nandfs_vinit(struct vnode *vp, uint64_t ino)
 {
 	struct nandfs_node *node;
 
 	ASSERT_VOP_LOCKED(vp, __func__);
 
 	node = VTON(vp);
 
 	/* Check if we're fetching the root */
 	if (ino == NANDFS_ROOT_INO)
 		vp->v_vflag |= VV_ROOT;
 
 	/* The GC inode's mode is not consulted. */
 	if (ino == NANDFS_GC_INO)
 		vp->v_type = VREG;
 	else
 		vp->v_type = IFTOVT(node->nn_inode.i_mode);
 
 	if (vp->v_type == VFIFO)
 		vp->v_op = &nandfs_fifoops;
 
 	return (0);
 }
Index: stable/12/sys/fs/pseudofs/pseudofs_vnops.c
===================================================================
--- stable/12/sys/fs/pseudofs/pseudofs_vnops.c	(revision 340720)
+++ stable/12/sys/fs/pseudofs/pseudofs_vnops.c	(revision 340721)
@@ -1,1061 +1,1063 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_pseudofs.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/ctype.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <fs/pseudofs/pseudofs.h>
 #include <fs/pseudofs/pseudofs_internal.h>
 
 /*
  * Consistency assertions: each one checks that the pfs_node behind a vnode
  * has a type matching the vnode type named in its message (VDIR, VREG and
  * VLNK respectively).  The handlers in this file use them right after
  * testing vn->v_type.
  */
 #define KASSERT_PN_IS_DIR(pn)						\
 	KASSERT((pn)->pn_type == pfstype_root ||			\
 	    (pn)->pn_type == pfstype_dir ||				\
 	    (pn)->pn_type == pfstype_procdir,				\
 	    ("%s(): VDIR vnode refers to non-directory pfs_node", __func__))
 
 #define KASSERT_PN_IS_FILE(pn)						\
 	KASSERT((pn)->pn_type == pfstype_file,				\
 	    ("%s(): VREG vnode refers to non-file pfs_node", __func__))
 
 #define KASSERT_PN_IS_LINK(pn)						\
 	KASSERT((pn)->pn_type == pfstype_symlink,			\
 	    ("%s(): VLNK vnode refers to non-link pfs_node", __func__))
 
 /*
  * Compute the file number reported for a node.
  *
  * Nodes without a process affinity (pid == NO_PID) report their own
  * pn_fileno; otherwise the pid is folded in as pn_fileno * NO_PID + pid.
  * The node must already have a file number allocated.
  */
 static uint32_t
 pn_fileno(struct pfs_node *pn, pid_t pid)
 {
 
 	KASSERT(pn->pn_fileno > 0,
 	    ("%s(): no fileno allocated", __func__));
 	if (pid == NO_PID)
 		return (pn->pn_fileno);
 	return (pn->pn_fileno * NO_PID + pid);
 }
 
 /*
  * Decide whether node 'pn', in the context of process 'proc', may be shown
  * to thread 'td'.  Returns 1 if visible and 0 otherwise.
  *
  * A NULL process is never visible.  Otherwise the process lock must be
  * held, and the node is hidden if the process has P_WEXIT set, if
  * p_cansee() refuses the thread, or if the node's own visibility hook
  * (when present) returns zero.
  */
 static int
 pfs_visible_proc(struct thread *td, struct pfs_node *pn, struct proc *proc)
 {
 
 	if (proc == NULL)
 		return (0);
 
 	PROC_LOCK_ASSERT(proc, MA_OWNED);
 
 	if ((proc->p_flag & P_WEXIT) != 0)
 		return (0);
 	if (p_cansee(td, proc) != 0)
 		return (0);
 	/* The per-node hook is optional and is consulted last. */
 	if (pn->pn_vis != NULL && !pn_vis(td, proc, pn))
 		return (0);
 	return (1);
 }
 
 /*
  * Returns non-zero if node 'pn', for process id 'pid', is visible to 'td'.
  *
  * Nodes with pid == NO_PID are always visible.  Otherwise the process is
  * looked up with pfind_locked() when the caller already holds the allproc
  * lock (allproc_locked) or with pfind() when it does not, and the decision
  * is delegated to pfs_visible_proc().
  *
  * If 'p' is non-NULL it is set to NULL first; when the node is visible and
  * has a process, *p receives that process with its lock still held, and the
  * caller must PROC_UNLOCK() it.  In every other case the process lock is
  * released before returning.
  */
 static int
 pfs_visible(struct thread *td, struct pfs_node *pn, pid_t pid,
     bool allproc_locked, struct proc **p)
 {
 	struct proc *proc;
 
 	PFS_TRACE(("%s (pid: %d, req: %d)",
 	    pn->pn_name, pid, td->td_proc->p_pid));
 
 	if (p)
 		*p = NULL;
 	if (pid == NO_PID)
 		PFS_RETURN (1);
 	proc = allproc_locked ? pfind_locked(pid) : pfind(pid);
 	if (proc == NULL)
 		PFS_RETURN (0);
 	if (pfs_visible_proc(td, pn, proc)) {
 		/* Hand the locked process to the caller if it asked for it. */
 		if (p)
 			*p = proc;
 		else
 			PROC_UNLOCK(proc);
 		PFS_RETURN (1);
 	}
 	PROC_UNLOCK(proc);
 	PFS_RETURN (0);
 }
 
 /*
  * Verify permissions
  *
  * Fetches the vnode's attributes with VOP_GETATTR() and lets vaccess()
  * check the requested access mode against the resulting mode, uid and gid.
  * Returns the VOP_GETATTR() error if that fails, otherwise the result of
  * vaccess().
  */
 static int
 pfs_access(struct vop_access_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct vattr vattr;
 	int error;
 
 	PFS_TRACE(("%s", pvd->pvd_pn->pn_name));
 	/* pvd is otherwise only referenced by the trace statement above. */
 	(void)pvd;
 
 	error = VOP_GETATTR(vn, &vattr, va->a_cred);
 	if (error)
 		PFS_RETURN (error);
 	error = vaccess(vn->v_type, vattr.va_mode, vattr.va_uid,
 	    vattr.va_gid, va->a_accmode, va->a_cred, NULL);
 	PFS_RETURN (error);
 }
 
 /*
  * Close a file or directory
  *
  * The node's close handler is invoked only on the last close, i.e. when
  * vrefcnt() is not greater than 1, and only if the node has one.  For a
  * process-bound node the process is looked up with pfind(); the handler is
  * called with proc == NULL if the node has no pid or the lookup fails.
  * Returns 0 when there is nothing to do, otherwise the handler's result.
  */
 static int
 pfs_close(struct vop_close_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct proc *proc;
 	int error;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	/*
 	 * Do nothing unless this is the last close and the node has a
 	 * last-close handler.
 	 */
 	if (vrefcnt(vn) > 1 || pn->pn_close == NULL)
 		PFS_RETURN (0);
 
 	if (pvd->pvd_pid != NO_PID) {
 		proc = pfind(pvd->pvd_pid);
 	} else {
 		proc = NULL;
 	}
 
 	error = pn_close(va->a_td, proc, pn);
 
 	/* A process found by pfind() is unlocked here. */
 	if (proc != NULL)
 		PROC_UNLOCK(proc);
 
 	PFS_RETURN (error);
 }
 
 /*
  * Get file attributes
  *
  * Returns ENOENT if the node is not visible to the current thread.
  * Otherwise the attributes are synthesised: size and byte counts are zero,
  * the link count is 1, all three timestamps are the current time,
  * directories get mode 0555 and files and symlinks get mode 0444.  The
  * owner is taken from the real uid/gid of the associated process, or is
  * 0/0 when there is none.  A per-node attribute hook, if present, may then
  * adjust the result; its return value becomes the return value.
  */
 static int
 pfs_getattr(struct vop_getattr_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct vattr *vap = va->a_vap;
 	struct proc *proc;
 	int error = 0;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	/* On success, proc (if not NULL) is handed back with its lock held. */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
 		PFS_RETURN (ENOENT);
 
 	vap->va_type = vn->v_type;
 	vap->va_fileid = pn_fileno(pn, pvd->pvd_pid);
 	vap->va_flags = 0;
 	vap->va_blocksize = PAGE_SIZE;
 	vap->va_bytes = vap->va_size = 0;
 	vap->va_filerev = 0;
 	vap->va_fsid = vn->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_nlink = 1;
 	nanotime(&vap->va_ctime);
 	vap->va_atime = vap->va_mtime = vap->va_ctime;
 
 	switch (pn->pn_type) {
 	case pfstype_procdir:
 	case pfstype_root:
 	case pfstype_dir:
 #if 0
 		pfs_lock(pn);
 		/* compute link count */
 		pfs_unlock(pn);
 #endif
 		vap->va_mode = 0555;
 		break;
 	case pfstype_file:
 	case pfstype_symlink:
 		vap->va_mode = 0444;
 		break;
 	default:
 		printf("shouldn't be here!\n");
 		vap->va_mode = 0;
 		break;
 	}
 
 	if (proc != NULL) {
 		vap->va_uid = proc->p_ucred->cr_ruid;
 		vap->va_gid = proc->p_ucred->cr_rgid;
 	} else {
 		vap->va_uid = 0;
 		vap->va_gid = 0;
 	}
 
 	/* Let the node override the defaults chosen above. */
 	if (pn->pn_attr != NULL)
 		error = pn_attr(curthread, proc, pn, vap);
 
 	if(proc != NULL)
 		PROC_UNLOCK(proc);
 
 	PFS_RETURN (error);
 }
 
 /*
  * Perform an ioctl
  *
  * The vnode is locked shared for the duration of the call.  Returns EBADF
  * if the vnode has been doomed, EINVAL if it is not a regular file, ENOTTY
  * if the node has no ioctl handler and EIO if the node is no longer
  * visible to the caller; otherwise the handler's result is returned.
  */
 static int
 pfs_ioctl(struct vop_ioctl_args *va)
 {
 	struct vnode *vn;
 	struct pfs_vdata *pvd;
 	struct pfs_node *pn;
 	struct proc *proc;
 	int error;
 
 	vn = va->a_vp;
 	vn_lock(vn, LK_SHARED | LK_RETRY);
 	/* v_data is only dereferenced once the vnode is known not to be doomed. */
 	if (vn->v_iflag & VI_DOOMED) {
 		VOP_UNLOCK(vn, 0);
 		return (EBADF);
 	}
 	pvd = vn->v_data;
 	pn = pvd->pvd_pn;
 
 	PFS_TRACE(("%s: %lx", pn->pn_name, va->a_command));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VREG) {
 		VOP_UNLOCK(vn, 0);
 		PFS_RETURN (EINVAL);
 	}
 	KASSERT_PN_IS_FILE(pn);
 
 	if (pn->pn_ioctl == NULL) {
 		VOP_UNLOCK(vn, 0);
 		PFS_RETURN (ENOTTY);
 	}
 
 	/*
 	 * This is necessary because process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) {
 		VOP_UNLOCK(vn, 0);
 		PFS_RETURN (EIO);
 	}
 
 	/* The handler runs with the process (if any) still locked. */
 	error = pn_ioctl(curthread, proc, pn, va->a_command, va->a_data);
 
 	if (proc != NULL)
 		PROC_UNLOCK(proc);
 
 	VOP_UNLOCK(vn, 0);
 	PFS_RETURN (error);
 }
 
 /*
  * Perform getextattr
  *
  * Returns EIO if the node is no longer visible to the caller and
  * EOPNOTSUPP if the node has no getextattr handler; otherwise the
  * handler's result is returned.
  */
 static int
 pfs_getextattr(struct vop_getextattr_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct proc *proc;
 	int error;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	/*
 	 * This is necessary because either process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
 		PFS_RETURN (EIO);
 
 	/* The handler check comes after the visibility check, so EIO wins. */
 	if (pn->pn_getextattr == NULL)
 		error = EOPNOTSUPP;
 	else
 		error = pn_getextattr(curthread, proc, pn,
 		    va->a_attrnamespace, va->a_name, va->a_uio,
 		    va->a_size, va->a_cred);
 
 	if (proc != NULL)
 		PROC_UNLOCK(proc);
 
 	PFS_RETURN (error);
 }
 
 /*
  * Convert a vnode to its component name
  *
  * The name is copied in front of the current end position, i.e. to
  * buf + *buflen - len, and on success *buflen is lowered by that length.
  * For a procdir the name is the decimal pid; for any other node it is
  * pn_name.  The root directory is its own parent: it is returned in *dvp
  * with a hold reference and neither the buffer nor *buflen is touched.
  * For all other nodes the parent's vnode is obtained from
  * pfs_vncache_alloc() and returned unlocked in *dvp.
  *
  * Returns ENOMEM if the name does not fit, or the error from vfs_busy() or
  * pfs_vncache_alloc().
  */
 static int
 pfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct pfs_vdata *pvd = vp->v_data;
 	struct pfs_node *pd = pvd->pvd_pn;
 	struct pfs_node *pn;
 	struct mount *mp;
 	char *buf = ap->a_buf;
 	int *buflen = ap->a_buflen;
 	char pidbuf[PFS_NAMELEN];
 	pid_t pid = pvd->pvd_pid;
 	int len, i, error, locked;
 
 	i = *buflen;
 	error = 0;
 
 	pfs_lock(pd);
 
 	if (vp->v_type == VDIR && pd->pn_type == pfstype_root) {
 		*dvp = vp;
 		vhold(*dvp);
 		pfs_unlock(pd);
 		PFS_RETURN (0);
 	} else if (vp->v_type == VDIR && pd->pn_type == pfstype_procdir) {
 		len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid);
 		i -= len;
 		if (i < 0) {
 			error = ENOMEM;
 			goto failed;
 		}
 		bcopy(pidbuf, buf + i, len);
 	} else {
 		len = strlen(pd->pn_name);
 		i -= len;
 		if (i < 0) {
 			error = ENOMEM;
 			goto failed;
 		}
 		bcopy(pd->pn_name, buf + i, len);
 	}
 
 	pn = pd->pn_parent;
 	pfs_unlock(pd);
 
 	/* The mount is kept busy while vp is unlocked below. */
 	mp = vp->v_mount;
 	error = vfs_busy(mp, 0);
 	if (error)
 		return (error);
 
 	/*
 	 * vp is held by caller.
 	 */
 	locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 
 	/*
 	 * NOTE(review): unlike the ISDOTDOT path of pfs_lookup(), pid is not
 	 * reset to NO_PID here when pd is a procdir -- confirm that
 	 * pfs_vncache_alloc() copes with that.
 	 */
 	error = pfs_vncache_alloc(mp, dvp, pn, pid);
 	if (error) {
 		vn_lock(vp, locked | LK_RETRY);
 		vfs_unbusy(mp);
 		PFS_RETURN(error);
 	}
 
 	/* Only commit the new buffer position once the parent is known. */
 	*buflen = i;
 	VOP_UNLOCK(*dvp, 0);
 	vn_lock(vp, locked | LK_RETRY);
 	vfs_unbusy(mp);
 
 	PFS_RETURN (0);
 failed:
 	pfs_unlock(pd);
 	PFS_RETURN(error);
 }
 
 /*
  * Look up a file or directory
  *
  * Resolves one path component relative to the directory vnode a_dvp and
  * returns the resulting vnode in *a_vpp.  Three cases are handled: "."
  * (the directory itself, with an extra reference), ".." (the parent node)
  * and a named child.  A child is either a node whose pn_name matches, or,
  * if the directory contains a procdir node, an all-digit name that is
  * parsed as a pid.
  *
  * Returns ENOTDIR if a_dvp is not a directory, the VOP_ACCESS() error if
  * search permission is denied, EOPNOTSUPP for DELETE or RENAME on the last
  * component, EIO for ".." of the root, ENOENT if nothing matches or the
  * target is not visible, or the error from pfs_vncache_alloc().
  */
 static int
 pfs_lookup(struct vop_cachedlookup_args *va)
 {
 	struct vnode *vn = va->a_dvp;
 	struct vnode **vpp = va->a_vpp;
 	struct componentname *cnp = va->a_cnp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pd = pvd->pvd_pn;
 	struct pfs_node *pn, *pdn = NULL;
 	struct mount *mp;
 	pid_t pid = pvd->pvd_pid;
 	char *pname;
 	int error, i, namelen, visible;
 
 	PFS_TRACE(("%.*s", (int)cnp->cn_namelen, cnp->cn_nameptr));
 	pfs_assert_not_owned(pd);
 
 	if (vn->v_type != VDIR)
 		PFS_RETURN (ENOTDIR);
 	KASSERT_PN_IS_DIR(pd);
 
 	error = VOP_ACCESS(vn, VEXEC, cnp->cn_cred, cnp->cn_thread);
 	if (error)
 		PFS_RETURN (error);
 
 	/*
 	 * Don't support DELETE or RENAME.  CREATE is supported so
 	 * that O_CREAT will work, but the lookup will still fail if
 	 * the file does not exist.
 	 */
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		PFS_RETURN (EOPNOTSUPP);
 
 	/* shortcut: check if the name is too long */
 	if (cnp->cn_namelen >= PFS_NAMELEN)
 		PFS_RETURN (ENOENT);
 
 	/* check that parent directory is visible... */
 	if (!pfs_visible(curthread, pd, pvd->pvd_pid, false, NULL))
 		PFS_RETURN (ENOENT);
 
 	/* self */
 	namelen = cnp->cn_namelen;
 	pname = cnp->cn_nameptr;
 	if (namelen == 1 && pname[0] == '.') {
 		pn = pd;
 		*vpp = vn;
 		VREF(vn);
 		PFS_RETURN (0);
 	}
 
 	mp = vn->v_mount;
 
 	/* parent */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (pd->pn_type == pfstype_root)
 			PFS_RETURN (EIO);
 		/*
 		 * Busy the mount before unlocking the directory.  If that
 		 * cannot be done without sleeping, drop the vnode lock
 		 * first and recheck that the vnode was not doomed meanwhile.
 		 */
 		error = vfs_busy(mp, MBF_NOWAIT);
 		if (error != 0) {
 			vfs_ref(mp);
 			VOP_UNLOCK(vn, 0);
 			error = vfs_busy(mp, 0);
 			vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
 			vfs_rel(mp);
 			if (error != 0)
 				PFS_RETURN(ENOENT);
 			if (vn->v_iflag & VI_DOOMED) {
 				vfs_unbusy(mp);
 				PFS_RETURN(ENOENT);
 			}
 		}
 		VOP_UNLOCK(vn, 0);
 		KASSERT(pd->pn_parent != NULL,
 		    ("%s(): non-root directory has no parent", __func__));
 		/*
 		 * This one is tricky.  Descendents of procdir nodes
 		 * inherit their parent's process affinity, but
 		 * there's no easy reverse mapping.  For simplicity,
 		 * we assume that if this node is a procdir, its
 		 * parent isn't (which is correct as long as
 		 * descendents of procdir nodes are never procdir
 		 * nodes themselves)
 		 */
 		if (pd->pn_type == pfstype_procdir)
 			pid = NO_PID;
 		pfs_lock(pd);
 		pn = pd->pn_parent;
 		pfs_unlock(pd);
 		goto got_pnode;
 	}
 
 	pfs_lock(pd);
 
 	/* named node */
 	/* A procdir child is remembered in pdn instead of being matched by name. */
 	for (pn = pd->pn_nodes; pn != NULL; pn = pn->pn_next)
 		if (pn->pn_type == pfstype_procdir)
 			pdn = pn;
 		else if (pn->pn_name[namelen] == '\0' &&
 		    bcmp(pname, pn->pn_name, namelen) == 0) {
 			pfs_unlock(pd);
 			goto got_pnode;
 		}
 
 	/* process dependent node */
 	if ((pn = pdn) != NULL) {
 		pid = 0;
 		/* Parse a decimal pid; stop early once it exceeds PID_MAX. */
 		for (pid = 0, i = 0; i < namelen && isdigit(pname[i]); ++i)
 			if ((pid = pid * 10 + pname[i] - '0') > PID_MAX)
 				break;
 		/* Only accept the name if every character was consumed. */
 		if (i == cnp->cn_namelen) {
 			pfs_unlock(pd);
 			goto got_pnode;
 		}
 	}
 
 	pfs_unlock(pd);
 
 	PFS_RETURN (ENOENT);
 
  got_pnode:
 	pfs_assert_not_owned(pd);
 	pfs_assert_not_owned(pn);
 	visible = pfs_visible(curthread, pn, pid, false, NULL);
 	if (!visible) {
 		error = ENOENT;
 		goto failed;
 	}
 
 	error = pfs_vncache_alloc(mp, vpp, pn, pid);
 	if (error)
 		goto failed;
 
 	/* For "..", relock the directory that was unlocked above. */
 	if (cnp->cn_flags & ISDOTDOT) {
 		vfs_unbusy(mp);
 		vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
 		if (vn->v_iflag & VI_DOOMED) {
 			vput(*vpp);
 			*vpp = NULL;
 			PFS_RETURN(ENOENT);
 		}
 	}
 	if (cnp->cn_flags & MAKEENTRY && !(vn->v_iflag & VI_DOOMED))
 		cache_enter(vn, *vpp, cnp);
 	PFS_RETURN (0);
  failed:
 	/* Undo the ".." preparations: unbusy the mount, relock the directory. */
 	if (cnp->cn_flags & ISDOTDOT) {
 		vfs_unbusy(mp);
 		vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
 		*vpp = NULL;
 	}
 	PFS_RETURN(error);
 }
 
 /*
  * Open a file or directory.
  *
  * Returns EPERM if the mode check below fails, EOPNOTSUPP if O_SHLOCK or
  * O_EXLOCK is requested, and 0 otherwise.
  */
 static int
 pfs_open(struct vop_open_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	int mode = va->a_mode;
 
 	PFS_TRACE(("%s (mode 0x%x)", pn->pn_name, mode));
 	pfs_assert_not_owned(pn);
 
 	/* check if the requested mode is permitted */
 	/*
 	 * NOTE(review): both halves of this test inspect only 'mode'; the
 	 * node's pn_flags are not consulted here, whereas pfs_read() and
 	 * pfs_write() test pn->pn_flags against PFS_RD / PFS_WR.  Confirm
 	 * that this is intended.
 	 */
 	if (((mode & FREAD) && !(mode & PFS_RD)) ||
 	    ((mode & FWRITE) && !(mode & PFS_WR)))
 		PFS_RETURN (EPERM);
 
 	/* we don't support locking */
 	if ((mode & O_SHLOCK) || (mode & O_EXLOCK))
 		PFS_RETURN (EOPNOTSUPP);
 
 	PFS_RETURN (0);
 }
 
 /*
  * Read from a file
  *
  * Returns EINVAL if the vnode is not a regular file, EBADF if the node is
  * not readable (PFS_RD clear) and EIO if it has no fill handler or is no
  * longer visible to the caller.
  *
  * The fill handler is called with the vnode unlocked (a hold reference
  * keeps it around) and with the target process held rather than locked.
  * Nodes flagged PFS_RAWRD get the uio directly.  For all others the
  * handler writes into an sbuf of at most MAXPHYS bytes, which is then
  * copied to the caller with uiomove_frombuf().
  */
 static int
 pfs_read(struct vop_read_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct uio *uio = va->a_uio;
 	struct proc *proc;
 	struct sbuf *sb = NULL;
 	int error, locked;
 	off_t buflen;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VREG)
 		PFS_RETURN (EINVAL);
 	KASSERT_PN_IS_FILE(pn);
 
 	if (!(pn->pn_flags & PFS_RD))
 		PFS_RETURN (EBADF);
 
 	if (pn->pn_fill == NULL)
 		PFS_RETURN (EIO);
 
 	/*
 	 * This is necessary because either process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
 		PFS_RETURN (EIO);
 	/* Trade the process lock for a hold while the handler runs. */
 	if (proc != NULL) {
 		_PHOLD(proc);
 		PROC_UNLOCK(proc);
 	}
 
 	/* Remember the lock state so that it can be restored at 'ret'. */
 	vhold(vn);
 	locked = VOP_ISLOCKED(vn);
 	VOP_UNLOCK(vn, 0);
 
 	if (pn->pn_flags & PFS_RAWRD) {
 		PFS_TRACE(("%zd resid", uio->uio_resid));
 		error = pn_fill(curthread, proc, pn, NULL, uio);
 		PFS_TRACE(("%zd resid", uio->uio_resid));
 		goto ret;
 	}
 
 	/* Reject requests whose end offset would not fit in an off_t. */
 	if (uio->uio_resid < 0 || uio->uio_offset < 0 ||
 	    uio->uio_resid > OFF_MAX - uio->uio_offset) {
 		error = EINVAL;
 		goto ret;
 	}
 	buflen = uio->uio_offset + uio->uio_resid;
 	if (buflen > MAXPHYS)
 		buflen = MAXPHYS;
 
 	sb = sbuf_new(sb, NULL, buflen + 1, 0);
 	if (sb == NULL) {
 		error = EIO;
 		goto ret;
 	}
 
 	error = pn_fill(curthread, proc, pn, sb, uio);
 
 	if (error) {
 		sbuf_delete(sb);
 		goto ret;
 	}
 
 	/*
 	 * XXX: If the buffer overflowed, sbuf_len() will not return
 	 * the data length. Then just use the full length because an
 	 * overflowed sbuf must be full.
 	 */
 	if (sbuf_finish(sb) == 0)
 		buflen = sbuf_len(sb);
 	error = uiomove_frombuf(sbuf_data(sb), buflen, uio);
 	sbuf_delete(sb);
 ret:
 	/* Restore the vnode lock and drop the vnode and process holds. */
 	vn_lock(vn, locked | LK_RETRY);
 	vdrop(vn);
 	if (proc != NULL)
 		PRELE(proc);
 	PFS_RETURN (error);
 }
 
 /*
  * Iterate through directory entries
  *
  * Advances the cursor (*pn, *p) to the next visible entry of directory
  * 'pd'.  Start with *pn == NULL and *p == NULL.  A procdir node expands to
  * one entry per process on the allproc list, with *p naming the process
  * for the current entry; for every other node, visibility is checked
  * against 'proc', the process the directory itself belongs to (if any).
  *
  * The caller must hold allproc_lock shared and own the lock of 'pd'.  Any
  * process lock taken here is released again before returning, so *p is
  * returned unlocked.
  *
  * Returns 0 if the cursor points at an entry and -1 when the directory is
  * exhausted.
  */
 static int
 pfs_iterate(struct thread *td, struct proc *proc, struct pfs_node *pd,
 	    struct pfs_node **pn, struct proc **p)
 {
 	int visible;
 
 	sx_assert(&allproc_lock, SX_SLOCKED);
 	pfs_assert_owned(pd);
  again:
 	if (*pn == NULL) {
 		/* first node */
 		*pn = pd->pn_nodes;
 	} else if ((*pn)->pn_type != pfstype_procdir) {
 		/* next node */
 		*pn = (*pn)->pn_next;
 	}
 	/* A procdir node is only left once the process list is exhausted. */
 	if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) {
 		/* next process */
 		if (*p == NULL)
 			*p = LIST_FIRST(&allproc);
 		else
 			*p = LIST_NEXT(*p, p_list);
 		/* out of processes: next node */
 		if (*p == NULL)
 			*pn = (*pn)->pn_next;
 		else
 			PROC_LOCK(*p);
 	}
 
 	if ((*pn) == NULL)
 		return (-1);
 
 	if (*p != NULL) {
 		visible = pfs_visible_proc(td, *pn, *p);
 		PROC_UNLOCK(*p);
 	} else if (proc != NULL) {
 		visible = pfs_visible_proc(td, *pn, proc);
 	} else {
 		visible = 1;
 	}
 	/* Skip entries the caller is not allowed to see. */
 	if (!visible)
 		goto again;
 
 	return (0);
 }
 
 /*
  * Directory entry list
  *
  * pfs_readdir() collects entries on this list while its locks are held and
  * copies them out to the caller after the locks have been released.
  */
 struct pfsentry {
 	STAILQ_ENTRY(pfsentry)	link;	/* list linkage */
 	struct dirent		entry;	/* the entry to copy out */
 };
 STAILQ_HEAD(pfsdirentlist, pfsentry);
 
 /*
  * Return directory entries.
  *
  * Every entry occupies exactly PFS_DELEN bytes, so uio_offset must be a
  * non-negative multiple of PFS_DELEN and a non-zero uio_resid must have
  * room for at least one entry; otherwise EINVAL is returned.  Entries are
  * first collected on a private list while allproc_lock and the directory
  * node are locked, and are copied out with uiomove() only after those
  * locks have been dropped.
  *
  * Returns 0 on success (including end of directory), ENOTDIR if the vnode
  * is not a directory, ENOENT if the directory is not visible to the
  * caller, ENOMEM if an entry could not be allocated, or the error from
  * uiomove().
  */
 static int
 pfs_readdir(struct vop_readdir_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pd = pvd->pvd_pn;
 	pid_t pid = pvd->pvd_pid;
 	struct proc *p, *proc;
 	struct pfs_node *pn;
 	struct uio *uio;
 	struct pfsentry *pfsent, *pfsent2;
 	struct pfsdirentlist lst;
 	off_t offset;
 	int error, i, resid;
 
 	STAILQ_INIT(&lst);
 	error = 0;
 	KASSERT(pd->pn_info == vn->v_mount->mnt_data,
 	    ("%s(): pn_info does not match mountpoint", __func__));
 	PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid));
 	pfs_assert_not_owned(pd);
 
 	if (vn->v_type != VDIR)
 		PFS_RETURN (ENOTDIR);
 	KASSERT_PN_IS_DIR(pd);
 	uio = va->a_uio;
 
 	/* only allow reading entire entries */
 	offset = uio->uio_offset;
 	resid = uio->uio_resid;
 	if (offset < 0 || offset % PFS_DELEN != 0 ||
 	    (resid && resid < PFS_DELEN))
 		PFS_RETURN (EINVAL);
 	if (resid == 0)
 		PFS_RETURN (0);
 
 	sx_slock(&allproc_lock);
 	pfs_lock(pd);
 
 	/* check if the directory is visible to the caller */
 	if (!pfs_visible(curthread, pd, pid, true, &proc)) {
 		sx_sunlock(&allproc_lock);
 		pfs_unlock(pd);
 		PFS_RETURN (ENOENT);
 	}
 	KASSERT(pid == NO_PID || proc != NULL,
 	    ("%s(): no process for pid %lu", __func__, (unsigned long)pid));
 
 	/*
 	 * skip unwanted entries
 	 *
 	 * This counts 'offset' down to exactly 0 (it was checked to be a
 	 * multiple of PFS_DELEN), so from here on 'offset' is relative to
 	 * the starting position uio->uio_offset.
 	 */
 	for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) {
 		if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) {
 			/* nothing left... */
 			if (proc != NULL)
 				PROC_UNLOCK(proc);
 			pfs_unlock(pd);
 			sx_sunlock(&allproc_lock);
 			PFS_RETURN (0);
 		}
 	}
 
 	/* fill in entries */
 	while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 &&
 	    resid >= PFS_DELEN) {
 		if ((pfsent = malloc(sizeof(struct pfsentry), M_IOV,
 		    M_NOWAIT | M_ZERO)) == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		pfsent->entry.d_reclen = PFS_DELEN;
 		pfsent->entry.d_fileno = pn_fileno(pn, pid);
 		/* PFS_DELEN was picked to fit PFS_NAMLEN */
 		for (i = 0; i < PFS_NAMELEN - 1 && pn->pn_name[i] != '\0'; ++i)
 			pfsent->entry.d_name[i] = pn->pn_name[i];
 		pfsent->entry.d_name[i] = 0;
 		pfsent->entry.d_namlen = i;
+		/*
+		 * NOTE: d_off is the absolute offset of the *next* entry.
+		 * 'offset' is relative to where this read started (see the
+		 * skip loop above), so add the starting position back;
+		 * uio->uio_offset is unchanged until uiomove() runs below.
+		 */
+		pfsent->entry.d_off = uio->uio_offset + offset + PFS_DELEN;
 		switch (pn->pn_type) {
 		case pfstype_procdir:
 			KASSERT(p != NULL,
 			    ("reached procdir node with p == NULL"));
 			/* A procdir entry is named after the pid. */
 			pfsent->entry.d_namlen = snprintf(pfsent->entry.d_name,
 			    PFS_NAMELEN, "%d", p->p_pid);
 			/* fall through */
 		case pfstype_root:
 		case pfstype_dir:
 		case pfstype_this:
 		case pfstype_parent:
 			pfsent->entry.d_type = DT_DIR;
 			break;
 		case pfstype_file:
 			pfsent->entry.d_type = DT_REG;
 			break;
 		case pfstype_symlink:
 			pfsent->entry.d_type = DT_LNK;
 			break;
 		default:
 			panic("%s has unexpected node type: %d", pn->pn_name, pn->pn_type);
 		}
 		PFS_TRACE(("%s", pfsent->entry.d_name));
 		STAILQ_INSERT_TAIL(&lst, pfsent, link);
 		offset += PFS_DELEN;
 		resid -= PFS_DELEN;
 	}
 	if (proc != NULL)
 		PROC_UNLOCK(proc);
 	pfs_unlock(pd);
 	sx_sunlock(&allproc_lock);
 	/*
 	 * Copy the collected entries out now that no locks are held.  After
 	 * the first error nothing more is copied, but every entry is freed.
 	 */
 	i = 0;
 	STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) {
 		if (error == 0)
 			error = uiomove(&pfsent->entry, PFS_DELEN, uio);
 		free(pfsent, M_IOV);
 		i++;
 	}
 	PFS_TRACE(("%ju bytes", (uintmax_t)(i * PFS_DELEN)));
 	PFS_RETURN (error);
 }
 
 /*
  * Read a symbolic link
  *
  * The link target is produced by the node's fill handler into an on-stack
  * buffer of PATH_MAX bytes and then copied to the caller.  Returns EINVAL
  * if the vnode is not a symlink, EIO if the node has no fill handler or
  * its process is gone or exiting, ENAMETOOLONG if sbuf_finish() reports a
  * failure, the handler's error, or the result of uiomove_frombuf().
  */
 static int
 pfs_readlink(struct vop_readlink_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct uio *uio = va->a_uio;
 	struct proc *proc = NULL;
 	char buf[PATH_MAX];
 	struct sbuf sb;
 	int error, locked;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VLNK)
 		PFS_RETURN (EINVAL);
 	KASSERT_PN_IS_LINK(pn);
 
 	if (pn->pn_fill == NULL)
 		PFS_RETURN (EIO);
 
 	/* Hold (rather than lock) the process while the handler runs. */
 	if (pvd->pvd_pid != NO_PID) {
 		if ((proc = pfind(pvd->pvd_pid)) == NULL)
 			PFS_RETURN (EIO);
 		if (proc->p_flag & P_WEXIT) {
 			PROC_UNLOCK(proc);
 			PFS_RETURN (EIO);
 		}
 		_PHOLD(proc);
 		PROC_UNLOCK(proc);
 	}
 	/* The handler is called with the vnode unlocked but held. */
 	vhold(vn);
 	locked = VOP_ISLOCKED(vn);
 	VOP_UNLOCK(vn, 0);
 
 	/* sbuf_new() can't fail with a static buffer */
 	sbuf_new(&sb, buf, sizeof buf, 0);
 
 	error = pn_fill(curthread, proc, pn, &sb, NULL);
 
 	if (proc != NULL)
 		PRELE(proc);
 	vn_lock(vn, locked | LK_RETRY);
 	vdrop(vn);
 
 	if (error) {
 		sbuf_delete(&sb);
 		PFS_RETURN (error);
 	}
 
 	if (sbuf_finish(&sb) != 0) {
 		sbuf_delete(&sb);
 		PFS_RETURN (ENAMETOOLONG);
 	}
 
 	error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio);
 	sbuf_delete(&sb);
 	PFS_RETURN (error);
 }
 
 /*
  * Reclaim a vnode
  */
 static int
 pfs_reclaim(struct vop_reclaim_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	return (pfs_vncache_free(va->a_vp));
 }
 
 /*
  * Set attributes
  *
  * Attributes cannot be changed: this always returns EOPNOTSUPP.
  */
 static int
 pfs_setattr(struct vop_setattr_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	PFS_RETURN (EOPNOTSUPP);
 }
 
 /*
  * Write to a file
  *
  * Returns EINVAL if the vnode is not a regular file, EBADF if the node is
  * not writable (PFS_WR clear) and EIO if it has no fill handler or is no
  * longer visible to the caller.  Nodes flagged PFS_RAWWR get the uio
  * directly.  For all others the caller's data is first gathered into an
  * sbuf with sbuf_uionew() and that sbuf is passed to the fill handler.
  * The target process, if any, is held (not locked) during the call.
  */
 static int
 pfs_write(struct vop_write_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct uio *uio = va->a_uio;
 	struct proc *proc;
 	struct sbuf sb;
 	int error;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VREG)
 		PFS_RETURN (EINVAL);
 	KASSERT_PN_IS_FILE(pn);
 
 	if (!(pn->pn_flags & PFS_WR))
 		PFS_RETURN (EBADF);
 
 	if (pn->pn_fill == NULL)
 		PFS_RETURN (EIO);
 
 	/*
 	 * This is necessary because either process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
 		PFS_RETURN (EIO);
 	/* Trade the process lock for a hold while the handler runs. */
 	if (proc != NULL) {
 		_PHOLD(proc);
 		PROC_UNLOCK(proc);
 	}
 
 	if (pn->pn_flags & PFS_RAWWR) {
 		error = pn_fill(curthread, proc, pn, NULL, uio);
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (error);
 	}
 
 	/* No sbuf_delete() on this error path; only the hold is released. */
 	sbuf_uionew(&sb, uio, &error);
 	if (error) {
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (error);
 	}
 
 	error = pn_fill(curthread, proc, pn, &sb, uio);
 
 	sbuf_delete(&sb);
 	if (proc != NULL)
 		PRELE(proc);
 	PFS_RETURN (error);
 }
 
 /*
  * Vnode operations
  *
  * Operations not listed fall back to default_vnodeops.  Name lookups go
  * through vfs_cache_lookup(), with pfs_lookup() installed as the cached
  * lookup routine.  The operations that would create, remove or rename
  * entries are explicitly set to VOP_EOPNOTSUPP.
  */
 struct vop_vector pfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		pfs_access,
 	.vop_cachedlookup =	pfs_lookup,
 	.vop_close =		pfs_close,
 	.vop_create =		VOP_EOPNOTSUPP,
 	.vop_getattr =		pfs_getattr,
 	.vop_getextattr =	pfs_getextattr,
 	.vop_ioctl =		pfs_ioctl,
 	.vop_link =		VOP_EOPNOTSUPP,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		VOP_EOPNOTSUPP,
 	.vop_mknod =		VOP_EOPNOTSUPP,
 	.vop_open =		pfs_open,
 	.vop_read =		pfs_read,
 	.vop_readdir =		pfs_readdir,
 	.vop_readlink =		pfs_readlink,
 	.vop_reclaim =		pfs_reclaim,
 	.vop_remove =		VOP_EOPNOTSUPP,
 	.vop_rename =		VOP_EOPNOTSUPP,
 	.vop_rmdir =		VOP_EOPNOTSUPP,
 	.vop_setattr =		pfs_setattr,
 	.vop_symlink =		VOP_EOPNOTSUPP,
 	.vop_vptocnp =		pfs_vptocnp,
 	.vop_write =		pfs_write,
 	/* XXX I've probably forgotten a few that need VOP_EOPNOTSUPP */
 };
Index: stable/12/sys/fs/udf/udf_vnops.c
===================================================================
--- stable/12/sys/fs/udf/udf_vnops.c	(revision 340720)
+++ stable/12/sys/fs/udf/udf_vnops.c	(revision 340721)
@@ -1,1492 +1,1495 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001, 2002 Scott Long <scottl@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /* udf_vnops.c */
 /* Take care of the vnode side of things */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/iconv.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/queue.h>
 #include <sys/unistd.h>
 #include <sys/endian.h>
 
 #include <vm/uma.h>
 
 #include <fs/udf/ecma167-udf.h>
 #include <fs/udf/osta.h>
 #include <fs/udf/udf.h>
 #include <fs/udf/udf_mount.h>
 
 extern struct iconv_functions *udf_iconv;
 
 static vop_access_t	udf_access;
 static vop_getattr_t	udf_getattr;
 static vop_open_t	udf_open;
 static vop_ioctl_t	udf_ioctl;
 static vop_pathconf_t	udf_pathconf;
 static vop_print_t	udf_print;
 static vop_read_t	udf_read;
 static vop_readdir_t	udf_readdir;
 static vop_readlink_t	udf_readlink;
 static vop_setattr_t	udf_setattr;
 static vop_strategy_t	udf_strategy;
 static vop_bmap_t	udf_bmap;
 static vop_cachedlookup_t	udf_lookup;
 static vop_reclaim_t	udf_reclaim;
 static vop_vptofh_t	udf_vptofh;
 static int udf_readatoffset(struct udf_node *node, int *size, off_t offset,
     struct buf **bp, uint8_t **data);
 static int udf_bmap_internal(struct udf_node *node, off_t offset,
     daddr_t *sector, uint32_t *max_size);
 
 /*
  * Operation vector installed on UDF vnodes by udf_allocv().  Operations
  * not listed here are delegated to default_vnodeops through .vop_default.
  * No write-side operations (vop_write, vop_create, ...) are provided.
  */
 static struct vop_vector udf_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		udf_access,
 	.vop_bmap =		udf_bmap,
 	.vop_cachedlookup =	udf_lookup,
 	.vop_getattr =		udf_getattr,
 	.vop_ioctl =		udf_ioctl,
 	/* Name-cache front end; misses end up in .vop_cachedlookup. */
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_open =		udf_open,
 	.vop_pathconf =		udf_pathconf,
 	.vop_print =		udf_print,
 	.vop_read =		udf_read,
 	.vop_readdir =		udf_readdir,
 	.vop_readlink =		udf_readlink,
 	.vop_reclaim =		udf_reclaim,
 	.vop_setattr =		udf_setattr,
 	.vop_strategy =		udf_strategy,
 	.vop_vptofh =		udf_vptofh,
 };
 
 /*
  * Operation vector for FIFO nodes: the generic fifo_specops supplies the
  * defaults, while attribute, print, reclaim and file-handle operations
  * are the UDF ones.
  */
 struct vop_vector udf_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		udf_access,
 	.vop_getattr =		udf_getattr,
 	.vop_pathconf =		udf_pathconf,
 	.vop_print =		udf_print,
 	.vop_reclaim =		udf_reclaim,
 	.vop_setattr =		udf_setattr,
 	.vop_vptofh =		udf_vptofh,
 };
 
 static MALLOC_DEFINE(M_UDFFID, "udf_fid", "UDF FileId structure");
 static MALLOC_DEFINE(M_UDFDS, "udf_ds", "UDF Dirstream structure");
 
 /* Sentinel error value checked by udf_bmap() and udf_readatoffset(). */
 #define UDF_INVALID_BMAP	(-1)
 
 /*
  * Obtain a fresh vnode for mount 'mp' that uses the UDF operation vector
  * and hand it back through 'vpp'.  Returns 0 on success or the error from
  * getnewvnode().  'td' is not used.
  */
 int
 udf_allocv(struct mount *mp, struct vnode **vpp, struct thread *td)
 {
 	struct vnode *nvp;
 	int rc;
 
 	rc = getnewvnode("udf", mp, &udf_vnodeops, &nvp);
 	if (rc != 0) {
 		printf("udf_allocv: failed to allocate new vnode\n");
 		return (rc);
 	}
 	*vpp = nvp;
 
 	return (0);
 }
 
 /*
  * Convert file entry permission (5 bits per owner/group/user) to a mode_t.
  *
  * The three permission groups are masked out of the little-endian 'perm'
  * word and shifted down into place; the sticky/setgid/setuid bits come
  * from the ICB tag flags and are shifted up.
  *
  * NOTE(review): the shift amounts for the three flag bits depend on the
  * UDF_ICB_TAG_FLAGS_* values, which are defined elsewhere -- confirm they
  * land on S_ISVTX/S_ISGID/S_ISUID.
  */
 static mode_t
 udf_permtomode(struct udf_node *node)
 {
 	const uint32_t perm = le32toh(node->fentry->perm);
 	const uint16_t flags = le16toh(node->fentry->icbtag.flags);
 
 	return ((perm & UDF_FENTRY_PERM_USER_MASK) |
 	    ((perm & UDF_FENTRY_PERM_GRP_MASK) >> 2) |
 	    ((perm & UDF_FENTRY_PERM_OWNER_MASK) >> 4) |
 	    ((flags & UDF_ICB_TAG_FLAGS_STICKY) << 4) |
 	    ((flags & UDF_ICB_TAG_FLAGS_SETGID) << 6) |
 	    ((flags & UDF_ICB_TAG_FLAGS_SETUID) << 8));
 }
 
 /*
  * VOP_ACCESS: check whether the caller's credentials allow 'a_accmode'.
  *
  * Write access to directories, symlinks and regular files is always
  * refused with EROFS.  Everything else is decided by vaccess() using the
  * mode, owner and group recorded in the file entry.
  */
 static int
 udf_access(struct vop_access_args *a)
 {
 	struct vnode *vp;
 	struct udf_node *node;
 	accmode_t accmode;
 	mode_t mode;
 
 	vp = a->a_vp;
 	node = VTON(vp);
 	accmode = a->a_accmode;
 
 	if (accmode & VWRITE) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			return (EROFS);
 			/* NOT REACHED */
 		default:
 			break;
 		}
 	}
 
 	mode = udf_permtomode(node);
 
 	/*
 	 * The ids are stored little-endian on disk; convert them the same
 	 * way udf_getattr() does so big-endian hosts compare real ids.
 	 */
 	return (vaccess(vp->v_type, mode, le32toh(node->fentry->uid),
 	    le32toh(node->fentry->gid), accmode, a->a_cred, NULL));
 }
 
 /*
  * VOP_OPEN: make sure the vnode has a VM object sized to the file's
  * recorded information length.  Always returns 0.
  */
 static int
 udf_open(struct vop_open_args *ap)
 {
 	struct udf_node *node;
 	off_t length;
 
 	node = VTON(ap->a_vp);
 	length = le64toh(node->fentry->inf_len);
 	vnode_create_vobject(ap->a_vp, length, ap->a_td);
 
 	return (0);
 }
 
 /*
  * Number of days that precede the first day of each month, indexed by
  * [is leap year][month - 1].  Despite the name these are cumulative day
  * counts, not individual month lengths.
  */
 static const int mon_lens[2][12] = {
 	{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334},
 	{0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335}
 };
 
 /*
  * Return 1 if 'year' is a leap year in the Gregorian calendar (divisible
  * by 4 and not by 100, or divisible by 400), 0 otherwise.
  */
 static int
 udf_isaleapyear(int year)
 {
 
 	if (year % 400 == 0)
 		return (1);
 	if (year % 100 == 0)
 		return (0);
 	return (year % 4 == 0);
 }
 
 /*
  * Timezone calculation compliments of Julian Elischer <julian@elischer.org>.
  */
 static void
 udf_timetotimespec(struct timestamp *time, struct timespec *t)
 {
 	int i, lpyear, daysinyear, year, startyear;
 	union {
 		uint16_t	u_tz_offset;
 		int16_t		s_tz_offset;
 	} tz;
 
 	/*
 	 * DirectCD seems to like using bogus year values.
 	 * Don't trust time->month as it will be used for an array index.
 	 */
 	year = le16toh(time->year);
 	if (year < 1970 || time->month < 1 || time->month > 12) {
 		t->tv_sec = 0;
 		t->tv_nsec = 0;
 		return;
 	}
 
 	/* Calculate the time and day */
 	t->tv_sec = time->second;
 	t->tv_sec += time->minute * 60;
 	t->tv_sec += time->hour * 3600;
 	t->tv_sec += (time->day - 1) * 3600 * 24;
 
 	/* Calculate the month */
 	lpyear = udf_isaleapyear(year);
 	t->tv_sec += mon_lens[lpyear][time->month - 1] * 3600 * 24;
 
 	/* Speed up the calculation */
 	startyear = 1970;
 	if (year > 2009) {
 		t->tv_sec += 1262304000;
 		startyear += 40;
 	} else if (year > 1999) {
 		t->tv_sec += 946684800;
 		startyear += 30;
 	} else if (year > 1989) {
 		t->tv_sec += 631152000;
 		startyear += 20;
 	} else if (year > 1979) {
 		t->tv_sec += 315532800;
 		startyear += 10;
 	}
 
 	daysinyear = (year - startyear) * 365;
 	for (i = startyear; i < year; i++)
 		daysinyear += udf_isaleapyear(i);
 	t->tv_sec += daysinyear * 3600 * 24;
 
 	/* Calculate microseconds */
 	t->tv_nsec = time->centisec * 10000 + time->hund_usec * 100 +
 	    time->usec;
 
 	/*
 	 * Calculate the time zone.  The timezone is 12 bit signed 2's
 	 * complement, so we gotta do some extra magic to handle it right.
 	 */
 	tz.u_tz_offset = le16toh(time->type_tz);
 	tz.u_tz_offset &= 0x0fff;
 	if (tz.u_tz_offset & 0x0800)
 		tz.u_tz_offset |= 0xf000;	/* extend the sign to 16 bits */
 	if ((le16toh(time->type_tz) & 0x1000) && (tz.s_tz_offset != -2047))
 		t->tv_sec -= tz.s_tz_offset * 60;
 
 	return;
 }
 
 static int
 udf_getattr(struct vop_getattr_args *a)
 {
 	struct vnode *vp;
 	struct udf_node *node;
 	struct vattr *vap;
 	struct file_entry *fentry;
 	struct timespec ts;
 
 	ts.tv_sec = 0;
 
 	vp = a->a_vp;
 	vap = a->a_vap;
 	node = VTON(vp);
 	fentry = node->fentry;
 
 	vap->va_fsid = dev2udev(node->udfmp->im_dev);
 	vap->va_fileid = node->hash_id;
 	vap->va_mode = udf_permtomode(node);
 	vap->va_nlink = le16toh(fentry->link_cnt);
 	/*
 	 * XXX The spec says that -1 is valid for uid/gid and indicates an
 	 * invalid uid/gid.  How should this be represented?
 	 */
 	vap->va_uid = (le32toh(fentry->uid) == -1) ? 0 : le32toh(fentry->uid);
 	vap->va_gid = (le32toh(fentry->gid) == -1) ? 0 : le32toh(fentry->gid);
 	udf_timetotimespec(&fentry->atime, &vap->va_atime);
 	udf_timetotimespec(&fentry->mtime, &vap->va_mtime);
 	vap->va_ctime = vap->va_mtime; /* XXX Stored as an Extended Attribute */
 	vap->va_rdev = NODEV;
 	if (vp->v_type & VDIR) {
 		/*
 		 * Directories that are recorded within their ICB will show
 		 * as having 0 blocks recorded.  Since tradition dictates
 		 * that directories consume at least one logical block,
 		 * make it appear so.
 		 */
 		if (fentry->logblks_rec != 0) {
 			vap->va_size =
 			    le64toh(fentry->logblks_rec) * node->udfmp->bsize;
 		} else {
 			vap->va_size = node->udfmp->bsize;
 		}
 	} else {
 		vap->va_size = le64toh(fentry->inf_len);
 	}
 	vap->va_flags = 0;
 	vap->va_gen = 1;
 	vap->va_blocksize = node->udfmp->bsize;
 	vap->va_bytes = le64toh(fentry->inf_len);
 	vap->va_type = vp->v_type;
 	vap->va_filerev = 0; /* XXX */
 	return (0);
 }
 
 static int
 udf_setattr(struct vop_setattr_args *a)
 {
 	struct vnode *vp;
 	struct vattr *vap;
 
 	vp = a->a_vp;
 	vap = a->a_vap;
 	if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL)
 		return (EROFS);
 	if (vap->va_size != (u_quad_t)VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			return (EROFS);
 		case VCHR:
 		case VBLK:
 		case VSOCK:
 		case VFIFO:
 		case VNON:
 		case VBAD:
 		case VMARKER:
 			return (0);
 		}
 	}
 	return (0);
 }
 
 /*
  * File specific ioctls.
  *
  * None are implemented: every request is logged to the console with the
  * function name and rejected with ENOTTY.
  */
 static int
 udf_ioctl(struct vop_ioctl_args *a)
 {
 	printf("%s called\n", __func__);
 	return (ENOTTY);
 }
 
 /*
  * VOP_PATHCONF: report filesystem limits.
  *
  * I'm not sure that this has much value in a read-only filesystem, but
  * cd9660 has it too.  Names not handled here go to vop_stdpathconf().
  * _PC_PIPE_BUF is answered only for directories and FIFOs and yields
  * EINVAL on any other vnode type.
  */
 static int
 udf_pathconf(struct vop_pathconf_args *a)
 {
 	int value;
 
 	switch (a->a_name) {
 	case _PC_FILESIZEBITS:
 		value = 64;
 		break;
 	case _PC_LINK_MAX:
 		value = 65535;
 		break;
 	case _PC_NAME_MAX:
 		value = NAME_MAX;
 		break;
 	case _PC_SYMLINK_MAX:
 		value = MAXPATHLEN;
 		break;
 	case _PC_NO_TRUNC:
 		value = 1;
 		break;
 	case _PC_PIPE_BUF:
 		if (a->a_vp->v_type != VDIR && a->a_vp->v_type != VFIFO)
 			return (EINVAL);
 		value = PIPE_BUF;
 		break;
 	default:
 		return (vop_stdpathconf(a));
 	}
 
 	*a->a_retval = value;
 	return (0);
 }
 
 /*
  * VOP_PRINT: print the node's id and the name of its backing device,
  * followed by the FIFO state when the vnode is a FIFO.  Always returns 0.
  */
 static int
 udf_print(struct vop_print_args *ap)
 {
 	struct udf_node *node;
 
 	node = VTON(ap->a_vp);
 	printf("    ino %lu, on dev %s", (u_long)node->hash_id,
 	    devtoname(node->udfmp->im_dev));
 	if (ap->a_vp->v_type == VFIFO)
 		fifo_printinfo(ap->a_vp);
 	printf("\n");
 
 	return (0);
 }
 
 /*
  * Byte offset <-> logical block helpers, driven by the mount's bshift
  * and bmask fields:
  *   lblkno     - logical block number containing byte offset 'loc'
  *   blkoff     - offset of 'loc' within its logical block
  *   lblktosize - byte offset at which logical block 'blk' starts
  */
 #define lblkno(udfmp, loc)	((loc) >> (udfmp)->bshift)
 #define blkoff(udfmp, loc)	((loc) & (udfmp)->bmask)
 #define lblktosize(udfmp, blk)	((blk) << (udfmp)->bshift)
 
 static inline int
 is_data_in_fentry(const struct udf_node *node)
 {
 	const struct file_entry *fentry = node->fentry;
 
 	return ((le16toh(fentry->icbtag.flags) & 0x7) == 3);
 }
 
 /*
  * VOP_READ: copy file data into the caller's uio.
  *
  * Two paths exist.  When the data is embedded in the file entry it is
  * copied straight out of the in-core file entry.  Otherwise the file is
  * read block by block through the buffer cache, using cluster_read()
  * for read-ahead unless the mount has MNT_NOCLUSTERR set or the next
  * block would start at or past end of file.
  *
  * Returns 0 on success (including reads at or past EOF), EINVAL for a
  * negative offset, or the error from bread()/cluster_read()/uiomove().
  */
 static int
 udf_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct udf_node *node = VTON(vp);
 	struct udf_mnt *udfmp;
 	struct file_entry *fentry;
 	struct buf *bp;
 	uint8_t *data;
 	daddr_t lbn, rablock;
 	off_t diff, fsize;
 	ssize_t n;
 	int error = 0;
 	long size, on;
 
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	if (is_data_in_fentry(node)) {
 		/* Embedded data starts after the extended attributes. */
 		fentry = node->fentry;
 		data = &fentry->data[le32toh(fentry->l_ea)];
 		fsize = le32toh(fentry->l_ad);
 
 		/* Clamp the request to what is left before EOF. */
 		n = uio->uio_resid;
 		diff = fsize - uio->uio_offset;
 		if (diff <= 0)
 			return (0);
 		if (diff < n)
 			n = diff;
 		error = uiomove(data + uio->uio_offset, (int)n, uio);
 		return (error);
 	}
 
 	fsize = le64toh(node->fentry->inf_len);
 	udfmp = node->udfmp;
 	do {
 		/* Locate the block and the starting offset inside it. */
 		lbn = lblkno(udfmp, uio->uio_offset);
 		on = blkoff(udfmp, uio->uio_offset);
 		/* At most the rest of this block, the request, or up to EOF. */
 		n = min((u_int)(udfmp->bsize - on),
 			uio->uio_resid);
 		diff = fsize - uio->uio_offset;
 		if (diff <= 0)
 			return (0);
 		if (diff < n)
 			n = diff;
 		size = udfmp->bsize;
 		rablock = lbn + 1;
 		if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			if (lblktosize(udfmp, rablock) < fsize) {
 				error = cluster_read(vp, fsize, lbn, size,
 				    NOCRED, uio->uio_resid,
 				    (ap->a_ioflag >> 16), 0, &bp);
 			} else {
 				error = bread(vp, lbn, size, NOCRED, &bp);
 			}
 		} else {
 			error = bread(vp, lbn, size, NOCRED, &bp);
 		}
 		if (error != 0) {
 			brelse(bp);
 			return (error);
 		}
 		/* Do not copy more than the buffer actually holds. */
 		n = min(n, size - bp->b_resid);
 
 		error = uiomove(bp->b_data + on, (int)n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	return (error);
 }
 
 /*
  * Call the OSTA routines to translate the name from a CS0 dstring to a
  * 16-bit Unicode String.  Hooks need to be placed in here to translate from
  * Unicode to the encoding that the kernel/user expects.  Return the length
  * of the translated string.
  *
  * 'cs0string'/'len' describe the on-disk name; the NUL-terminated result
  * is written to 'destname'.  Returns 0 when the OSTA decompression step
  * fails.
  *
  * With UDFMNT_KICONV set and an iconv module loaded the name goes through
  * the mount's iconv handle, at most MAXNAMLEN output bytes, with '?'
  * substituted for characters that cannot be converted.  Otherwise every
  * 16-bit character with a non-zero high byte is replaced by '.'.
  */
 static int
 udf_transname(char *cs0string, char *destname, int len, struct udf_mnt *udfmp)
 {
 	unicode_t *transname;
 	char *unibuf, *unip;
 	int i, destlen;
 	ssize_t unilen = 0;
 	size_t destleft = MAXNAMLEN;
 
 	/* Convert 16-bit Unicode to destname */
 	if (udfmp->im_flags & UDFMNT_KICONV && udf_iconv) {
 		/* allocate a buffer big enough to hold an 8->16 bit expansion */
 		unibuf = uma_zalloc(udf_zone_trans, M_WAITOK);
 		/* conv() advances unibuf; keep the base pointer for the free. */
 		unip = unibuf;
 		if ((unilen = (ssize_t)udf_UncompressUnicodeByte(len, cs0string, unibuf)) == -1) {
 			printf("udf: Unicode translation failed\n");
 			uma_zfree(udf_zone_trans, unibuf);
 			return 0;
 		}
 
 		while (unilen > 0 && destleft > 0) {
 			udf_iconv->conv(udfmp->im_d2l, __DECONST(const char **,
 			    &unibuf), (size_t *)&unilen, (char **)&destname,
 			    &destleft);
 			/* Unconverted character found */
 			if (unilen > 0 && destleft > 0) {
 				*destname++ = '?';
 				destleft--;
 				/* Skip the 2-byte character that failed. */
 				unibuf += 2;
 				unilen -= 2;
 			}
 		}
 		uma_zfree(udf_zone_trans, unip);
 		*destname = '\0';
 		destlen = MAXNAMLEN - (int)destleft;
 	} else {
 		/* allocate a buffer big enough to hold an 8->16 bit expansion */
 		transname = uma_zalloc(udf_zone_trans, M_WAITOK);
 
 		if ((unilen = (ssize_t)udf_UncompressUnicode(len, cs0string, transname)) == -1) {
 			printf("udf: Unicode translation failed\n");
 			uma_zfree(udf_zone_trans, transname);
 			return 0;
 		}
 
 		for (i = 0; i < unilen ; i++) {
 			if (transname[i] & 0xff00) {
 				destname[i] = '.';	/* Fudge the 16bit chars */
 			} else {
 				destname[i] = transname[i] & 0xff;
 			}
 		}
 		uma_zfree(udf_zone_trans, transname);
 		destname[unilen] = 0;
 		destlen = (int)unilen;
 	}
 
 	return (destlen);
 }
 
 /*
  * Compare a CS0 dstring with a name passed in from the VFS layer.  Return
  * 0 on a successful match, nonzero otherwise.  Unicode work may need to be
  * done here also.
  *
  * The on-disk name is first translated with udf_transname(); a
  * translation of length zero, or of a length different from 'cmplen',
  * never matches.
  */
 static int
 udf_cmpname(char *cs0string, char *cmpname, int cs0len, int cmplen, struct udf_mnt *udfmp)
 {
 	char *xlated;
 	int xlen, rc;
 
 	/* This is overkill, but not worth creating a new zone */
 	xlated = uma_zalloc(udf_zone_trans, M_WAITOK);
 	xlen = udf_transname(cs0string, xlated, cs0len, udfmp);
 
 	/* Only equal-length, non-empty names are worth comparing. */
 	if (xlen != 0 && xlen == cmplen)
 		rc = bcmp(xlated, cmpname, cmplen);
 	else
 		rc = -1;
 
 	uma_zfree(udf_zone_trans, xlated);
 	return (rc);
 }
 
 /*
  * Bookkeeping shared between udf_readdir() and udf_uiodir() while
  * directory entries are copied out.
  */
 struct udf_uiodir {
 	struct dirent *dirent;	/* entry to copy out next */
 	u_long *cookies;	/* next free cookie slot, NULL if unwanted */
 	int ncookies;		/* number of cookie slots allocated */
 	int acookies;		/* number of cookie slots consumed */
 	int eofflag;		/* cleared when output space runs out */
 };
 
 /*
  * Copy the directory entry in uiodir->dirent ('de_size' bytes) to 'uio'
  * and, when cookies were requested, record 'cookie' for it.
  *
  * Returns the uiomove() result, or -1 with eofflag cleared when either
  * the cookie array or the uio has no room left for this entry.
  */
 static int
 udf_uiodir(struct udf_uiodir *uiodir, int de_size, struct uio *uio, long cookie)
 {
 
 	if (uiodir->cookies != NULL) {
 		uiodir->acookies++;
 		if (uiodir->acookies > uiodir->ncookies)
 			goto nospace;
 		*uiodir->cookies = cookie;
 		uiodir->cookies++;
 	}
 
 	if (uio->uio_resid < de_size)
 		goto nospace;
 
 	return (uiomove(uiodir->dirent, de_size, uio));
 
 nospace:
 	uiodir->eofflag = 0;
 	return (-1);
 }
 
 /*
  * Allocate a zeroed directory stream for 'node' that starts at byte
  * 'offset' of a directory 'fsize' bytes long.  Release it with
  * udf_closedir().
  */
 static struct udf_dirstream *
 udf_opendir(struct udf_node *node, int offset, int fsize, struct udf_mnt *udfmp)
 {
 	struct udf_dirstream *stream;
 
 	stream = uma_zalloc(udf_zone_ds, M_WAITOK | M_ZERO);
 	stream->udfmp = udfmp;
 	stream->node = node;
 	stream->fsize = fsize;
 	stream->offset = offset;
 
 	return (stream);
 }
 
 /*
  * Return the next file identifier descriptor of an open directory
  * stream, or NULL at end of directory or on error (ds->error tells the
  * two apart: it is 0 at end of directory).
  *
  * A descriptor that straddles two extents is reassembled into ds->buf,
  * which stays valid until the next call or udf_closedir().  Every buffer
  * or allocation released here is also cleared in 'ds', so udf_closedir()
  * can run after any return without releasing something twice.
  */
 static struct fileid_desc *
 udf_getfid(struct udf_dirstream *ds)
 {
 	struct fileid_desc *fid;
 	int error, frag_size = 0, total_fid_size;
 
 	/* End of directory? */
 	if (ds->offset + ds->off >= ds->fsize) {
 		ds->error = 0;
 		return (NULL);
 	}
 
 	/* Grab the first extent of the directory */
 	if (ds->off == 0) {
 		ds->size = 0;
 		error = udf_readatoffset(ds->node, &ds->size, ds->offset,
 		    &ds->bp, &ds->data);
 		if (error) {
 			ds->error = error;
 			if (ds->bp != NULL) {
 				brelse(ds->bp);
 				/* udf_closedir() must not release it again. */
 				ds->bp = NULL;
 			}
 			return (NULL);
 		}
 	}
 
 	/*
 	 * Clean up from a previous fragmented FID.
 	 * XXX Is this the right place for this?
 	 */
 	if (ds->fid_fragment && ds->buf != NULL) {
 		ds->fid_fragment = 0;
 		free(ds->buf, M_UDFFID);
 		ds->buf = NULL;
 	}
 
 	fid = (struct fileid_desc*)&ds->data[ds->off];
 
 	/*
 	 * Check to see if the fid is fragmented. The first test
 	 * ensures that we don't wander off the end of the buffer
 	 * looking for the l_iu and l_fi fields.
 	 */
 	if (ds->off + UDF_FID_SIZE > ds->size ||
 	    ds->off + le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE > ds->size){
 
 		/* Copy what we have of the fid into a buffer */
 		frag_size = ds->size - ds->off;
 		if (frag_size >= ds->udfmp->bsize) {
 			printf("udf: invalid FID fragment\n");
 			ds->error = EINVAL;
 			return (NULL);
 		}
 
 		/*
 		 * File ID descriptors can only be at most one
 		 * logical sector in size.
 		 */
 		ds->buf = malloc(ds->udfmp->bsize, M_UDFFID,
 		     M_WAITOK | M_ZERO);
 		/*
 		 * Mark the stream as owning ds->buf right away so the
 		 * buffer is freed even if one of the steps below fails.
 		 */
 		ds->fid_fragment = 1;
 		bcopy(fid, ds->buf, frag_size);
 
 		/* Reduce all of the casting magic */
 		fid = (struct fileid_desc*)ds->buf;
 
 		if (ds->bp != NULL) {
 			brelse(ds->bp);
 			/* The read below may fail without replacing it. */
 			ds->bp = NULL;
 		}
 
 		/* Fetch the next allocation */
 		ds->offset += ds->size;
 		ds->size = 0;
 		error = udf_readatoffset(ds->node, &ds->size, ds->offset,
 		    &ds->bp, &ds->data);
 		if (error) {
 			ds->error = error;
 			return (NULL);
 		}
 
 		/*
 		 * If the fragment was so small that we didn't get
 		 * the l_iu and l_fi fields, copy those in.
 		 */
 		if (frag_size < UDF_FID_SIZE)
 			bcopy(ds->data, &ds->buf[frag_size],
 			    UDF_FID_SIZE - frag_size);
 
 		/*
 		 * Now that we have enough of the fid to work with,
 		 * copy in the rest of the fid from the new
 		 * allocation.
 		 */
 		total_fid_size = UDF_FID_SIZE + le16toh(fid->l_iu) + fid->l_fi;
 		if (total_fid_size > ds->udfmp->bsize) {
 			printf("udf: invalid FID\n");
 			ds->error = EIO;
 			return (NULL);
 		}
 		bcopy(ds->data, &ds->buf[frag_size],
 		    total_fid_size - frag_size);
 	} else {
 		total_fid_size = le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE;
 	}
 
 	/*
 	 * Update the offset. Align on a 4 byte boundary because the
 	 * UDF spec says so.
 	 */
 	ds->this_off = ds->offset + ds->off;
 	if (!ds->fid_fragment) {
 		ds->off += (total_fid_size + 3) & ~0x03;
 	} else {
 		/* The rest of the fid starts the freshly read extent. */
 		ds->off = (total_fid_size - frag_size + 3) & ~0x03;
 	}
 
 	return (fid);
 }
 
 /*
  * Tear down a directory stream: release the buffer it still holds, free
  * a pending reassembly buffer, then return the stream to its zone.
  */
 static void
 udf_closedir(struct udf_dirstream *ds)
 {
 	struct buf *held;
 
 	held = ds->bp;
 	if (held != NULL)
 		brelse(held);
 
 	if (ds->fid_fragment) {
 		if (ds->buf != NULL)
 			free(ds->buf, M_UDFFID);
 	}
 
 	uma_zfree(udf_zone_ds, ds);
 }
 
 /*
  * VOP_READDIR: copy directory entries starting at uio->uio_offset into
  * the caller's uio.
  *
  * The parent entry of the directory is expanded into both "." and "..";
  * deleted entries are skipped.  When the caller asks for cookies an
  * array is allocated here and handed over on success.  *a_eofflag, when
  * supplied, is left at 1 unless the output space ran out first.
  *
  * Returns 0 on success, EIO for a descriptor with a bad tag, or the
  * error from the directory stream or uiomove().
  */
 static int
 udf_readdir(struct vop_readdir_args *a)
 {
 	struct vnode *vp;
 	struct uio *uio;
 	struct dirent dir;
 	struct udf_node *node;
 	struct udf_mnt *udfmp;
 	struct fileid_desc *fid;
 	struct udf_uiodir uiodir;
 	struct udf_dirstream *ds;
 	u_long *cookies = NULL;
 	int ncookies;
 	int error = 0;
 
 	vp = a->a_vp;
 	uio = a->a_uio;
 	node = VTON(vp);
 	udfmp = node->udfmp;
 	uiodir.eofflag = 1;
 
 	if (a->a_ncookies != NULL) {
 		/*
 		 * Guess how many entries are needed.  If we run out, this
 		 * function will be called again and things will pick up
 		 * where they left off.
 		 */
 		ncookies = uio->uio_resid / 8;
 		/* M_WAITOK allocations sleep instead of returning NULL. */
 		cookies = malloc(sizeof(u_long) * ncookies,
 		    M_TEMP, M_WAITOK);
 		uiodir.ncookies = ncookies;
 		uiodir.cookies = cookies;
 		uiodir.acookies = 0;
 	} else {
 		uiodir.cookies = NULL;
 	}
 
 	/*
 	 * Iterate through the file id descriptors.  Give the parent dir
 	 * entry special attention.
 	 */
 	ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len),
 	    node->udfmp);
 
 	while ((fid = udf_getfid(ds)) != NULL) {
 
 		/* XXX Should we return an error on a bad fid? */
 		if (udf_checktag(&fid->tag, TAGID_FID)) {
 			printf("Invalid FID tag\n");
 			hexdump(fid, UDF_FID_SIZE, NULL, 0);
 			error = EIO;
 			break;
 		}
 
 		/* Is this a deleted file? */
 		if (fid->file_char & UDF_FILE_CHAR_DEL)
 			continue;
 
 		/*
 		 * 'dir' lives on the kernel stack and d_reclen covers the
 		 * padding after the name, so clear it before filling it in;
 		 * otherwise stale stack bytes are copied out to the caller.
 		 */
 		bzero(&dir, sizeof(dir));
 
 		if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) {
 			/* Do up the '.' and '..' entries.  Dummy values are
 			 * used for the cookies since the offset here is
 			 * usually zero, and NFS doesn't like that value
 			 */
 			dir.d_fileno = node->hash_id;
 			dir.d_type = DT_DIR;
 			dir.d_name[0] = '.';
 			dir.d_name[1] = '\0';
 			dir.d_namlen = 1;
 			dir.d_reclen = GENERIC_DIRSIZ(&dir);
+			dir.d_off = 1;
 			uiodir.dirent = &dir;
 			error = udf_uiodir(&uiodir, dir.d_reclen, uio, 1);
 			if (error)
 				break;
 
 			dir.d_fileno = udf_getid(&fid->icb);
 			dir.d_type = DT_DIR;
 			dir.d_name[0] = '.';
 			dir.d_name[1] = '.';
 			dir.d_name[2] = '\0';
 			dir.d_namlen = 2;
 			dir.d_reclen = GENERIC_DIRSIZ(&dir);
+			dir.d_off = 2;
 			uiodir.dirent = &dir;
 			error = udf_uiodir(&uiodir, dir.d_reclen, uio, 2);
 		} else {
 			dir.d_namlen = udf_transname(&fid->data[fid->l_iu],
 			    &dir.d_name[0], fid->l_fi, udfmp);
 			dir.d_fileno = udf_getid(&fid->icb);
 			dir.d_type = (fid->file_char & UDF_FILE_CHAR_DIR) ?
 			    DT_DIR : DT_UNKNOWN;
 			dir.d_reclen = GENERIC_DIRSIZ(&dir);
+			dir.d_off = ds->this_off;
 			uiodir.dirent = &dir;
 			error = udf_uiodir(&uiodir, dir.d_reclen, uio,
 			    ds->this_off);
 		}
 		if (error)
 			break;
 		uio->uio_offset = ds->offset + ds->off;
 	}
 
 	/*
 	 * Tell the calling layer whether we need to be called again.
 	 * Callers that do not care pass a NULL a_eofflag.
 	 */
 	if (a->a_eofflag != NULL)
 		*a->a_eofflag = uiodir.eofflag;
 
 	/* A negative value only means "output full", not a failure. */
 	if (error < 0)
 		error = 0;
 	if (!error)
 		error = ds->error;
 
 	udf_closedir(ds);
 
 	if (a->a_ncookies != NULL) {
 		if (error)
 			free(cookies, M_TEMP);
 		else {
 			*a->a_ncookies = uiodir.acookies;
 			*a->a_cookies = cookies;
 		}
 	}
 
 	return (error);
 }
 
 /*
  * VOP_READLINK: build the target path of a symbolic link in the
  * caller's uio.
  *
  * The link body is read into a temporary buffer and parsed as a list of
  * path components.  Components are joined with "/"; a leading root
  * component supplies the first "/" itself, so no extra separator is
  * emitted between it and the component that follows.
  *
  * Returns 0 on success, EINVAL for a malformed component list,
  * EOPNOTSUPP for an identifier that is not CS8, ENAMETOOLONG when the
  * caller's buffer is too small, or the error from VOP_READ()/uiomove().
  */
 static int
 udf_readlink(struct vop_readlink_args *ap)
 {
 	struct path_component *pc, *end;
 	struct vnode *vp;
 	struct uio uio;
 	struct iovec iov[1];
 	struct udf_node *node;
 	void *buf;
 	char *cp;
 	int error, len, root;
 
 	/*
 	 * A symbolic link in UDF is a list of variable-length path
 	 * component structures.  We build a pathname in the caller's
 	 * uio by traversing this list.
 	 */
 	vp = ap->a_vp;
 	node = VTON(vp);
 	len = le64toh(node->fentry->inf_len);
 	buf = malloc(len, M_DEVBUF, M_WAITOK);
 	iov[0].iov_len = len;
 	iov[0].iov_base = buf;
 	uio.uio_iov = iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = iov[0].iov_len;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = curthread;
 	error = VOP_READ(vp, &uio, 0, ap->a_cred);
 	if (error)
 		goto error;
 
 	/*
 	 * Parse only what was actually read: the buffer is not zeroed, so
 	 * anything past a short read is uninitialized memory.
 	 */
 	len -= (int)uio.uio_resid;
 
 	pc = buf;
 	end = (void *)((char *)buf + len);
 	root = 0;
 	while (pc < end) {
 		/*
 		 * Each component is a 4 byte header followed by 'length'
 		 * identifier bytes; all of it has to lie inside the buffer.
 		 */
 		if ((char *)end - (char *)pc < 4 ||
 		    pc->length > (char *)end - (char *)pc - 4) {
 			error = EINVAL;
 			goto error;
 		}
 
 		switch (pc->type) {
 		case UDF_PATH_ROOT:
 			/* Only allow this at the beginning of a path. */
 			if ((void *)pc != buf) {
 				error = EINVAL;
 				goto error;
 			}
 			cp = "/";
 			len = 1;
 			root = 1;
 			break;
 		case UDF_PATH_DOT:
 			cp = ".";
 			len = 1;
 			break;
 		case UDF_PATH_DOTDOT:
 			cp = "..";
 			len = 2;
 			break;
 		case UDF_PATH_PATH:
 			if (pc->length == 0) {
 				error = EINVAL;
 				goto error;
 			}
 			/*
 			 * XXX: We only support CS8 which appears to map
 			 * to ASCII directly.
 			 */
 			switch (pc->identifier[0]) {
 			case 8:
 				cp = pc->identifier + 1;
 				len = pc->length - 1;
 				break;
 			default:
 				error = EOPNOTSUPP;
 				goto error;
 			}
 			break;
 		default:
 			error = EINVAL;
 			goto error;
 		}
 
 		/*
 		 * If this is not the first component, insert a path
 		 * separator.
 		 */
 		if (pc != buf) {
 			if (root) {
 				/*
 				 * The root component already emitted a "/".
 				 * That only covers the component right after
 				 * it, so clear the flag for the ones after.
 				 */
 				root = 0;
 			} else {
 				if (ap->a_uio->uio_resid < 1) {
 					error = ENAMETOOLONG;
 					goto error;
 				}
 				error = uiomove("/", 1, ap->a_uio);
 				if (error)
 					break;
 			}
 		}
 
 		/* Append string at 'cp' of length 'len' to our path. */
 		if (len > ap->a_uio->uio_resid) {
 			error = ENAMETOOLONG;
 			goto error;
 		}
 		error = uiomove(cp, len, ap->a_uio);
 		if (error)
 			break;
 
 		/* Advance to next component. */
 		pc = (void *)((char *)pc + 4 + pc->length);
 	}
 error:
 	free(buf, M_DEVBUF);
 	return (error);
 }
 
 /*
  * VOP_STRATEGY: translate the buffer's logical block into a device block
  * (only when that has not been done yet, i.e. b_blkno still equals
  * b_lblkno) and pass the request to the mount's buffer object.
  *
  * A block that cannot be mapped completes the buffer cleared, with
  * b_blkno set to -1.  Always returns 0.
  */
 static int
 udf_strategy(struct vop_strategy_args *a)
 {
 	struct udf_node *node = VTON(a->a_vp);
 	struct buf *bp = a->a_bp;
 	uint32_t maxsize;
 	daddr_t sector;
 	off_t offset;
 
 	if (bp->b_blkno == bp->b_lblkno) {
 		offset = lblktosize(node->udfmp, bp->b_lblkno);
 		if (udf_bmap_internal(node, offset, &sector, &maxsize) != 0) {
 			clrbuf(bp);
 			bp->b_blkno = -1;
 			bufdone(bp);
 			return (0);
 		}
 		/* bmap gives sector numbers, bio works with device blocks */
 		bp->b_blkno = sector << (node->udfmp->bshift - DEV_BSHIFT);
 	}
 
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	BO_STRATEGY(node->udfmp->im_bo, bp);
 
 	return (0);
 }
 
 /*
  * VOP_BMAP: map logical block a_bn of the file to a device block.
  *
  * Outputs, each optional: *a_bop receives the device vnode's buffer
  * object, *a_bnp the device block number, *a_runp the number of blocks
  * that may be read ahead after a_bn (limited by MAXBSIZE), and *a_runb
  * is always 0 (no read-behind).
  *
  * Returns 0 on success, EOPNOTSUPP when the file data is embedded in the
  * file entry, or the error from udf_bmap_internal().
  */
 static int
 udf_bmap(struct vop_bmap_args *a)
 {
 	struct udf_node *node;
 	uint32_t max_size;
 	daddr_t lsector;
 	int nblk;
 	int error;
 
 	node = VTON(a->a_vp);
 
 	if (a->a_bop != NULL)
 		*a->a_bop = &node->udfmp->im_devvp->v_bufobj;
 	/* The caller only wanted the buffer object. */
 	if (a->a_bnp == NULL)
 		return (0);
 	if (a->a_runb)
 		*a->a_runb = 0;
 
 	/*
 	 * UDF_INVALID_BMAP means data embedded into fentry, this is an internal
 	 * error that should not be propagated to calling code.
 	 * Most obvious mapping for this error is EOPNOTSUPP as we can not truly
 	 * translate block numbers in this case.
 	 * Incidentally, this return code will make vnode pager to use VOP_READ
 	 * to get data for mmap-ed pages and udf_read knows how to do the right
 	 * thing for this kind of files.
 	 */
 	error = udf_bmap_internal(node, a->a_bn << node->udfmp->bshift,
 	    &lsector, &max_size);
 	if (error == UDF_INVALID_BMAP)
 		return (EOPNOTSUPP);
 	if (error)
 		return (error);
 
 	/* Translate logical to physical sector number */
 	*a->a_bnp = lsector << (node->udfmp->bshift - DEV_BSHIFT);
 
 	/*
 	 * Determine maximum number of readahead blocks following the
 	 * requested block.
 	 */
 	if (a->a_runp) {
 		nblk = (max_size >> node->udfmp->bshift) - 1;
 		if (nblk <= 0)
 			*a->a_runp = 0;
 		else if (nblk >= (MAXBSIZE >> node->udfmp->bshift))
 			*a->a_runp = (MAXBSIZE >> node->udfmp->bshift) - 1;
 		else
 			*a->a_runp = nblk;
 	}
 
 	if (a->a_runb) {
 		*a->a_runb = 0;
 	}
 
 	return (0);
 }
 
 /*
  * The all powerful VOP_LOOKUP().
  *
  * Scan directory 'a_dvp' for the component named in 'a_cnp'.  On a match
  * the vnode is returned through *a_vpp and, when MAKEENTRY is set,
  * entered into the name cache.  When nothing matches, the result is
  * EROFS for the last component of a CREATE or RENAME and ENOENT
  * otherwise.  Errors from the directory stream or from fetching the
  * vnode are returned as they are.
  */
 static int
 udf_lookup(struct vop_cachedlookup_args *a)
 {
 	struct vnode *dvp;
 	struct vnode *tdp = NULL;
 	struct vnode **vpp = a->a_vpp;
 	struct udf_node *node;
 	struct udf_mnt *udfmp;
 	struct fileid_desc *fid = NULL;
 	struct udf_dirstream *ds;
 	u_long nameiop;
 	u_long flags;
 	char *nameptr;
 	long namelen;
 	ino_t id = 0;
 	int offset, error = 0;
 	int fsize, lkflags, ltype, numdirpasses;
 
 	dvp = a->a_dvp;
 	node = VTON(dvp);
 	udfmp = node->udfmp;
 	nameiop = a->a_cnp->cn_nameiop;
 	flags = a->a_cnp->cn_flags;
 	lkflags = a->a_cnp->cn_lkflags;
 	nameptr = a->a_cnp->cn_nameptr;
 	namelen = a->a_cnp->cn_namelen;
 	fsize = le64toh(node->fentry->inf_len);
 
 	/*
 	 * If this is a LOOKUP and we've already partially searched through
 	 * the directory, pick up where we left off and flag that the
 	 * directory may need to be searched twice.  For a full description,
 	 * see /sys/fs/cd9660/cd9660_lookup.c:cd9660_lookup()
 	 */
 	if (nameiop != LOOKUP || node->diroff == 0 || node->diroff > fsize) {
 		offset = 0;
 		numdirpasses = 1;
 	} else {
 		offset = node->diroff;
 		numdirpasses = 2;
 		nchstats.ncs_2passes++;
 	}
 
 lookloop:
 	ds = udf_opendir(node, offset, fsize, udfmp);
 
 	while ((fid = udf_getfid(ds)) != NULL) {
 
 		/* XXX Should we return an error on a bad fid? */
 		if (udf_checktag(&fid->tag, TAGID_FID)) {
 			printf("udf_lookup: Invalid tag\n");
 			error = EIO;
 			break;
 		}
 
 		/* Is this a deleted file? */
 		if (fid->file_char & UDF_FILE_CHAR_DEL)
 			continue;
 
 		if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) {
 			/* The parent entry only answers a ".." lookup. */
 			if (flags & ISDOTDOT) {
 				id = udf_getid(&fid->icb);
 				break;
 			}
 		} else {
 			if (!(udf_cmpname(&fid->data[fid->l_iu],
 			    nameptr, fid->l_fi, namelen, udfmp))) {
 				id = udf_getid(&fid->icb);
 				break;
 			}
 		}
 	}
 
 	if (!error)
 		error = ds->error;
 
 	/* XXX Bail out here? */
 	if (error) {
 		udf_closedir(ds);
 		return (error);
 	}
 
 	/* Did we have a match? */
 	if (id) {
 		/*
 		 * Remember where this entry was if it's the final
 		 * component.
 		 */
 		if ((flags & ISLASTCN) && nameiop == LOOKUP)
 			node->diroff = ds->offset + ds->off;
 		if (numdirpasses == 2)
 			nchstats.ncs_pass2++;
 		udf_closedir(ds);
 
 		if (flags & ISDOTDOT) {
 			error = vn_vget_ino(dvp, id, lkflags, &tdp);
 		} else if (node->hash_id == id) {
 			VREF(dvp);	/* we want ourself, ie "." */
 			/*
 			 * When we lookup "." we still can be asked to lock it
 			 * differently.
 			 */
 			ltype = lkflags & LK_TYPE_MASK;
 			if (ltype != VOP_ISLOCKED(dvp)) {
 				if (ltype == LK_EXCLUSIVE)
 					vn_lock(dvp, LK_UPGRADE | LK_RETRY);
 				else /* if (ltype == LK_SHARED) */
 					vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 			}
 			tdp = dvp;
 		} else
 			error = udf_vget(udfmp->im_mountp, id, lkflags, &tdp);
 		if (!error) {
 			*vpp = tdp;
 			/* Put this entry in the cache */
 			if (flags & MAKEENTRY)
 				cache_enter(dvp, *vpp, a->a_cnp);
 		}
 	} else {
 		/* Name wasn't found on this pass.  Do another pass? */
 		if (numdirpasses == 2) {
 			numdirpasses--;
 			offset = 0;
 			udf_closedir(ds);
 			goto lookloop;
 		}
 		udf_closedir(ds);
 
 		/* Enter name into cache as non-existent */
 		if (flags & MAKEENTRY)
 			cache_enter(dvp, *vpp, a->a_cnp);
 
 		if ((flags & ISLASTCN) &&
 		    (nameiop == CREATE || nameiop == RENAME)) {
 			error = EROFS;
 		} else {
 			error = ENOENT;
 		}
 	}
 
 	return (error);
 }
 
 /*
  * VOP_RECLAIM: detach and free the UDF node behind a vnode that is
  * being recycled.  Always returns 0.
  */
 static int
 udf_reclaim(struct vop_reclaim_args *a)
 {
 	struct vnode *vp = a->a_vp;
 	struct udf_node *unode = VTON(vp);
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 
 	/* Nothing more to release when no node is attached. */
 	if (unode == NULL)
 		return (0);
 
 	vfs_hash_remove(vp);
 	if (unode->fentry != NULL)
 		free(unode->fentry, M_UDFFENTRY);
 	uma_zfree(udf_zone_node, unode);
 	vp->v_data = NULL;
 
 	return (0);
 }
 
 /*
  * VOP_VPTOFH: fill in the file handle for a vnode.  The handle carries
  * its own length and the node's hash id.  Always returns 0.
  */
 static int
 udf_vptofh(struct vop_vptofh_args *a)
 {
 	struct ifid *handle = (struct ifid *)a->a_fhp;
 	struct udf_node *node = VTON(a->a_vp);
 
 	handle->ifid_len = sizeof(struct ifid);
 	handle->ifid_ino = node->hash_id;
 
 	return (0);
 }
 
 /*
  * Read the block and then set the data pointer to correspond with the
  * offset passed in.  Only read in at most 'size' bytes, and then set 'size'
  * to the number of bytes pointed to.  If 'size' is zero, try to read in a
  * whole extent.
  *
  * When the file data is embedded in the file entry no buffer is read:
  * *data points into the in-core file entry, *size is the number of
  * embedded bytes from 'offset' on (0 at or past the end) and *bp is left
  * untouched.
  *
  * Note that on a read error *bp may or may not have been assigned; the
  * caller has to release it when it is non-NULL.
  *
  */
 static int
 udf_readatoffset(struct udf_node *node, int *size, off_t offset,
     struct buf **bp, uint8_t **data)
 {
 	struct udf_mnt *udfmp = node->udfmp;
 	struct vnode *vp = node->i_vnode;
 	struct file_entry *fentry;
 	struct buf *bp1;
 	uint32_t max_size;
 	daddr_t sector;
 	off_t off;
 	int adj_size;
 	int error;
 
 	/*
 	 * This call is made *not* only to detect UDF_INVALID_BMAP case,
 	 * max_size is used as an ad-hoc read-ahead hint for "normal" case.
 	 */
 	error = udf_bmap_internal(node, offset, &sector, &max_size);
 	if (error == UDF_INVALID_BMAP) {
 		/*
 		 * This error means that the file *data* is stored in the
 		 * allocation descriptor field of the file entry.
 		 */
 		fentry = node->fentry;
 		*data = &fentry->data[le32toh(fentry->l_ea)];
 		*size = le32toh(fentry->l_ad);
 		if (offset >= *size)
 			*size = 0;
 		else {
 			*data += offset;
 			*size -= offset;
 		}
 		return (0);
 	} else if (error != 0) {
 		return (error);
 	}
 
 	/* Adjust the size so that it is within range */
 	if (*size == 0 || *size > max_size)
 		*size = max_size;
 
 	/*
 	 * Because we will read starting at block boundary, we need to adjust
 	 * how much we need to read so that all promised data is in.
 	 * Also, we can't promise to read more than MAXBSIZE bytes starting
 	 * from block boundary, so adjust what we promise too.
 	 */
 	off = blkoff(udfmp, offset);
 	*size = min(*size, MAXBSIZE - off);
 	/* Round the read length up to a whole number of blocks. */
 	adj_size = (*size + off + udfmp->bmask) & ~udfmp->bmask;
 	*bp = NULL;
 	if ((error = bread(vp, lblkno(udfmp, offset), adj_size, NOCRED, bp))) {
 		printf("warning: udf_readlblks returned error %d\n", error);
 		/* note: *bp may be non-NULL */
 		return (error);
 	}
 
 	bp1 = *bp;
 	*data = (uint8_t *)&bp1->b_data[offset & udfmp->bmask];
 	return (0);
 }
 
 /*
  * Translate a file offset into a logical block and then into a physical
  * block.
  *
  * node     - UDF node whose file entry holds the allocation descriptors
  * offset   - byte offset within the file
  * sector   - out: physical sector that contains 'offset'
  * max_size - out: maximum number of bytes that can be read starting from
  *            the given offset, rather than from the beginning of the
  *            calculated sector number
  *
  * Returns 0 on success, UDF_INVALID_BMAP when the file data is embedded in
  * the file entry (in which case *max_size is 0), EINVAL when 'offset' lies
  * beyond the last allocation descriptor, and ENODEV for an unsupported
  * strategy or allocation descriptor type.
  */
 static int
 udf_bmap_internal(struct udf_node *node, off_t offset, daddr_t *sector,
     uint32_t *max_size)
 {
 	struct udf_mnt *udfmp;
 	struct file_entry *fentry;
 	void *icb;
 	struct icb_tag *tag;
 	uint32_t icblen = 0;
 	daddr_t lsector;
 	int ad_offset, ad_num = 0;
 	int i, p_offset;
 
 	udfmp = node->udfmp;
 	fentry = node->fentry;
 	tag = &fentry->icbtag;
 
 	switch (le16toh(tag->strat_type)) {
 	case 4:
 		break;
 
 	case 4096:
 		printf("Cannot deal with strategy4096 yet!\n");
 		return (ENODEV);
 
 	default:
 		/* Report the host-order value, matching what was tested. */
 		printf("Unknown strategy type %d\n",
 		    le16toh(tag->strat_type));
 		return (ENODEV);
 	}
 
 	switch (le16toh(tag->flags) & 0x7) {
 	case 0:
 		/*
 		 * The allocation descriptor field is filled with short_ad's.
 		 * If the offset is beyond the current extent, look for the
 		 * next extent.
 		 */
 		do {
 			offset -= icblen;
 			ad_offset = sizeof(struct short_ad) * ad_num;
 			/*
 			 * The whole descriptor must fit inside the l_ad
 			 * bytes of allocation descriptors; otherwise we
 			 * would read past the end of the descriptor area.
 			 */
 			if (ad_offset + sizeof(struct short_ad) >
 			    le32toh(fentry->l_ad)) {
 				printf("File offset out of bounds\n");
 				return (EINVAL);
 			}
 			icb = GETICB(short_ad, fentry,
 			    le32toh(fentry->l_ea) + ad_offset);
 			icblen = GETICBLEN(short_ad, icb);
 			ad_num++;
 		} while(offset >= icblen);
 
 		lsector = (offset  >> udfmp->bshift) +
 		    le32toh(((struct short_ad *)(icb))->pos);
 
 		*max_size = icblen - offset;
 
 		break;
 	case 1:
 		/*
 		 * The allocation descriptor field is filled with long_ad's
 		 * If the offset is beyond the current extent, look for the
 		 * next extent.
 		 */
 		do {
 			offset -= icblen;
 			ad_offset = sizeof(struct long_ad) * ad_num;
 			/* Same bounds rule as for short_ad's above. */
 			if (ad_offset + sizeof(struct long_ad) >
 			    le32toh(fentry->l_ad)) {
 				printf("File offset out of bounds\n");
 				return (EINVAL);
 			}
 			icb = GETICB(long_ad, fentry,
 			    le32toh(fentry->l_ea) + ad_offset);
 			icblen = GETICBLEN(long_ad, icb);
 			ad_num++;
 		} while(offset >= icblen);
 
 		lsector = (offset >> udfmp->bshift) +
 		    le32toh(((struct long_ad *)(icb))->loc.lb_num);
 
 		*max_size = icblen - offset;
 
 		break;
 	case 3:
 		/*
 		 * This type means that the file *data* is stored in the
 		 * allocation descriptor field of the file entry.
 		 */
 		*max_size = 0;
 		*sector = node->hash_id + udfmp->part_start;
 
 		return (UDF_INVALID_BMAP);
 	case 2:
 		/* DirectCD does not use extended_ad's */
 	default:
 		/* Report the host-order value, matching what was tested. */
 		printf("Unsupported allocation descriptor %d\n",
 		       le16toh(tag->flags) & 0x7);
 		return (ENODEV);
 	}
 
 	*sector = lsector + udfmp->part_start;
 
 	/*
 	 * Check the sparing table.  Each entry represents the beginning of
 	 * a packet.
 	 */
 	if (udfmp->s_table != NULL) {
 		for (i = 0; i< udfmp->s_table_entries; i++) {
 			p_offset =
 			    lsector - le32toh(udfmp->s_table->entries[i].org);
 			/* lsector falls inside this spared packet: remap it. */
 			if ((p_offset < udfmp->p_sectors) && (p_offset >= 0)) {
 				*sector =
 				   le32toh(udfmp->s_table->entries[i].map) +
 				    p_offset;
 				break;
 			}
 		}
 	}
 
 	return (0);
 }
Index: stable/12/sys/ufs/ufs/ufs_vnops.c
===================================================================
--- stable/12/sys/ufs/ufs/ufs_vnops.c	(revision 340720)
+++ stable/12/sys/ufs/ufs/ufs_vnops.c	(revision 340721)
@@ -1,2788 +1,2790 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_vnops.c	8.27 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_quota.h"
 #include "opt_suiddir.h"
 #include "opt_ufs.h"
 #include "opt_ffs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/priv.h>
 #include <sys/refcount.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/lockf.h>
 #include <sys/conf.h>
 #include <sys/acl.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <sys/file.h>		/* XXX */
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <ufs/ufs/acl.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 #ifdef UFS_DIRHASH
 #include <ufs/ufs/dirhash.h>
 #endif
 #ifdef UFS_GJOURNAL
 #include <ufs/ufs/gjournal.h>
 FEATURE(ufs_gjournal, "Journaling support through GEOM for UFS");
 #endif
 
 #ifdef QUOTA
 FEATURE(ufs_quota, "UFS disk quotas support");
 FEATURE(ufs_quota64, "64bit UFS disk quotas support");
 #endif
 
 #ifdef SUIDDIR
 FEATURE(suiddir,
     "Give all new files in directory the same ownership as the directory");
 #endif
 
 
 #include <ufs/ffs/ffs_extern.h>
 
 /*
  * Forward declarations for the vnode operations and internal helpers
  * defined later in this file.
  */
 static vop_accessx_t	ufs_accessx;
 static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *);
 static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *);
 static vop_close_t	ufs_close;
 static vop_create_t	ufs_create;
 static vop_getattr_t	ufs_getattr;
 static vop_ioctl_t	ufs_ioctl;
 static vop_link_t	ufs_link;
 static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *, const char *);
 static vop_markatime_t	ufs_markatime;
 static vop_mkdir_t	ufs_mkdir;
 static vop_mknod_t	ufs_mknod;
 static vop_open_t	ufs_open;
 static vop_pathconf_t	ufs_pathconf;
 static vop_print_t	ufs_print;
 static vop_readlink_t	ufs_readlink;
 static vop_remove_t	ufs_remove;
 static vop_rename_t	ufs_rename;
 static vop_rmdir_t	ufs_rmdir;
 static vop_setattr_t	ufs_setattr;
 static vop_strategy_t	ufs_strategy;
 static vop_symlink_t	ufs_symlink;
 static vop_whiteout_t	ufs_whiteout;
 static vop_close_t	ufsfifo_close;
 static vop_kqfilter_t	ufsfifo_kqfilter;
 
 /* Root of the vfs.ufs sysctl tree. */
 SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
 
 /*
  * A virgin directory (no blushing please).
  *
  * Each template holds two entries, "." and "..".  The first entry is given
  * the value 12 and the second DIRBLKSIZ - 12, so together they account for
  * one DIRBLKSIZ (presumably these are the record lengths -- confirm against
  * struct dirtemplate).  The leading 0 of each entry is presumably the inode
  * number to be filled in by the caller -- confirm.
  */
 static struct dirtemplate mastertemplate = {
 	0, 12, DT_DIR, 1, ".",
 	0, DIRBLKSIZ - 12, DT_DIR, 2, ".."
 };
 /* Same template without the DT_DIR field (struct odirtemplate). */
 static struct odirtemplate omastertemplate = {
 	0, 12, 1, ".",
 	0, DIRBLKSIZ - 12, 2, ".."
 };
 
 /*
  * Apply the pending IN_ACCESS / IN_UPDATE / IN_CHANGE marks on an inode to
  * its access, modification and change timestamps, then clear those marks.
  * The vnode interlock must be held by the caller (asserted below).
  */
 static void
 ufs_itimes_locked(struct vnode *vp)
 {
 	struct inode *ip;
 	struct timespec ts;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 
 	ip = VTOI(vp);
 	/* Read-only: no timestamps are written, only the marks are cleared. */
 	if (UFS_RDONLY(ip))
 		goto out;
 	/* Nothing pending. */
 	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
 		return;
 
 	/*
 	 * Decide how the inode is marked dirty: device nodes without
 	 * soft updates get IN_LAZYMOD; an access-only update while the
 	 * mount is suspended or suspending gets IN_LAZYACCESS; everything
 	 * else gets IN_MODIFIED.
 	 */
 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp))
 		ip->i_flag |= IN_LAZYMOD;
 	else if (((vp->v_mount->mnt_kern_flag &
 		    (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) ||
 		    (ip->i_flag & (IN_CHANGE | IN_UPDATE)))
 		ip->i_flag |= IN_MODIFIED;
 	else if (ip->i_flag & IN_ACCESS)
 		ip->i_flag |= IN_LAZYACCESS;
 	/* One timestamp is shared by all fields updated in this call. */
 	vfs_timestamp(&ts);
 	if (ip->i_flag & IN_ACCESS) {
 		DIP_SET(ip, i_atime, ts.tv_sec);
 		DIP_SET(ip, i_atimensec, ts.tv_nsec);
 	}
 	if (ip->i_flag & IN_UPDATE) {
 		DIP_SET(ip, i_mtime, ts.tv_sec);
 		DIP_SET(ip, i_mtimensec, ts.tv_nsec);
 	}
 	if (ip->i_flag & IN_CHANGE) {
 		DIP_SET(ip, i_ctime, ts.tv_sec);
 		DIP_SET(ip, i_ctimensec, ts.tv_nsec);
 		/* A change also bumps the modification revision counter. */
 		DIP_SET(ip, i_modrev, DIP(ip, i_modrev) + 1);
 	}
 
  out:
 	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
 }
 
 /*
  * Update the inode timestamps for a vnode.  Wrapper that takes the vnode
  * interlock around ufs_itimes_locked().
  */
 void
 ufs_itimes(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Create a regular file
  */
 static int
 ufs_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	int error;
 
 	error =
 	    ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
 	    ap->a_dvp, ap->a_vpp, ap->a_cnp, "ufs_create");
 	if (error != 0)
 		return (error);
 	if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp);
 	return (0);
 }
 
 /*
  * Mknod vnode call.
  *
  * Arguments (struct vop_mknod_args):
  *	a_dvp	directory vnode in which to create the node
  *	a_vpp	in/out: vnode of the new node
  *	a_cnp	name of the new node
  *	a_vap	attributes (type, mode and optional rdev) for the new node
  */
 /* ARGSUSED */
 static int
 ufs_mknod(struct vop_mknod_args *ap)
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vattr *vap = ap->a_vap;
 	struct inode *ip;
 	ino_t ino;
 	int error;
 
 	error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
 	    ap->a_dvp, vpp, ap->a_cnp, "ufs_mknod");
 	if (error != 0)
 		return (error);
 
 	ip = VTOI(*vpp);
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	/*
 	 * Want to be able to use this to make badblock
 	 * inodes, so don't truncate the dev number.
 	 */
 	if (vap->va_rdev != VNOVAL)
 		DIP_SET(ip, i_rdev, vap->va_rdev);
 
 	/*
 	 * Remove inode, then reload it through VFS_VGET so it is
 	 * checked to see if it is an alias of an existing entry in
 	 * the inode cache.  XXX I don't believe this is necessary now.
 	 */
 	(*vpp)->v_type = VNON;
 	/* Save the inode number before vgone() invalidates ip. */
 	ino = ip->i_number;
 	vgone(*vpp);
 	vput(*vpp);
 	error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp);
 	if (error != 0)
 		*vpp = NULL;
 	return (error);
 }
 
 /*
  * Open called.
  *
  * Character and block devices are rejected with EOPNOTSUPP.  A file with
  * the APPEND flag may not be opened for writing without O_APPEND (EPERM).
  * Otherwise the VM object for the vnode is created and 0 is returned.
  */
 /* ARGSUSED */
 static int
 ufs_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip;
 
 	switch (vp->v_type) {
 	case VCHR:
 	case VBLK:
 		return (EOPNOTSUPP);
 	default:
 		break;
 	}
 
 	ip = VTOI(vp);
 	/* Files marked append-only must be opened for appending. */
 	if ((ip->i_flags & APPEND) != 0 &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
 
 	vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
 	return (0);
 }
 
 /*
  * Close called.
  *
  * Update the times on the inode, but only while the vnode still has more
  * than one user.  Always returns 0.
  *
  * Arguments (struct vop_close_args):
  *	a_vp	vnode being closed
  *	a_fflag	file flags
  *	a_cred	caller credentials
  *	a_td	calling thread
  */
 /* ARGSUSED */
 static int
 ufs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1)
 		ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Check whether the credential may access the vnode with the requested
  * access mode.
  *
  * Returns EROFS for modifying access to a directory, symlink or regular
  * file on a read-only mount, EPERM for modifying access to an immutable or
  * snapshot file, and otherwise the result of the ACL-based or mode-based
  * access check.
  */
 static int
 ufs_accessx(ap)
 	struct vop_accessx_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	accmode_t accmode = ap->a_accmode;
 	int error;
 #ifdef UFS_ACL
 	struct acl *acl;
 	acl_type_t type;
 #endif
 
 	/*
 	 * Disallow write attempts on read-only filesystems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
 	if (accmode & VMODIFY_PERMS) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 #ifdef QUOTA
 			/*
 			 * Inode is accounted in the quotas only if struct
 			 * dquot is attached to it. VOP_ACCESS() is called
 			 * from vn_open_cred() and provides a convenient
 			 * point to call getinoquota().  The lock mode is
 			 * exclusive when the file is opening for write.
 			 */
 			if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) {
 				error = getinoquota(ip);
 				if (error != 0)
 					return (error);
 			}
 #endif
 			break;
 		default:
 			break;
 		}
 	}
 
 	/*
 	 * If immutable bit set, nobody gets to write it.  "& ~VADMIN_PERMS"
 	 * permits the owner of the file to remove the IMMUTABLE flag.
 	 */
 	if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) &&
 	    (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT)))
 		return (EPERM);
 
 #ifdef UFS_ACL
 	/* ACLs enabled on this mount: evaluate access through the ACL. */
 	if ((vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) != 0) {
 		if (vp->v_mount->mnt_flag & MNT_NFS4ACLS)
 			type = ACL_TYPE_NFS4;
 		else
 			type = ACL_TYPE_ACCESS;
 
 		acl = acl_alloc(M_WAITOK);
 		if (type == ACL_TYPE_NFS4)
 			error = ufs_getacl_nfs4_internal(vp, acl, ap->a_td);
 		else
 			error = VOP_GETACL(vp, type, acl, ap->a_cred, ap->a_td);
 		switch (error) {
 		case 0:
 			if (type == ACL_TYPE_NFS4) {
 				error = vaccess_acl_nfs4(vp->v_type, ip->i_uid,
 				    ip->i_gid, acl, accmode, ap->a_cred, NULL);
 			} else {
 				error = vfs_unixify_accmode(&accmode);
 				if (error == 0)
 					error = vaccess_acl_posix1e(vp->v_type, ip->i_uid,
 					    ip->i_gid, acl, accmode, ap->a_cred, NULL);
 			}
 			break;
 		default:
 			/* EOPNOTSUPP is expected and not worth logging. */
 			if (error != EOPNOTSUPP)
 				printf(
 "ufs_accessx(): Error retrieving ACL on object (%d).\n",
 				    error);
 			/*
 			 * XXX: Fall back until debugged.  Should
 			 * eventually possibly log an error, and return
 			 * EPERM for safety.
 			 */
 			error = vfs_unixify_accmode(&accmode);
 			if (error == 0)
 				error = vaccess(vp->v_type, ip->i_mode, ip->i_uid,
 				    ip->i_gid, accmode, ap->a_cred, NULL);
 		}
 		acl_free(acl);
 
 		return (error);
 	}
 #endif /* UFS_ACL */
 	/* No ACLs in use: plain mode-bits check. */
 	error = vfs_unixify_accmode(&accmode);
 	if (error == 0)
 		error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 		    accmode, ap->a_cred, NULL);
 	return (error);
 }
 
 /*
  * Fill in a struct vattr from the inode of the given vnode.  Pending
  * timestamp updates are applied first.  The birth time is only filled in
  * on the non-UFS1 branch.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 ufs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct vattr *vap = ap->a_vap;
 
 	/*
 	 * Apply pending timestamp marks and read the access time while
 	 * still holding the vnode interlock.
 	 */
 	VI_LOCK(vp);
 	ufs_itimes_locked(vp);
 	if (I_IS_UFS1(ip)) {
 		vap->va_atime.tv_sec = ip->i_din1->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
 	} else {
 		vap->va_atime.tv_sec = ip->i_din2->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din2->di_atimensec;
 	}
 	VI_UNLOCK(vp);
 	/*
 	 * Copy from inode table
 	 */
 	vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev);
 	vap->va_fileid = ip->i_number;
 	/* Only the permission bits; the file type goes into va_type below. */
 	vap->va_mode = ip->i_mode & ~IFMT;
 	vap->va_nlink = ip->i_effnlink;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
 	if (I_IS_UFS1(ip)) {
 		vap->va_rdev = ip->i_din1->di_rdev;
 		vap->va_size = ip->i_din1->di_size;
 		vap->va_mtime.tv_sec = ip->i_din1->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din1->di_ctime;
 		vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec;
 		vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks);
 		vap->va_filerev = ip->i_din1->di_modrev;
 	} else {
 		vap->va_rdev = ip->i_din2->di_rdev;
 		vap->va_size = ip->i_din2->di_size;
 		vap->va_mtime.tv_sec = ip->i_din2->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din2->di_ctime;
 		vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec;
 		vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime;
 		vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec;
 		vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks);
 		vap->va_filerev = ip->i_din2->di_modrev;
 	}
 	vap->va_flags = ip->i_flags;
 	vap->va_gen = ip->i_gen;
 	vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
 	vap->va_type = IFTOVT(ip->i_mode);
 	return (0);
 }
 
 /*
  * Set attribute vnode op. called from several syscalls
  *
  * Fields of a_vap that are not VNOVAL are applied in this order: file
  * flags, owner/group, size (truncate), timestamps, mode.  The first failing
  * step returns its error; earlier steps are not undone.
  */
 static int
 ufs_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	int error;
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 	if (vap->va_flags != VNOVAL) {
 		/* Reject any flag bit outside the supported set. */
 		if ((vap->va_flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE |
 		    SF_NOUNLINK | SF_SNAPSHOT | UF_APPEND | UF_ARCHIVE |
 		    UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK |
 		    UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE |
 		    UF_SPARSE | UF_SYSTEM)) != 0)
 			return (EOPNOTSUPP);
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * Callers may only modify the file flags on objects they
 		 * have VADMIN rights for.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 			return (error);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
 		 * otherwise, they behave like unprivileged processes.
 		 */
 		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
 			if (ip->i_flags &
 			    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 				error = securelevel_gt(cred, 0);
 				if (error)
 					return (error);
 			}
 			/* The snapshot flag cannot be toggled. */
 			if ((vap->va_flags ^ ip->i_flags) & SF_SNAPSHOT)
 				return (EPERM);
 		} else {
 			if (ip->i_flags &
 			    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
 			    ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE))
 				return (EPERM);
 		}
 		ip->i_flags = vap->va_flags;
 		DIP_SET(ip, i_flags, vap->va_flags);
 		ip->i_flag |= IN_CHANGE;
 		error = UFS_UPDATE(vp, 0);
 		/*
 		 * If the new flags make the file immutable or append-only,
 		 * stop here and report the result of the inode update
 		 * instead of failing with EPERM below.
 		 */
 		if (ip->i_flags & (IMMUTABLE | APPEND))
 			return (error);
 	}
 	/*
 	 * If immutable or append, no one can change any of its attributes
 	 * except the ones already handled (in some cases, file flags
 	 * including the immutability flags themselves for the superuser).
 	 */
 	if (ip->i_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 	/*
 	 * Go through the fields and update iff not VNOVAL.
 	 */
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred,
 		    td)) != 0)
 			return (error);
 	}
 	if (vap->va_size != VNOVAL) {
 		/*
 		 * XXX most of the following special cases should be in
 		 * callers instead of in N filesystems.  The VDIR check
 		 * mostly already is.
 		 */
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			/*
 			 * Truncation should have an effect in these cases.
 			 * Disallow it if the filesystem is read-only or
 			 * the file is being snapshotted.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			if ((ip->i_flags & SF_SNAPSHOT) != 0)
 				return (EPERM);
 			break;
 		default:
 			/*
 			 * According to POSIX, the result is unspecified
 			 * for file types other than regular files,
 			 * directories and shared memory objects.  We
 			 * don't support shared memory objects in the file
 			 * system, and have dubious support for truncating
 			 * symlinks.  Just ignore the request in other cases.
 			 */
 			return (0);
 		}
 		/* Truncate synchronously when the caller set VA_SYNC. */
 		if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL |
 		    ((vap->va_vaflags & VA_SYNC) != 0 ? IO_SYNC : 0),
 		    cred)) != 0)
 			return (error);
 	}
 	if (vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL ||
 	    vap->va_birthtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((ip->i_flags & SF_SNAPSHOT) != 0)
 			return (EPERM);
 		error = vn_utimes_perm(vp, vap, cred, td);
 		if (error != 0)
 			return (error);
 		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
 		/*
 		 * An explicitly supplied time replaces any pending mark for
 		 * that timestamp, so the pending mark is cleared.
 		 */
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_ACCESS;
 			DIP_SET(ip, i_atime, vap->va_atime.tv_sec);
 			DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec);
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_UPDATE;
 			DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec);
 			DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec);
 		}
 		/* The birth time is only stored for UFS2 inodes. */
 		if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) {
 			ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec;
 			ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec;
 		}
 		error = UFS_UPDATE(vp, 0);
 		if (error)
 			return (error);
 	}
 	error = 0;
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/* Snapshot files may not gain write or execute permission. */
 		if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode &
 		   (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH)))
 			return (EPERM);
 		error = ufs_chmod(vp, (int)vap->va_mode, cred, td);
 	}
 	return (error);
 }
 
 #ifdef UFS_ACL
 /*
  * Bring the NFSv4 ACL of a vnode in line with a new file mode: fetch the
  * current ACL, synchronise it from 'mode' and 'file_owner_id', and store
  * it back.  Returns 0 or the error from fetching or storing the ACL.
  */
 static int
 ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode,
     int file_owner_id, struct ucred *cred, struct thread *td)
 {
 	struct acl *acl;
 	int error;
 
 	acl = acl_alloc(M_WAITOK);
 	/*
 	 * We don't have to handle EOPNOTSUPP here, as the filesystem claims
 	 * it supports ACLs.
 	 */
 	error = ufs_getacl_nfs4_internal(vp, acl, td);
 	if (error == 0) {
 		acl_nfs4_sync_acl_from_mode(acl, mode, file_owner_id);
 		error = ufs_setacl_nfs4_internal(vp, acl, td);
 	}
 	acl_free(acl);
 	return (error);
 }
 #endif /* UFS_ACL */
 
 /*
  * Mark this file's access time for update for vfs_mark_atime().  This
  * is called from execve() and mmap().
  *
  * Only the IN_ACCESS mark is set, under the vnode interlock; the inode is
  * not written here.  Always returns 0.
  */
 static int
 ufs_markatime(struct vop_markatime_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 
 	VI_LOCK(vp);
 	ip->i_flag |= IN_ACCESS;
 	VI_UNLOCK(vp);
 
 	/* XXXKIB No UFS_UPDATE(ap->a_vp, 0) there. */
 	return (0);
 }
 
 /*
  * Change the mode on a file.
  * Inode must be locked before calling.
  */
 static int
 ufs_chmod(vp, mode, cred, td)
 	struct vnode *vp;
 	int mode;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *ip = VTOI(vp);
 	int error;
 
 	/*
 	 * To modify the permissions on a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred, td)))
 		return (error);
 	/*
 	 * Privileged processes may set the sticky bit on non-directories,
 	 * as well as set the setgid bit on a file with a group that the
 	 * process is not a member of.  Both of these are allowed in
 	 * jail(8).
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
 		if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0))
 			return (EFTYPE);
 	}
 	if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) {
 		error = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Deny setting setuid if we are not the file owner.
 	 */
 	if ((mode & ISUID) && ip->i_uid != cred->cr_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0);
 		if (error)
 			return (error);
 	}
 
 	ip->i_mode &= ~ALLPERMS;
 	ip->i_mode |= (mode & ALLPERMS);
 	DIP_SET(ip, i_mode, ip->i_mode);
 	ip->i_flag |= IN_CHANGE;
 #ifdef UFS_ACL
 	if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0)
 		error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, td);
 #endif
 	if (error == 0 && (ip->i_flag & IN_CHANGE) != 0)
 		error = UFS_UPDATE(vp, 0);
 
 	return (error);
 }
 
 /*
  * Perform chown operation on inode ip;
  * inode must be locked prior to call.
  *
  * A uid or gid of VNOVAL means "leave unchanged".  With QUOTA, the usage
  * charged to the old ids is released and re-charged to the new ids; if the
  * new charge fails, the old ids and their charges are restored and the
  * error is returned.
  */
 static int
 ufs_chown(vp, uid, gid, cred, td)
 	struct vnode *vp;
 	uid_t uid;
 	gid_t gid;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *ip = VTOI(vp);
 	uid_t ouid;
 	gid_t ogid;
 	int error = 0;
 #ifdef QUOTA
 	int i;
 	ufs2_daddr_t change;
 #endif
 
 	if (uid == (uid_t)VNOVAL)
 		uid = ip->i_uid;
 	if (gid == (gid_t)VNOVAL)
 		gid = ip->i_gid;
 	/*
 	 * To modify the ownership of a file, the caller must pass the
 	 * VWRITE_OWNER access check for that file.
 	 */
 	if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td)))
 		return (error);
 	/*
 	 * To change the owner of a file, or change the group of a file to a
 	 * group of which we are not a member, the caller must have
 	 * privilege.
 	 */
 	if (((uid != ip->i_uid && uid != cred->cr_uid) || 
 	    (gid != ip->i_gid && !groupmember(gid, cred))) &&
 	    (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0)))
 		return (error);
 	ogid = ip->i_gid;
 	ouid = ip->i_uid;
 #ifdef QUOTA
 	if ((error = getinoquota(ip)) != 0)
 		return (error);
 	/*
 	 * Drop the quota reference for an id that is not changing, so that
 	 * the chkdq()/chkiq() calls below presumably leave that quota
 	 * untouched -- NOTE(review): confirm against chkdq()/chkiq().
 	 */
 	if (ouid == uid) {
 		dqrele(vp, ip->i_dquot[USRQUOTA]);
 		ip->i_dquot[USRQUOTA] = NODQUOT;
 	}
 	if (ogid == gid) {
 		dqrele(vp, ip->i_dquot[GRPQUOTA]);
 		ip->i_dquot[GRPQUOTA] = NODQUOT;
 	}
 	/* Release the blocks and the inode charged to the old ids. */
 	change = DIP(ip, i_blocks);
 	(void) chkdq(ip, -change, cred, CHOWN);
 	(void) chkiq(ip, -1, cred, CHOWN);
 	for (i = 0; i < MAXQUOTAS; i++) {
 		dqrele(vp, ip->i_dquot[i]);
 		ip->i_dquot[i] = NODQUOT;
 	}
 #endif
 	ip->i_gid = gid;
 	DIP_SET(ip, i_gid, gid);
 	ip->i_uid = uid;
 	DIP_SET(ip, i_uid, uid);
 #ifdef QUOTA
 	/* Charge the blocks and the inode to the new ids. */
 	if ((error = getinoquota(ip)) == 0) {
 		if (ouid == uid) {
 			dqrele(vp, ip->i_dquot[USRQUOTA]);
 			ip->i_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
 			dqrele(vp, ip->i_dquot[GRPQUOTA]);
 			ip->i_dquot[GRPQUOTA] = NODQUOT;
 		}
 		if ((error = chkdq(ip, change, cred, CHOWN)) == 0) {
 			if ((error = chkiq(ip, 1, cred, CHOWN)) == 0)
 				goto good;
 			else
 				(void) chkdq(ip, -change, cred, CHOWN|FORCE);
 		}
 		for (i = 0; i < MAXQUOTAS; i++) {
 			dqrele(vp, ip->i_dquot[i]);
 			ip->i_dquot[i] = NODQUOT;
 		}
 	}
 	/*
 	 * Charging the new ids failed: restore the old ids and force their
 	 * charges back, then return the error.
 	 */
 	ip->i_gid = ogid;
 	DIP_SET(ip, i_gid, ogid);
 	ip->i_uid = ouid;
 	DIP_SET(ip, i_uid, ouid);
 	if (getinoquota(ip) == 0) {
 		if (ouid == uid) {
 			dqrele(vp, ip->i_dquot[USRQUOTA]);
 			ip->i_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
 			dqrele(vp, ip->i_dquot[GRPQUOTA]);
 			ip->i_dquot[GRPQUOTA] = NODQUOT;
 		}
 		(void) chkdq(ip, change, cred, FORCE|CHOWN);
 		(void) chkiq(ip, 1, cred, FORCE|CHOWN);
 		(void) getinoquota(ip);
 	}
 	return (error);
 good:
 	if (getinoquota(ip))
 		panic("ufs_chown: lost quota");
 #endif /* QUOTA */
 	ip->i_flag |= IN_CHANGE;
 	/*
 	 * When ownership actually changed, clear the setuid and setgid bits
 	 * unless the caller holds PRIV_VFS_RETAINSUGID.
 	 */
 	if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) {
 		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) {
 			ip->i_mode &= ~(ISUID | ISGID);
 			DIP_SET(ip, i_mode, ip->i_mode);
 		}
 	}
 	error = UFS_UPDATE(vp, 0);
 	return (error);
 }
 
 /*
  * Remove the directory entry for a file.
  *
  * Arguments (struct vop_remove_args):
  *	a_dvp	directory vnode holding the entry
  *	a_vp	vnode of the file being removed
  *	a_cnp	name of the entry
  *
  * Returns EPERM when the file is marked no-unlink, immutable or
  * append-only, or when the directory is append-only; otherwise the result
  * of ufs_dirremove().
  */
 static int
 ufs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct thread *td = curthread;
 	int error;
 
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) != 0 ||
 	    (VTOI(dvp)->i_flags & APPEND) != 0)
 		return (EPERM);
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
 	error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
 	if (ip->i_nlink <= 0)
 		vp->v_vflag |= VV_NOSYNC;
 	if ((ip->i_flags & SF_SNAPSHOT) != 0) {
 		/*
 		 * Avoid deadlock where another thread is trying to
 		 * update the inodeblock for dvp and is waiting on
 		 * snaplk.  Temporary unlock the vnode lock for the
 		 * unlinked file and sync the directory.  This should
 		 * allow vput() of the directory to not block later on
 		 * while holding the snapshot vnode locked, assuming
 		 * that the directory hasn't been unlinked too.
 		 */
 		VOP_UNLOCK(vp, 0);
 		(void) VOP_FSYNC(dvp, MNT_WAIT, td);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 	return (error);
 }
 
 static void
 print_bad_link_count(const char *funcname, struct vnode *dvp)
 {
 	struct inode *dip;
 
 	dip = VTOI(dvp);
 	uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n",
 	    funcname, dip->i_effnlink, (intmax_t)dip->i_number,
 	    dvp->v_mount->mnt_stat.f_mntonname);
 }
 
 /*
  * link vnode call
  *
  * Adds a new directory entry in a_tdvp, named by a_cnp, that refers to the
  * existing file a_vp.  The link count is raised and written out before the
  * entry is created, and lowered again if anything fails.
  *
  * Returns EINVAL when the target directory has a bad link count, EMLINK
  * when the file already has UFS_LINK_MAX links, ENOENT when the file has
  * been removed, EPERM when it is immutable or append-only, or the error
  * from the inode update or directory entry creation.
  */
 static int
 ufs_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip;
 	struct direct newdir;
 	int error;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_link: no name");
 #endif
 	/* A directory with fewer than two links is reported and refused. */
 	if (VTOI(tdvp)->i_effnlink < 2) {
 		print_bad_link_count("ufs_link", tdvp);
 		error = EINVAL;
 		goto out;
 	}
 	ip = VTOI(vp);
 	if (ip->i_nlink >= UFS_LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	/*
 	 * The file may have been removed after namei dropped the original
 	 * lock.
 	 */
 	if (ip->i_effnlink == 0) {
 		error = ENOENT;
 		goto out;
 	}
 	if (ip->i_flags & (IMMUTABLE | APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 	/* Raise the link count first, then create the directory entry. */
 	ip->i_effnlink++;
 	ip->i_nlink++;
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(vp))
 		softdep_setup_link(VTOI(tdvp), ip);
 	/* Wait for the update unless soft updates or async is in use. */
 	error = UFS_UPDATE(vp, !DOINGSOFTDEP(vp) && !DOINGASYNC(vp));
 	if (!error) {
 		ufs_makedirentry(ip, cnp, &newdir);
 		error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0);
 	}
 
 	/* On failure, undo the link count increase. */
 	if (error) {
 		ip->i_effnlink--;
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(vp))
 			softdep_revert_link(VTOI(tdvp), ip);
 	}
 out:
 	return (error);
 }
 
 /*
  * whiteout vnode call
  */
 static int
 ufs_whiteout(ap)
 	struct vop_whiteout_args /* {
 		struct vnode *a_dvp;
 		struct componentname *a_cnp;
 		int a_flags;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct direct newdir;
 	int error = 0;
 
 	switch (ap->a_flags) {
 	case LOOKUP:
 		/* 4.4 format directories support whiteout operations */
 		if (dvp->v_mount->mnt_maxsymlinklen > 0)
 			return (0);
 		return (EOPNOTSUPP);
 
 	case CREATE:
 		/* create a new directory whiteout */
 #ifdef INVARIANTS
 		if ((cnp->cn_flags & SAVENAME) == 0)
 			panic("ufs_whiteout: missing name");
 		if (dvp->v_mount->mnt_maxsymlinklen <= 0)
 			panic("ufs_whiteout: old format filesystem");
 #endif
 
 		newdir.d_ino = UFS_WINO;
 		newdir.d_namlen = cnp->cn_namelen;
 		bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1);
 		newdir.d_type = DT_WHT;
 		error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0);
 		break;
 
 	case DELETE:
 		/* remove an existing directory whiteout */
 #ifdef INVARIANTS
 		if (dvp->v_mount->mnt_maxsymlinklen <= 0)
 			panic("ufs_whiteout: old format filesystem");
 #endif
 
 		cnp->cn_flags &= ~DOWHITEOUT;
 		error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0);
 		break;
 	default:
 		panic("ufs_whiteout: unknown op");
 	}
 	return (error);
 }
 
 /*
  * Counter incremented by ufs_rename() each time it has to drop its locks
  * and jump back to its relock label.  Exported read-only (CTLFLAG_RD)
  * under the _vfs_ufs sysctl node; __DEVOLATILE strips the volatile
  * qualifier so the address can be handed to SYSCTL_INT.
  */
 static volatile int rename_restarts;
 SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD,
     __DEVOLATILE(int *, &rename_restarts), 0,
     "Times rename had to restart due to lock contention");
 
 /*
  * Rename system call.
  * 	rename("foo", "bar");
  * is essentially
  *	unlink("bar");
  *	link("foo", "bar");
  *	unlink("foo");
  * but ``atomically''.  Can't do full commit without saving state in the
  * inode on disk which isn't feasible at this time.  Best we can do is
  * always guarantee the target exists.
  *
  * Basic algorithm is:
  *
  * 1) Bump link count on source while we're linking it to the
  *    target.  This also ensure the inode won't be deleted out
  *    from underneath us while we work (it may be truncated by
  *    a concurrent `trunc' or `open' for creation).
  * 2) Link source to destination.  If destination already exists,
  *    delete it first.
  * 3) Unlink source reference to inode if still around. If a
  *    directory was moved and the parent of the destination
  *    is different from the source, patch the ".." entry in the
  *    directory.
  */
 static int
 ufs_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *nvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct thread *td = fcnp->cn_thread;
 	struct inode *fip, *tip, *tdp, *fdp;
 	struct direct newdir;
 	off_t endoff;
 	int doingdirectory, newparent;
 	int error = 0;
 	struct mount *mp;
 	ino_t ino;
 
 #ifdef INVARIANTS
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("ufs_rename: no name");
 #endif
 	endoff = 0;
 	mp = tdvp->v_mount;
 	VOP_UNLOCK(tdvp, 0);
 	if (tvp && tvp != tdvp)
 		VOP_UNLOCK(tvp, 0);
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		mp = NULL;
 		goto releout;
 	}
 relock:
 	/* 
 	 * We need to acquire 2 to 4 locks depending on whether tvp is NULL
 	 * and fdvp and tdvp are the same directory.  Subsequently we need
 	 * to double-check all paths and in the directory rename case we
 	 * need to verify that we are not creating a directory loop.  To
 	 * handle this we acquire all but fdvp using non-blocking
 	 * acquisitions.  If we fail to acquire any lock in the path we will
 	 * drop all held locks, acquire the new lock in a blocking fashion,
 	 * and then release it and restart the rename.  This acquire/release
 	 * step ensures that we do not spin on a lock waiting for release.
 	 */
 	error = vn_lock(fdvp, LK_EXCLUSIVE);
 	if (error)
 		goto releout;
 	if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 		VOP_UNLOCK(fdvp, 0);
 		error = vn_lock(tdvp, LK_EXCLUSIVE);
 		if (error)
 			goto releout;
 		VOP_UNLOCK(tdvp, 0);
 		atomic_add_int(&rename_restarts, 1);
 		goto relock;
 	}
 	/*
 	 * Re-resolve fvp to be certain it still exists and fetch the
 	 * correct vnode.
 	 */
 	error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
 	if (error) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		goto releout;
 	}
 	error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
 	if (error) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		if (error != EBUSY)
 			goto releout;
 		error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
 		if (error != 0)
 			goto releout;
 		VOP_UNLOCK(nvp, 0);
 		vrele(fvp);
 		fvp = nvp;
 		atomic_add_int(&rename_restarts, 1);
 		goto relock;
 	}
 	vrele(fvp);
 	fvp = nvp;
 	/*
 	 * Re-resolve tvp and acquire the vnode lock if present.
 	 */
 	error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino);
 	if (error != 0 && error != EJUSTRETURN) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		VOP_UNLOCK(fvp, 0);
 		goto releout;
 	}
 	/*
 	 * If tvp disappeared we just carry on.
 	 */
 	if (error == EJUSTRETURN && tvp != NULL) {
 		vrele(tvp);
 		tvp = NULL;
 	}
 	/*
 	 * Get the tvp ino if the lookup succeeded.  We may have to restart
 	 * if the non-blocking acquire fails.
 	 */
 	if (error == 0) {
 		nvp = NULL;
 		error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
 		if (tvp)
 			vrele(tvp);
 		tvp = nvp;
 		if (error) {
 			VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(tdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			if (error != EBUSY)
 				goto releout;
 			error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
 			if (error != 0)
 				goto releout;
 			vput(nvp);
 			atomic_add_int(&rename_restarts, 1);
 			goto relock;
 		}
 	}
 	fdp = VTOI(fdvp);
 	fip = VTOI(fvp);
 	tdp = VTOI(tdvp);
 	tip = NULL;
 	if (tvp)
 		tip = VTOI(tvp);
 	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(tdvp)->i_flags & APPEND))) {
 		error = EPERM;
 		goto unlockout;
 	}
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
 	 * not call us in that case.  However, things could change after
 	 * we drop the locks above.
 	 */
 	if (fvp == tvp) {
 		error = 0;
 		goto unlockout;
 	}
 	doingdirectory = 0;
 	newparent = 0;
 	ino = fip->i_number;
 	if (fip->i_nlink >= UFS_LINK_MAX) {
 		error = EMLINK;
 		goto unlockout;
 	}
 	if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
 	    || (fdp->i_flags & APPEND)) {
 		error = EPERM;
 		goto unlockout;
 	}
 	if ((fip->i_mode & IFMT) == IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    fdp == fip ||
 		    (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
 			error = EINVAL;
 			goto unlockout;
 		}
 		if (fdp->i_number != tdp->i_number)
 			newparent = tdp->i_number;
 		doingdirectory = 1;
 	}
 	if ((fvp->v_type == VDIR && fvp->v_mountedhere != NULL) ||
 	    (tvp != NULL && tvp->v_type == VDIR &&
 	    tvp->v_mountedhere != NULL)) {
 		error = EXDEV;
 		goto unlockout;
 	}
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..".
 	 */
 	if (doingdirectory && newparent) {
 		error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 		if (error)
 			goto unlockout;
 		error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred,
 		    &ino);
 		/*
 		 * We encountered a lock that we have to wait for.  Unlock
 		 * everything else and VGET before restarting.
 		 */
 		if (ino) {
 			VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			VOP_UNLOCK(tdvp, 0);
 			if (tvp)
 				VOP_UNLOCK(tvp, 0);
 			error = VFS_VGET(mp, ino, LK_SHARED, &nvp);
 			if (error == 0)
 				vput(nvp);
 			atomic_add_int(&rename_restarts, 1);
 			goto relock;
 		}
 		if (error)
 			goto unlockout;
 		if ((tcnp->cn_flags & SAVESTART) == 0)
 			panic("ufs_rename: lost to startdir");
 	}
 	if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 ||
 	    tdp->i_effnlink == 0)
 		panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp);
 
 	/*
 	 * 1) Bump link count while we're moving stuff
 	 *    around.  If we crash somewhere before
 	 *    completing our work, the link count
 	 *    may be wrong, but correctable.
 	 */
 	fip->i_effnlink++;
 	fip->i_nlink++;
 	DIP_SET(fip, i_nlink, fip->i_nlink);
 	fip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(fvp))
 		softdep_setup_link(tdp, fip);
 	error = UFS_UPDATE(fvp, !DOINGSOFTDEP(fvp) && !DOINGASYNC(fvp));
 	if (error)
 		goto bad;
 
 	/*
 	 * 2) If target doesn't exist, link the target
 	 *    to the source and unlink the source.
 	 *    Otherwise, rewrite the target directory
 	 *    entry to reference the source inode and
 	 *    expunge the original entry's existence.
 	 */
 	if (tip == NULL) {
 		if (ITODEV(tdp) != ITODEV(fip))
 			panic("ufs_rename: EXDEV");
 		if (doingdirectory && newparent) {
 			/*
 			 * Account for ".." in new directory.
 			 * When source and destination have the same
 			 * parent we don't adjust the link count.  The
 			 * actual link modification is completed when
 			 * .. is rewritten below.
 			 */
 			if (tdp->i_nlink >= UFS_LINK_MAX) {
 				error = EMLINK;
 				goto bad;
 			}
 		}
 		ufs_makedirentry(fip, tcnp, &newdir);
 		error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1);
 		if (error)
 			goto bad;
 		/* Setup tdvp for directory compaction if needed. */
 		if (tdp->i_count && tdp->i_endoff &&
 		    tdp->i_endoff < tdp->i_size)
 			endoff = tdp->i_endoff;
 	} else {
 		if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip))
 			panic("ufs_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
 		 */
 		if (tip->i_number == fip->i_number)
 			panic("ufs_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the caller
 		 * must possess VADMIN for the parent directory, or the
 		 * destination of the rename.  This implements append-only
 		 * directories.
 		 */
 		if ((tdp->i_mode & S_ISTXT) &&
 		    VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) &&
 		    VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) {
 			error = EPERM;
 			goto bad;
 		}
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if ((tip->i_mode & IFMT) == IFDIR) {
 			if ((tip->i_effnlink > 2) ||
 			    !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		if (doingdirectory) {
 			if (!newparent) {
 				tdp->i_effnlink--;
 				if (DOINGSOFTDEP(tdvp))
 					softdep_change_linkcnt(tdp);
 			}
 			tip->i_effnlink--;
 			if (DOINGSOFTDEP(tvp))
 				softdep_change_linkcnt(tip);
 		}
 		error = ufs_dirrewrite(tdp, tip, fip->i_number,
 		    IFTODT(fip->i_mode),
 		    (doingdirectory && newparent) ? newparent : doingdirectory);
 		if (error) {
 			if (doingdirectory) {
 				if (!newparent) {
 					tdp->i_effnlink++;
 					if (DOINGSOFTDEP(tdvp))
 						softdep_change_linkcnt(tdp);
 				}
 				tip->i_effnlink++;
 				if (DOINGSOFTDEP(tvp))
 					softdep_change_linkcnt(tip);
 			}
 		}
 		if (doingdirectory && !DOINGSOFTDEP(tvp)) {
 			/*
 			 * The only stuff left in the directory is "."
 			 * and "..". The "." reference is inconsequential
 			 * since we are quashing it. We have removed the "."
 			 * reference and the reference in the parent directory,
 			 * but there may be other hard links. The soft
 			 * dependency code will arrange to do these operations
 			 * after the parent directory entry has been deleted on
 			 * disk, so when running with that code we avoid doing
 			 * them now.
 			 */
 			if (!newparent) {
 				tdp->i_nlink--;
 				DIP_SET(tdp, i_nlink, tdp->i_nlink);
 				tdp->i_flag |= IN_CHANGE;
 			}
 			tip->i_nlink--;
 			DIP_SET(tip, i_nlink, tip->i_nlink);
 			tip->i_flag |= IN_CHANGE;
 		}
 	}
 
 	/*
 	 * 3) Unlink the source.  We have to resolve the path again to
 	 * fixup the directory offset and count for ufs_dirremove.
 	 */
 	if (fdvp == tdvp) {
 		error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
 		if (error)
 			panic("ufs_rename: from entry went away!");
 		if (ino != fip->i_number)
 			panic("ufs_rename: ino mismatch %ju != %ju\n",
 			    (uintmax_t)ino, (uintmax_t)fip->i_number);
 	}
 	/*
 	 * If the source is a directory with a
 	 * new parent, the link count of the old
 	 * parent directory must be decremented
 	 * and ".." set to point to the new parent.
 	 */
 	if (doingdirectory && newparent) {
 		/*
 		 * If tip exists we simply use its link, otherwise we must
 		 * add a new one.
 		 */
 		if (tip == NULL) {
 			tdp->i_effnlink++;
 			tdp->i_nlink++;
 			DIP_SET(tdp, i_nlink, tdp->i_nlink);
 			tdp->i_flag |= IN_CHANGE;
 			if (DOINGSOFTDEP(tdvp))
 				softdep_setup_dotdot_link(tdp, fip);
 			error = UFS_UPDATE(tdvp, !DOINGSOFTDEP(tdvp) &&
 			    !DOINGASYNC(tdvp));
 			/* Don't go to bad here as the new link exists. */
 			if (error)
 				goto unlockout;
 		} else if (DOINGSUJ(tdvp))
 			/* Journal must account for each new link. */
 			softdep_setup_dotdot_link(tdp, fip);
 		fip->i_offset = mastertemplate.dot_reclen;
 		ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0);
 		cache_purge(fdvp);
 	}
 	error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0);
 	/*
 	 * The kern_renameat() looks up the fvp using the DELETE flag, which
 	 * causes the removal of the name cache entry for fvp.
 	 * As the relookup of the fvp is done in two steps:
 	 * ufs_lookup_ino() and then VFS_VGET(), another thread might do a
 	 * normal lookup of the from name just before the VFS_VGET() call,
 	 * causing the cache entry to be re-instantiated.
 	 *
 	 * The same issue also applies to tvp if it exists as
 	 * otherwise we may have a stale name cache entry for the new
 	 * name that references the old i-node if it has other links
 	 * or open file descriptors.
 	 */
 	cache_purge(fvp);
 	if (tvp)
 		cache_purge(tvp);
 	cache_purge_negative(tdvp);
 
 unlockout:
 	vput(fdvp);
 	vput(fvp);
 	if (tvp)
 		vput(tvp);
 	/*
 	 * If compaction or fsync was requested do it now that other locks
 	 * are no longer needed.
 	 */
 	if (error == 0 && endoff != 0) {
 		error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL |
 		    (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred);
 		if (error != 0)
 			vn_printf(tdvp,
 			    "ufs_rename: failed to truncate, error %d\n",
 			    error);
 #ifdef UFS_DIRHASH
 		else if (tdp->i_dirhash != NULL)
 			ufsdirhash_dirtrunc(tdp, endoff);
 #endif
 		/*
 		 * Even if the directory compaction failed, rename was
 		 * succesful.  Do not propagate a UFS_TRUNCATE() error
 		 * to the caller.
 		 */
 		error = 0;
 	}
 	if (error == 0 && tdp->i_flag & IN_NEEDSYNC)
 		error = VOP_FSYNC(tdvp, MNT_WAIT, td);
 	vput(tdvp);
 	return (error);
 
 bad:
 	fip->i_effnlink--;
 	fip->i_nlink--;
 	DIP_SET(fip, i_nlink, fip->i_nlink);
 	fip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(fvp))
 		softdep_revert_link(tdp, fip);
 	goto unlockout;
 
 releout:
 	vrele(fdvp);
 	vrele(fvp);
 	vrele(tdvp);
 	if (tvp)
 		vrele(tvp);
 
 	return (error);
 }
 
 #ifdef UFS_ACL
 /*
  * Set up the mode and POSIX.1e ACLs of a newly created directory tvp from
  * the default ACL of its parent dvp.
  *
  * When the parent has a non-empty default ACL, the mode is merged with it
  * via acl_posix1e_newfilemode(), and the result is installed on tvp as the
  * access ACL while an unmodified copy becomes tvp's default ACL.  When the
  * parent has no usable default ACL (empty, or VOP_GETACL() returned
  * EOPNOTSUPP) the mode is applied as given and 0 is returned.  Any other
  * VOP_GETACL() or VOP_SETACL() error is returned unchanged.
  */
 static int
 ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp,
     mode_t dmode, struct ucred *cred, struct thread *td)
 {
 	struct inode *ip = VTOI(tvp);
 	struct acl *access_acl, *default_acl;
 	int error;
 
 	access_acl = acl_alloc(M_WAITOK);
 	default_acl = acl_alloc(M_WAITOK);
 
 	/* Retrieve default ACL from parent, if any. */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, access_acl, cred, td);
 	if (error == 0 && access_acl->acl_cnt != 0) {
 		/* Got a non-empty default ACL: merge it with the mode. */
 		dmode = acl_posix1e_newfilemode(dmode, access_acl);
 		ip->i_mode = dmode;
 		DIP_SET(ip, i_mode, dmode);
 		/* Snapshot the ACL before it is synced with the inode. */
 		*default_acl = *access_acl;
 		ufs_sync_acl_from_inode(ip, access_acl);
 
 		/*
 		 * XXX: If we abort now, will Soft Updates notify the extattr
 		 * code that the EAs for the file need to be released?
 		 */
 		error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, access_acl, cred, td);
 		if (error == 0) {
 			error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, default_acl,
 			    cred, td);
 		}
 		if (error == EOPNOTSUPP) {
 			/*
 			 * XXX: This should not happen, as an EOPNOTSUPP
 			 * from VOP_GETACL() would have been caught first.
 			 * The error is still returned to the caller.
 			 */
 			printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n");
 		}
 	} else if (error == 0 || error == EOPNOTSUPP) {
 		/* No default ACL defined or available: use the mode as-is. */
 		ip->i_mode = dmode;
 		DIP_SET(ip, i_mode, dmode);
 		error = 0;
 	}
 
 	acl_free(access_acl);
 	acl_free(default_acl);
 
 	return (error);
 }
 
 /*
  * Set up the mode and POSIX.1e access ACL of a newly created file tvp
  * from the default ACL of its parent directory dvp.
  *
  * A non-empty default ACL is merged with the mode through
  * acl_posix1e_newfilemode() and installed as tvp's access ACL.  If the
  * default ACL is empty, or VOP_GETACL() reports EOPNOTSUPP, the mode is
  * applied as given and 0 is returned.  Any other VOP_GETACL() or
  * VOP_SETACL() error is returned unchanged.
  */
 static int
 ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp,
     mode_t mode, struct ucred *cred, struct thread *td)
 {
 	struct inode *ip = VTOI(tvp);
 	struct acl *inherited;
 	int error;
 
 	inherited = acl_alloc(M_WAITOK);
 
 	/* Retrieve default ACL for parent, if any. */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, inherited, cred, td);
 	if (error == 0 && inherited->acl_cnt != 0) {
 		/* Got a non-empty default ACL: merge it with the mode. */
 		mode = acl_posix1e_newfilemode(mode, inherited);
 		ip->i_mode = mode;
 		DIP_SET(ip, i_mode, mode);
 		ufs_sync_acl_from_inode(ip, inherited);
 
 		/*
 		 * XXX: If we abort now, will Soft Updates notify the extattr
 		 * code that the EAs for the file need to be released?
 		 */
 		error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, inherited, cred, td);
 		if (error == EOPNOTSUPP) {
 			/*
 			 * XXX: This should not happen, as an EOPNOTSUPP
 			 * from VOP_GETACL() would have been caught first.
 			 * The error is still returned to the caller.
 			 */
 			printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() "
 			    "but no VOP_SETACL()\n");
 		}
 	} else if (error == 0 || error == EOPNOTSUPP) {
 		/*
 		 * The default ACL is either blank or not supported at all:
 		 * just use the mode as-is.
 		 */
 		ip->i_mode = mode;
 		DIP_SET(ip, i_mode, mode);
 		error = 0;
 	}
 
 	acl_free(inherited);
 
 	return (error);
 }
 
 /*
  * Compute an NFSv4 ACL for the new vnode tvp from the ACL of its parent
  * dvp and install it.
  *
  * The parent's ACL is fetched with ufs_getacl_nfs4_internal(), the child
  * ACL is derived by acl_nfs4_compute_inherited_acl() from child_mode, the
  * child's owner uid and whether tvp is a directory, and the result is
  * stored with ufs_setacl_nfs4_internal().  Returns 0 or the error from
  * the get/set step.  The cred argument is not used here.
  */
 static int
 ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp,
     mode_t child_mode, struct ucred *cred, struct thread *td)
 {
 	struct acl *parent_aclp, *child_aclp;
 	int error;
 
 	parent_aclp = acl_alloc(M_WAITOK);
 	child_aclp = acl_alloc(M_WAITOK | M_ZERO);
 
 	error = ufs_getacl_nfs4_internal(dvp, parent_aclp, td);
 	if (error == 0) {
 		acl_nfs4_compute_inherited_acl(parent_aclp, child_aclp,
 		    child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR);
 		error = ufs_setacl_nfs4_internal(tvp, child_aclp, td);
 	}
 
 	acl_free(parent_aclp);
 	acl_free(child_aclp);
 
 	return (error);
 }
 #endif
 
 /*
  * Mkdir system call
  *
  * Creates a new directory named by a_cnp inside a_dvp with the permission
  * bits from a_vap->va_mode.  The steps, in order, are: allocate an inode,
  * assign ownership (and charge quota when QUOTA is configured), bump the
  * parent's link count, apply MAC labels and inherited ACLs where
  * configured, write the initial "." and ".." block, and finally enter the
  * new name in the parent.
  *
  * On success the new vnode is handed back through *a_vpp and 0 is
  * returned.  On failure an errno value is returned; once the parent's
  * link count has been bumped, failures go through the "bad" label, which
  * reverts that bump, zeroes the new inode's link count and vput()s it.
  */
 static int
 ufs_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	struct vnode *tvp;
 	struct buf *bp;
 	struct dirtemplate dirtemplate, *dtp;
 	struct direct newdir;
 	int error, dmode;
 	long blkoff;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_mkdir: no name");
 #endif
 	dp = VTOI(dvp);
 	/* The new directory's ".." will add one more link to the parent. */
 	if (dp->i_nlink >= UFS_LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	/* Keep only the permission bits and mark the mode as a directory. */
 	dmode = vap->va_mode & 0777;
 	dmode |= IFDIR;
 	/*
 	 * Must simulate part of ufs_makeinode here to acquire the inode,
 	 * but not have it entered in the parent directory. The entry is
 	 * made later after writing "." and ".." entries.
 	 */
 	if (dp->i_effnlink < 2) {
 		print_bad_link_count("ufs_mkdir", dvp);
 		error = EINVAL;
 		goto out;
 	}
 	error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp);
 	if (error)
 		goto out;
 	ip = VTOI(tvp);
 	/* The new directory takes its group from the parent directory. */
 	ip->i_gid = dp->i_gid;
 	DIP_SET(ip, i_gid, dp->i_gid);
 #ifdef SUIDDIR
 	{
 #ifdef QUOTA
 		struct ucred ucred, *ucp;
 		gid_t ucred_group;
 		ucp = cnp->cn_cred;
 #endif
 		/*
 		 * If we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * The new directory also inherits the SUID bit.
 		 * If user's UID and dir UID are the same,
 		 * 'give it away' so that the SUID is still forced on.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (dp->i_mode & ISUID) && dp->i_uid) {
 			dmode |= ISUID;
 			ip->i_uid = dp->i_uid;
 			DIP_SET(ip, i_uid, dp->i_uid);
 #ifdef QUOTA
 			if (dp->i_uid != cnp->cn_cred->cr_uid) {
 				/*
 				 * Make sure the correct user gets charged
 				 * for the space.
 				 * Make a dummy credential for the victim.
 				 * XXX This seems to never be accessed out of
 				 * our context so a stack variable is ok.
 				 */
 				refcount_init(&ucred.cr_ref, 1);
 				ucred.cr_uid = ip->i_uid;
 				ucred.cr_ngroups = 1;
 				ucred.cr_groups = &ucred_group;
 				ucred.cr_groups[0] = dp->i_gid;
 				ucp = &ucred;
 			}
 #endif
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 		}
 #ifdef QUOTA
 		/*
 		 * Charge one inode to the chosen credential; on failure free
 		 * the inode again and return directly, since the parent's
 		 * link count has not been touched yet.
 		 */
 		if ((error = getinoquota(ip)) ||
 	    	    (error = chkiq(ip, 1, ucp, 0))) {
 			if (DOINGSOFTDEP(tvp))
 				softdep_revert_link(dp, ip);
 			UFS_VFREE(tvp, ip->i_number, dmode);
 			vput(tvp);
 			return (error);
 		}
 #endif
 	}
 #else	/* !SUIDDIR */
 	ip->i_uid = cnp->cn_cred->cr_uid;
 	DIP_SET(ip, i_uid, ip->i_uid);
 #ifdef QUOTA
 	/*
 	 * Charge one inode to the caller; on failure free the inode again
 	 * and return directly, since the parent's link count has not been
 	 * touched yet.
 	 */
 	if ((error = getinoquota(ip)) ||
 	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		if (DOINGSOFTDEP(tvp))
 			softdep_revert_link(dp, ip);
 		UFS_VFREE(tvp, ip->i_number, dmode);
 		vput(tvp);
 		return (error);
 	}
 #endif
 #endif	/* !SUIDDIR */
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = dmode;
 	DIP_SET(ip, i_mode, dmode);
 	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
 	/* A new directory starts with two links. */
 	ip->i_effnlink = 2;
 	ip->i_nlink = 2;
 	DIP_SET(ip, i_nlink, 2);
 
 	/* Mark the directory opaque when the lookup flagged a whiteout. */
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;
 		DIP_SET(ip, i_flags, ip->i_flags);
 	}
 
 	/*
 	 * Bump link count in parent directory to reflect work done below.
 	 * Should be done before reference is created so cleanup is
 	 * possible if we crash.
 	 */
 	dp->i_effnlink++;
 	dp->i_nlink++;
 	DIP_SET(dp, i_nlink, dp->i_nlink);
 	dp->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(dvp))
 		softdep_setup_mkdir(dp, ip);
 	error = UFS_UPDATE(dvp, !DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp));
 	if (error)
 		goto bad;
 #ifdef MAC
 	if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) {
 		error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount,
 		    dvp, tvp, cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 #ifdef UFS_ACL
 	/* Inherit ACLs from the parent according to the mount's ACL flavour. */
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	} else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) {
 		error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	}
 #endif /* !UFS_ACL */
 
 	/*
 	 * Initialize directory with "." and ".." from static template.
 	 */
 	if (dvp->v_mount->mnt_maxsymlinklen > 0)
 		dtp = &mastertemplate;
 	else
 		dtp = (struct dirtemplate *)&omastertemplate;
 	/* Work on a private copy so the shared template stays untouched. */
 	dirtemplate = *dtp;
 	dirtemplate.dot_ino = ip->i_number;
 	dirtemplate.dotdot_ino = dp->i_number;
 	vnode_pager_setsize(tvp, DIRBLKSIZ);
 	if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred,
 	    BA_CLRBUF, &bp)) != 0)
 		goto bad;
 	ip->i_size = DIRBLKSIZ;
 	DIP_SET(ip, i_size, DIRBLKSIZ);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate);
 	if (DOINGSOFTDEP(tvp)) {
 		/*
 		 * Ensure that the entire newly allocated block is a
 		 * valid directory so that future growth within the
 		 * block does not have to ensure that the block is
 		 * written before the inode.
 		 */
 		blkoff = DIRBLKSIZ;
 		while (blkoff < bp->b_bcount) {
 			((struct direct *)
 			   (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ;
 			blkoff += DIRBLKSIZ;
 		}
 	}
 	if ((error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) &&
 	    !DOINGASYNC(tvp))) != 0) {
 		/* Still hand the buffer to bwrite(); its result is ignored. */
 		(void)bwrite(bp);
 		goto bad;
 	}
 	/*
 	 * Directory set up, now install its entry in the parent directory.
 	 *
 	 * If we are not doing soft dependencies, then we must write out the
 	 * buffer containing the new directory body before entering the new 
 	 * name in the parent. If we are doing soft dependencies, then the
 	 * buffer containing the new directory body will be passed to and
 	 * released in the soft dependency code after the code has attached
 	 * an appropriate ordering dependency to the buffer which ensures that
 	 * the buffer is written before the new name is written in the parent.
 	 */
 	if (DOINGASYNC(dvp))
 		bdwrite(bp);
 	else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp))))
 		goto bad;
 	ufs_makedirentry(ip, cnp, &newdir);
 	error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0);
 	
 	/*
 	 * Common exit for both outcomes once the parent's link count has
 	 * been bumped: the success path falls through to here with
 	 * error == 0.
 	 */
 bad:
 	if (error == 0) {
 		*ap->a_vpp = tvp;
 	} else {
 		/* Take back the link added to the parent above. */
 		dp->i_effnlink--;
 		dp->i_nlink--;
 		DIP_SET(dp, i_nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
 		/*
 		 * No need to do an explicit VOP_TRUNCATE here, vrele will
 		 * do this for us because we set the link count to 0.
 		 */
 		ip->i_effnlink = 0;
 		ip->i_nlink = 0;
 		DIP_SET(ip, i_nlink, 0);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(tvp))
 			softdep_revert_mkdir(dp, ip);
 
 		vput(tvp);
 	}
 out:
 	return (error);
 }
 
 /*
  * Rmdir system call.
  */
 static int
 ufs_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	int error;
 
 	ip = VTOI(vp);
 	dp = VTOI(dvp);
 
 	/*
 	 * Do not remove a directory that is in the process of being renamed.
 	 * Verify the directory is empty (and valid). Rmdir ".." will not be
 	 * valid since ".." will contain a reference to the current directory
 	 * and thus be non-empty. Do not allow the removal of mounted on
 	 * directories (this can happen when an NFS exported filesystem
 	 * tries to remove a locally mounted on directory).
 	 */
 	error = 0;
 	if (dp->i_effnlink <= 2) {
 		if (dp->i_effnlink == 2)
 			print_bad_link_count("ufs_rmdir", dvp);
 		error = EINVAL;
 		goto out;
 	}
 	if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	if ((dp->i_flags & APPEND)
 	    || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
 		error = EPERM;
 		goto out;
 	}
 	if (vp->v_mountedhere != 0) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
 	/*
 	 * Delete reference to directory before purging
 	 * inode.  If we crash in between, the directory
 	 * will be reattached to lost+found,
 	 */
 	dp->i_effnlink--;
 	ip->i_effnlink--;
 	if (DOINGSOFTDEP(vp))
 		softdep_setup_rmdir(dp, ip);
 	error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
 	if (error) {
 		dp->i_effnlink++;
 		ip->i_effnlink++;
 		if (DOINGSOFTDEP(vp))
 			softdep_revert_rmdir(dp, ip);
 		goto out;
 	}
 	cache_purge(dvp);
 	/*
 	 * The only stuff left in the directory is "." and "..". The "."
 	 * reference is inconsequential since we are quashing it. The soft
 	 * dependency code will arrange to do these operations after
 	 * the parent directory entry has been deleted on disk, so
 	 * when running with that code we avoid doing them now.
 	 */
 	if (!DOINGSOFTDEP(vp)) {
 		dp->i_nlink--;
 		DIP_SET(dp, i_nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
 		error = UFS_UPDATE(dvp, 0);
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 	}
 	cache_purge(vp);
 #ifdef UFS_DIRHASH
 	/* Kill any active hash; i_effnlink == 0, so it will not come back. */
 	if (ip->i_dirhash != NULL)
 		ufsdirhash_free(ip);
 #endif
 out:
 	return (error);
 }
 
 /*
  * symlink -- make a symbolic link
  *
  * Creates a new IFLNK inode through ufs_makeinode() and stores the target
  * string a_target in it.  A target shorter than the mount's
  * mnt_maxsymlinklen is copied to SHORTLINK(ip); anything longer is
  * written out with vn_rdwr().  If storing the target fails the new vnode
  * is vput() and the error returned.
  */
 static int
 ufs_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *vp;
 	struct inode *ip;
 	int error, len;
 
 	error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
 	    vpp, ap->a_cnp, "ufs_symlink");
 	if (error != 0)
 		return (error);
 
 	vp = *vpp;
 	len = strlen(ap->a_target);
 	if (len >= vp->v_mount->mnt_maxsymlinklen) {
 		/* Does not fit the short-link area: write it as file data. */
 		error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
 		    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 		    ap->a_cnp->cn_cred, NOCRED, NULL, NULL);
 	} else {
 		/* Short target: copy the bytes (no terminator) to the inode. */
 		ip = VTOI(vp);
 		bcopy(ap->a_target, SHORTLINK(ip), len);
 		ip->i_size = len;
 		DIP_SET(ip, i_size, len);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		error = UFS_UPDATE(vp, 0);
 	}
 	if (error != 0)
 		vput(vp);
 	return (error);
 }
 
 /*
  * Vnode op for reading directories.
  */
 int
 ufs_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct buf *bp;
 	struct inode *ip;
 	struct direct *dp, *edp;
 	u_long *cookies;
 	struct dirent dstdp;
 	off_t offset, startoffset;
 	size_t readcnt, skipcnt;
 	ssize_t startresid;
 	u_int ncookies;
 	int error;
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	ip = VTOI(vp);
 	if (ip->i_effnlink == 0)
 		return (0);
 	if (ap->a_ncookies != NULL) {
 		if (uio->uio_resid < 0)
 			ncookies = 0;
 		else
 			ncookies = uio->uio_resid;
 		if (uio->uio_offset >= ip->i_size)
 			ncookies = 0;
 		else if (ip->i_size - uio->uio_offset < ncookies)
 			ncookies = ip->i_size - uio->uio_offset;
 		ncookies = ncookies / (offsetof(struct direct, d_name) + 4) + 1;
 		cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK);
 		*ap->a_ncookies = ncookies;
 		*ap->a_cookies = cookies;
 	} else {
 		ncookies = 0;
 		cookies = NULL;
 	}
 	offset = startoffset = uio->uio_offset;
 	startresid = uio->uio_resid;
 	error = 0;
 	while (error == 0 && uio->uio_resid > 0 &&
 	    uio->uio_offset < ip->i_size) {
 		error = ffs_blkatoff(vp, uio->uio_offset, NULL, &bp);
 		if (error)
 			break;
 		if (bp->b_offset + bp->b_bcount > ip->i_size)
 			readcnt = ip->i_size - bp->b_offset;
 		else
 			readcnt = bp->b_bcount;
 		skipcnt = (size_t)(uio->uio_offset - bp->b_offset) &
 		    ~(size_t)(DIRBLKSIZ - 1);
 		offset = bp->b_offset + skipcnt;
 		dp = (struct direct *)&bp->b_data[skipcnt];
 		edp = (struct direct *)&bp->b_data[readcnt];
 		while (error == 0 && uio->uio_resid > 0 && dp < edp) {
 			if (dp->d_reclen <= offsetof(struct direct, d_name) ||
 			    (caddr_t)dp + dp->d_reclen > (caddr_t)edp) {
 				error = EIO;
 				break;
 			}
 #if BYTE_ORDER == LITTLE_ENDIAN
 			/* Old filesystem format. */
 			if (vp->v_mount->mnt_maxsymlinklen <= 0) {
 				dstdp.d_namlen = dp->d_type;
 				dstdp.d_type = dp->d_namlen;
 			} else
 #endif
 			{
 				dstdp.d_namlen = dp->d_namlen;
 				dstdp.d_type = dp->d_type;
 			}
 			if (offsetof(struct direct, d_name) + dstdp.d_namlen >
 			    dp->d_reclen) {
 				error = EIO;
 				break;
 			}
 			if (offset < startoffset || dp->d_ino == 0)
 				goto nextentry;
 			dstdp.d_fileno = dp->d_ino;
 			dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
 			bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
 			dstdp.d_name[dstdp.d_namlen] = '\0';
+			/* NOTE: d_off is the offset of the *next* entry. */
+			dstdp.d_off = offset + dp->d_reclen;
 			if (dstdp.d_reclen > uio->uio_resid) {
 				if (uio->uio_resid == startresid)
 					error = EINVAL;
 				else
 					error = EJUSTRETURN;
 				break;
 			}
 			/* Advance dp. */
 			error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio);
 			if (error)
 				break;
 			if (cookies != NULL) {
 				KASSERT(ncookies > 0,
 				    ("ufs_readdir: cookies buffer too small"));
 				*cookies = offset + dp->d_reclen;
 				cookies++;
 				ncookies--;
 			}
 nextentry:
 			offset += dp->d_reclen;
 			dp = (struct direct *)((caddr_t)dp + dp->d_reclen);
 		}
 		bqrelse(bp);
 		uio->uio_offset = offset;
 	}
 	/* We need to correct uio_offset. */
 	uio->uio_offset = offset;
 	if (error == EJUSTRETURN)
 		error = 0;
 	if (ap->a_ncookies != NULL) {
 		if (error == 0) {
 			ap->a_ncookies -= ncookies;
 		} else {
 			free(*ap->a_cookies, M_TEMP);
 			*ap->a_ncookies = 0;
 			*ap->a_cookies = NULL;
 		}
 	}
 	if (error == 0 && ap->a_eofflag)
 		*ap->a_eofflag = ip->i_size <= uio->uio_offset;
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link.
  *
  * Links whose size is below the mount's mnt_maxsymlinklen, or that own no
  * blocks, are copied straight from SHORTLINK(); all others are read with
  * VOP_READ().
  */
 static int
 ufs_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *lvp;
 	struct inode *lip;
 	doff_t linklen;
 
 	lvp = ap->a_vp;
 	lip = VTOI(lvp);
 	linklen = lip->i_size;
 
 	/* Long link that owns blocks: go through the regular read path. */
 	if (linklen >= lvp->v_mount->mnt_maxsymlinklen &&
 	    DIP(lip, i_blocks) != 0)
 		return (VOP_READ(lvp, ap->a_uio, 0, ap->a_cred));
 
 	/* XXX - the i_blocks test is for old fastlink support */
 	return (uiomove(SHORTLINK(lip), linklen, ap->a_uio));
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  *
  * A buffer whose physical block resolves to -1 is completed with
  * bufdone() without device I/O (after vfs_bio_clrbuf() when the mapping
  * was computed here).  Mapping errors are recorded in the buffer; the
  * function itself always returns 0.
  *
  * In order to be able to swap to a file, the ufs_bmaparray() operation may not
  * deadlock on memory.  See ufs_bmap() for details.
  */
 static int
 ufs_strategy(ap)
 	struct vop_strategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct buf *iobuf;
 	ufs2_daddr_t physblk;
 	int rv;
 
 	vp = ap->a_vp;
 	iobuf = ap->a_bp;
 
 	/* b_blkno still equal to b_lblkno means "not translated yet". */
 	if (iobuf->b_blkno == iobuf->b_lblkno) {
 		rv = ufs_bmaparray(vp, iobuf->b_lblkno, &physblk, iobuf,
 		    NULL, NULL);
 		iobuf->b_blkno = physblk;
 		if (rv != 0) {
 			iobuf->b_ioflags |= BIO_ERROR;
 			iobuf->b_error = rv;
 			bufdone(iobuf);
 			return (0);
 		}
 		if ((long)iobuf->b_blkno == -1)
 			vfs_bio_clrbuf(iobuf);
 	}
 
 	if ((long)iobuf->b_blkno != -1) {
 		iobuf->b_iooffset = dbtob(iobuf->b_blkno);
 		BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, iobuf);
 	} else
 		bufdone(iobuf);
 	return (0);
 }
 
 /*
  * Print out the contents of an inode.
  */
 static int
 ufs_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 
 	printf("\tino %lu, on dev %s", (u_long)ip->i_number,
 	    devtoname(ITODEV(ip)));
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * Close wrapper for fifos.
  *
  * With the vnode interlock held, update the inode times if the vnode has
  * more than one use reference, then hand off to the fifo close routine
  * and return its result.
  */
 static int
 ufsfifo_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *fvp;
 	int refs;
 
 	fvp = ap->a_vp;
 
 	VI_LOCK(fvp);
 	refs = fvp->v_usecount;
 	if (refs > 1)
 		ufs_itimes_locked(fvp);
 	VI_UNLOCK(fvp);
 
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Kqfilter wrapper for fifos.
  *
  * Try the fifo kqfilter first; if it reports an error, fall back to
  * vfs_kqfilter() and return that result instead.
  */
 static int
 ufsfifo_kqfilter(ap)
 	struct vop_kqfilter_args *ap;
 {
 	int rv;
 
 	rv = fifo_specops.vop_kqfilter(ap);
 	if (rv == 0)
 		return (0);
 	return (vfs_kqfilter(ap));
 }
 
 /*
  * Return POSIX pathconf information applicable to ufs filesystems.
  */
 static int
 ufs_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		int *a_retval;
 	} */ *ap;
 {
 	int error;
 
 	error = 0;
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = UFS_LINK_MAX;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = UFS_MAXNAMLEN;
 		break;
 	case _PC_PIPE_BUF:
 		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
 			*ap->a_retval = PIPE_BUF;
 		else
 			error = EINVAL;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		break;
 	case _PC_ACL_EXTENDED:
 #ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 #else
 		*ap->a_retval = 0;
 #endif
 		break;
 
 	case _PC_ACL_NFS4:
 #ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 #else
 		*ap->a_retval = 0;
 #endif
 		break;
 
 	case _PC_ACL_PATH_MAX:
 #ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS))
 			*ap->a_retval = ACL_MAX_ENTRIES;
 		else
 			*ap->a_retval = 3;
 #else
 		*ap->a_retval = 3;
 #endif
 		break;
 	case _PC_MAC_PRESENT:
 #ifdef MAC
 		if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 #else
 		*ap->a_retval = 0;
 #endif
 		break;
 	case _PC_MIN_HOLE_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_SYNC_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1; /* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_XFER_ALIGN:
 		*ap->a_retval = PAGE_SIZE;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		break;
 
 	default:
 		error = vop_stdpathconf(ap);
 		break;
 	}
 	return (error);
 }
 
 /*
  * Initialize the vnode associated with a new inode.
  *
  * Derives the vnode type from the inode mode, switches fifo vnodes over
  * to the supplied fifoops vector, and marks the vnode for the root inode
  * with VV_ROOT.  The vnode is handed back through vpp; always returns 0.
  * mntp is not used here.
  */
 int
 ufs_vinit(mntp, fifoops, vpp)
 	struct mount *mntp;
 	struct vop_vector *fifoops;
 	struct vnode **vpp;
 {
 	struct vnode *nvp = *vpp;
 	struct inode *node = VTOI(nvp);
 
 	nvp->v_type = IFTOVT(node->i_mode);
 	if (nvp->v_type == VFIFO)
 		nvp->v_op = fifoops;
 
 	ASSERT_VOP_LOCKED(nvp, "ufs_vinit");
 
 	if (node->i_number == UFS_ROOTINO)
 		nvp->v_vflag |= VV_ROOT;
 
 	*vpp = nvp;
 	return (0);
 }
 
 /*
  * Allocate a new inode and enter it in directory dvp under the name
  * described by cnp.
  * Vnode dvp must be locked.
  *
  *	mode		file type and permission bits; IFREG is assumed when
  *			no type bits are set.
  *	dvp		parent directory vnode.
  *	vpp		receives the new vnode on success; it is set to NULL
  *			before anything can fail.
  *	cnp		component name; also supplies the credential used
  *			for allocation, quota and privilege checks.
  *	callfunc	caller's name, used only in diagnostics.
  *
  * Returns 0 on success, EINVAL when the parent's effective link count is
  * below 2, or the error from the allocation, quota, inode update,
  * MAC/ACL or directory-entry step that failed.
  */
 static int
 ufs_makeinode(mode, dvp, vpp, cnp, callfunc)
 	int mode;
 	struct vnode *dvp;
 	struct vnode **vpp;
 	struct componentname *cnp;
 	const char *callfunc;
 {
 	struct inode *ip, *pdir;
 	struct direct newdir;
 	struct vnode *tvp;
 	int error;
 
 	pdir = VTOI(dvp);
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", callfunc);
 #endif
 	*vpp = NULL;
 	/* Default to a regular file when the caller gave no file type. */
 	if ((mode & IFMT) == 0)
 		mode |= IFREG;
 
 	/* Refuse to create in a directory whose link count looks wrong. */
 	if (pdir->i_effnlink < 2) {
 		print_bad_link_count(callfunc, dvp);
 		return (EINVAL);
 	}
 	error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
 	if (error)
 		return (error);
 	ip = VTOI(tvp);
 	/* The new inode takes its group from the parent directory. */
 	ip->i_gid = pdir->i_gid;
 	DIP_SET(ip, i_gid, pdir->i_gid);
 #ifdef SUIDDIR
 	{
 #ifdef QUOTA
 		struct ucred ucred, *ucp;
 		gid_t ucred_group;
 		ucp = cnp->cn_cred;
 #endif
 		/*
 		 * If we are not the owner of the directory,
 		 * and we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * Note that this drops off the execute bits for security.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (pdir->i_mode & ISUID) &&
 		    (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) {
 			ip->i_uid = pdir->i_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 			/* 07111: set-id/sticky bits plus every execute bit. */
 			mode &= ~07111;
 #ifdef QUOTA
 			/*
 			 * Make sure the correct user gets charged
 			 * for the space.
 			 * Quickly knock up a dummy credential for the victim.
 			 * XXX This seems to never be accessed out of our
 			 * context so a stack variable is ok.
 			 */
 			refcount_init(&ucred.cr_ref, 1);
 			ucred.cr_uid = ip->i_uid;
 			ucred.cr_ngroups = 1;
 			ucred.cr_groups = &ucred_group;
 			ucred.cr_groups[0] = pdir->i_gid;
 			ucp = &ucred;
 #endif
 		} else {
 			/* Normal case: the creator owns the new inode. */
 			ip->i_uid = cnp->cn_cred->cr_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 		}
 
 #ifdef QUOTA
 		/*
 		 * Attach quota structures and charge one inode to ucp; on
 		 * failure undo the soft-updates link, free the inode and
 		 * release the vnode.
 		 */
 		if ((error = getinoquota(ip)) ||
 	    	    (error = chkiq(ip, 1, ucp, 0))) {
 			if (DOINGSOFTDEP(tvp))
 				softdep_revert_link(pdir, ip);
 			UFS_VFREE(tvp, ip->i_number, mode);
 			vput(tvp);
 			return (error);
 		}
 #endif
 	}
 #else	/* !SUIDDIR */
 	ip->i_uid = cnp->cn_cred->cr_uid;
 	DIP_SET(ip, i_uid, ip->i_uid);
 #ifdef QUOTA
 	/*
 	 * Attach quota structures and charge one inode to the creator; on
 	 * failure undo the soft-updates link, free the inode and release
 	 * the vnode.
 	 */
 	if ((error = getinoquota(ip)) ||
 	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		if (DOINGSOFTDEP(tvp))
 			softdep_revert_link(pdir, ip);
 		UFS_VFREE(tvp, ip->i_number, mode);
 		vput(tvp);
 		return (error);
 	}
 #endif
 #endif	/* !SUIDDIR */
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = mode;
 	DIP_SET(ip, i_mode, mode);
 	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
 	/* A freshly created inode starts with exactly one link. */
 	ip->i_effnlink = 1;
 	ip->i_nlink = 1;
 	DIP_SET(ip, i_nlink, 1);
 	if (DOINGSOFTDEP(tvp))
 		softdep_setup_create(VTOI(dvp), ip);
 	/*
 	 * Drop the set-group-ID bit when the creator is not a member of the
 	 * inode's group and priv_check_cred() returns non-zero for
 	 * PRIV_VFS_SETGID.
 	 */
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
 	    priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) {
 		ip->i_mode &= ~ISGID;
 		DIP_SET(ip, i_mode, ip->i_mode);
 	}
 
 	/* A whiteout name component marks the new inode UF_OPAQUE. */
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;
 		DIP_SET(ip, i_flags, ip->i_flags);
 	}
 
 	/*
 	 * Make sure inode goes to disk before directory entry.
 	 * The second argument is non-zero only when neither soft updates
 	 * nor async mode is in effect (presumably requesting a synchronous
 	 * write -- confirm against UFS_UPDATE).
 	 */
 	error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp));
 	if (error)
 		goto bad;
 #ifdef MAC
 	if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) {
 		error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount,
 		    dvp, tvp, cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 #ifdef UFS_ACL
 	/* Inherit ACLs from the parent according to the mount's ACL type. */
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, mode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	} else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) {
 		error = ufs_do_nfs4_acl_inheritance(dvp, tvp, mode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	}
 #endif /* UFS_ACL */
 	/* Build the directory entry and add it to the parent. */
 	ufs_makedirentry(ip, cnp, &newdir);
 	error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0);
 	if (error)
 		goto bad;
 	*vpp = tvp;
 	return (0);
 
 bad:
 	/*
 	 * Write error occurred trying to update the inode
 	 * or the directory so must deallocate the inode.
 	 * The link counts are zeroed here and the vnode is released with
 	 * vput(); *vpp stays NULL.
 	 */
 	ip->i_effnlink = 0;
 	ip->i_nlink = 0;
 	DIP_SET(ip, i_nlink, 0);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(tvp))
 		softdep_revert_create(VTOI(dvp), ip);
 	vput(tvp);
 	return (error);
 }
 
 /*
  * Ioctl vnode op: FIOSEEKDATA and FIOSEEKHOLE are passed to
  * vn_bmap_seekhole(); every other command gets ENOTTY.
  */
 static int
 ufs_ioctl(struct vop_ioctl_args *ap)
 {
 
 	if (ap->a_command != FIOSEEKDATA && ap->a_command != FIOSEEKHOLE)
 		return (ENOTTY);
 	return (vn_bmap_seekhole(ap->a_vp, ap->a_command,
 	    (off_t *)ap->a_data, ap->a_cred));
 }
 
 /* Global vfs data structures for ufs. */
 struct vop_vector ufs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_fsync =		VOP_PANIC,
 	.vop_read =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 	.vop_accessx =		ufs_accessx,
 	.vop_bmap =		ufs_bmap,
 	.vop_cachedlookup =	ufs_lookup,
 	.vop_close =		ufs_close,
 	.vop_create =		ufs_create,
 	.vop_getattr =		ufs_getattr,
 	.vop_inactive =		ufs_inactive,
 	.vop_ioctl =		ufs_ioctl,
 	.vop_link =		ufs_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_markatime =	ufs_markatime,
 	.vop_mkdir =		ufs_mkdir,
 	.vop_mknod =		ufs_mknod,
 	.vop_open =		ufs_open,
 	.vop_pathconf =		ufs_pathconf,
 	.vop_poll =		vop_stdpoll,
 	.vop_print =		ufs_print,
 	.vop_readdir =		ufs_readdir,
 	.vop_readlink =		ufs_readlink,
 	.vop_reclaim =		ufs_reclaim,
 	.vop_remove =		ufs_remove,
 	.vop_rename =		ufs_rename,
 	.vop_rmdir =		ufs_rmdir,
 	.vop_setattr =		ufs_setattr,
 #ifdef MAC
 	.vop_setlabel =		vop_stdsetlabel_ea,
 #endif
 	.vop_strategy =		ufs_strategy,
 	.vop_symlink =		ufs_symlink,
 	.vop_whiteout =		ufs_whiteout,
 #ifdef UFS_EXTATTR
 	.vop_getextattr =	ufs_getextattr,
 	.vop_deleteextattr =	ufs_deleteextattr,
 	.vop_setextattr =	ufs_setextattr,
 #endif
 #ifdef UFS_ACL
 	.vop_getacl =		ufs_getacl,
 	.vop_setacl =		ufs_setacl,
 	.vop_aclcheck =		ufs_aclcheck,
 #endif
 };
 
 /*
  * Vnode operations vector for fifos on ufs; operations not listed are
  * taken from fifo_specops via vop_default.
  */
 struct vop_vector ufs_fifoops = {
 	.vop_default =		&fifo_specops,
 
 	/* Entries wired to VOP_PANIC in this vector. */
 	.vop_fsync =		VOP_PANIC,
 	.vop_read =		VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 
 	/* Fifo-specific wrappers defined in this file. */
 	.vop_close =		ufsfifo_close,
 	.vop_kqfilter =		ufsfifo_kqfilter,
 
 	/* Access checks and attributes. */
 	.vop_accessx =		ufs_accessx,
 	.vop_getattr =		ufs_getattr,
 	.vop_setattr =		ufs_setattr,
 	.vop_markatime =	ufs_markatime,
 	.vop_pathconf = 	ufs_pathconf,
 
 	/* Vnode life cycle and diagnostics. */
 	.vop_inactive =		ufs_inactive,
 	.vop_reclaim =		ufs_reclaim,
 	.vop_print =		ufs_print,
 
 #ifdef MAC
 	.vop_setlabel =		vop_stdsetlabel_ea,
 #endif
 #ifdef UFS_EXTATTR
 	.vop_getextattr =	ufs_getextattr,
 	.vop_deleteextattr =	ufs_deleteextattr,
 	.vop_setextattr =	ufs_setextattr,
 #endif
 #ifdef UFS_ACL
 	.vop_getacl =		ufs_getacl,
 	.vop_setacl =		ufs_setacl,
 	.vop_aclcheck =		ufs_aclcheck,
 #endif
 };
Index: stable/12
===================================================================
--- stable/12	(revision 340720)
+++ stable/12	(revision 340721)

Property changes on: stable/12
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r340431