Index: head/sys/cddl/compat/opensolaris/sys/vnode.h =================================================================== --- head/sys/cddl/compat/opensolaris/sys/vnode.h +++ head/sys/cddl/compat/opensolaris/sys/vnode.h @@ -87,8 +87,6 @@ #define VN_RELE(v) vrele(v) #define VN_URELE(v) vput(v) -#define VOP_REALVP(vp, vpp, ct) (*(vpp) = (vp), 0) - #define vnevent_create(vp, ct) do { } while (0) #define vnevent_link(vp, ct) do { } while (0) #define vnevent_remove(vp, dvp, name, ct) do { } while (0) Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h @@ -48,18 +48,18 @@ #define IS_ROOT_NODE 0x01 /* create a root node */ #define IS_XATTR 0x02 /* create an extended attribute node */ -extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, - int, int *, pathname_t *); -extern void zfs_dirent_unlock(zfs_dirlock_t *); -extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); -extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, +extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int); +extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int, boolean_t *); -extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, - pathname_t *); +#if 0 +extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int); +#else +extern int zfs_dirlook(znode_t *, const char *name, znode_t **); +#endif extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, uint_t, znode_t **, zfs_acl_ids_t *); extern void zfs_rmnode(znode_t *); -extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -75,6 +75,7 @@ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ + boolean_t z_use_namecache;/* make use of FreeBSD name cache */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ kmutex_t z_lock; Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -181,10 +181,12 @@ struct zfsvfs *z_zfsvfs; vnode_t *z_vnode; uint64_t z_id; /* object ID for this znode */ +#ifdef illumos kmutex_t z_lock; /* znode modification lock */ krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ +#endif kmutex_t z_range_lock; /* protects changes to z_range_avl */ avl_tree_t z_range_avl; /* avl tree of file range locks */ uint8_t z_unlinked; /* file has been unlinked */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -1055,8 +1055,7 @@ * create a new acl and leave any cached acl in place. */ static int -zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, - boolean_t will_modify) +zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) { zfs_acl_t *aclp; int aclsize; @@ -1065,26 +1064,15 @@ zfs_acl_phys_t znode_acl; int version; int error; - boolean_t drop_lock = B_FALSE; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); if (zp->z_acl_cached && !will_modify) { *aclpp = zp->z_acl_cached; return (0); } - /* - * close race where znode could be upgrade while trying to - * read the znode attributes. - * - * But this could only happen if the file isn't already an SA - * znode - */ - if (!zp->z_is_sa && !have_lock) { - mutex_enter(&zp->z_lock); - drop_lock = B_TRUE; - } version = zfs_znode_acl_version(zp); if ((error = zfs_acl_znode_info(zp, &aclsize, @@ -1130,8 +1118,6 @@ if (!will_modify) zp->z_acl_cached = aclp; done: - if (drop_lock) - mutex_exit(&zp->z_lock); return (error); } @@ -1158,10 +1144,10 @@ int error; zfs_acl_t *aclp; - ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) + if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0) zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, &zp->z_pflags, zp->z_uid, zp->z_gid); return (error); @@ -1453,18 +1439,17 @@ int error = 0; mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); else - error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + error = zfs_acl_node_read(zp, aclp, B_TRUE); if (error == 0) { (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); } - mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); return (error); @@ -1617,6 +1602,7 @@ boolean_t trim = B_FALSE; boolean_t inherited = B_FALSE; + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); bzero(acl_ids, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -1700,12 +1686,10 @@ if (acl_ids->z_aclp == NULL) { mutex_enter(&dzp->z_acl_lock); - mutex_enter(&dzp->z_lock); if (!(flag & IS_ROOT_NODE) && (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { - VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, - &paclp, B_FALSE)); + VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_type, paclp, acl_ids->z_mode); inherited = B_TRUE; @@ -1714,7 +1698,6 @@ zfs_acl_alloc(zfs_acl_version_zp(dzp)); acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } - mutex_exit(&dzp->z_lock); mutex_exit(&dzp->z_acl_lock); if (vap->va_type == VDIR) @@ -1783,7 +1766,8 @@ mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); @@ -1931,6 +1915,7 @@ boolean_t fuid_dirtied; uint64_t acl_obj; + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (mask == 0) return (SET_ERROR(ENOSYS)); @@ -1955,7 +1940,6 @@ } top: mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); tx = dmu_tx_create(zfsvfs->z_os); @@ -1987,7 +1971,6 @@ zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); if (error == ERESTART) { @@ -2013,7 +1996,6 @@ if (fuidp) zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); - mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); return (error); @@ -2117,7 +2099,8 @@ mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c @@ -58,96 +58,64 @@ #include /* - * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups + * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups * of names after deciding which is the appropriate lookup interface. */ static int -zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, - boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, + boolean_t exact, uint64_t *zoid) { int error; if (zfsvfs->z_norm) { - matchtype_t mt = MT_FIRST; - boolean_t conflict = B_FALSE; - size_t bufsz = 0; - char *buf = NULL; - - if (rpnp) { - buf = rpnp->pn_buf; - bufsz = rpnp->pn_bufsize; - } - if (exact) - mt = MT_EXACT; + matchtype_t mt = exact? MT_EXACT : MT_FIRST; + /* * In the non-mixed case we only expect there would ever * be one match, but we need to use the normalizing lookup. */ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, - zoid, mt, buf, bufsz, &conflict); - if (!error && deflags) - *deflags = conflict ? ED_CASE_CONFLICT : 0; + zoid, mt, NULL, 0, NULL); } else { error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); } *zoid = ZFS_DIRENT_OBJ(*zoid); - if (error == ENOENT && update) - dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); - return (error); } /* - * Lock a directory entry. A dirlock on protects that name - * in dzp's directory zap object. As long as you hold a dirlock, you can - * assume two things: (1) dzp cannot be reaped, and (2) no other thread - * can change the zap entry for (i.e. link or unlink) this name. + * Look up a directory entry under a locked vnode. + * dvp being locked gives us a guarantee that there are no concurrent + * modification of the directory and, thus, if a node can be found in + * the directory, then it must not be unlinked. * * Input arguments: * dzp - znode for directory * name - name of entry to lock * flag - ZNEW: if the entry already exists, fail with EEXIST. * ZEXISTS: if the entry does not exist, fail with ENOENT. - * ZSHARED: allow concurrent access with other ZSHARED callers. * ZXATTR: we want dzp's xattr directory - * ZCILOOK: On a mixed sensitivity file system, - * this lookup should be case-insensitive. - * ZCIEXACT: On a purely case-insensitive file system, - * this lookup should be case-sensitive. - * ZRENAMING: we are locking for renaming, force narrow locks - * ZHAVELOCK: Don't grab the z_name_lock for this call. The - * current thread already holds it. * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) - * dlpp - pointer to the dirlock for this entry (NULL on error) - * direntflags - (case-insensitive lookup only) - * flags if multiple case-sensitive matches exist in directory - * realpnp - (case-insensitive lookup only) - * actual name matched within the directory * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. - * NOTE: For case-insensitive file systems we take wide locks (see below), - * but return znode pointers to a single match. */ int -zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - int flag, int *direntflags, pathname_t *realpnp) +zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_dirlock_t *dl; - boolean_t update; boolean_t exact; uint64_t zoid; vnode_t *vp = NULL; int error = 0; - int cmpflags; + + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); *zpp = NULL; - *dlpp = NULL; /* * Verify that we are not trying to lock '.', '..', or '.zfs' @@ -161,280 +129,108 @@ * Case sensitivity and normalization preferences are set when * the file system is created. These are stored in the * zfsvfs->z_case and zfsvfs->z_norm fields. These choices - * affect what vnodes can be cached in the DNLC, how we - * perform zap lookups, and the "width" of our dirlocks. + * affect how we perform zap lookups. * - * A normal dirlock locks a single name. Note that with - * normalization a name can be composed multiple ways, but - * when normalized, these names all compare equal. A wide - * dirlock locks multiple names. We need these when the file - * system is supporting mixed-mode access. It is sometimes - * necessary to lock all case permutations of file name at - * once so that simultaneous case-insensitive/case-sensitive - * behaves as rationally as possible. - */ - - /* * Decide if exact matches should be requested when performing * a zap lookup on file systems supporting case-insensitive * access. - */ - exact = - ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) || - ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK)); - - /* - * Only look in or update the DNLC if we are looking for the - * name on a file system that does not require normalization - * or case folding. We can also look there if we happen to be - * on a non-normalizing, mixed sensitivity file system IF we - * are looking for the exact name. * - * Maybe can add TO-UPPERed version of name to dnlc in ci-only - * case for performance improvement? + * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE + * because in that case MT_EXACT and MT_FIRST should produce exactly + * the same result. */ - update = !zfsvfs->z_norm || - ((zfsvfs->z_case == ZFS_CASE_MIXED) && - !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + exact = zfsvfs->z_case == ZFS_CASE_MIXED; - /* - * ZRENAMING indicates we are in a situation where we should - * take narrow locks regardless of the file system's - * preferences for normalizing and case folding. This will - * prevent us deadlocking trying to grab the same wide lock - * twice if the two names happen to be case-insensitive - * matches. - */ - if (flag & ZRENAMING) - cmpflags = 0; - else - cmpflags = zfsvfs->z_norm; - - /* - * Wait until there are no locks on this name. - * - * Don't grab the the lock if it is already held. However, cannot - * have both ZSHARED and ZHAVELOCK together. - */ - ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); - if (!(flag & ZHAVELOCK)) - rw_enter(&dzp->z_name_lock, RW_READER); - - mutex_enter(&dzp->z_lock); - for (;;) { - if (dzp->z_unlinked && !(flag & ZXATTR)) { - mutex_exit(&dzp->z_lock); - if (!(flag & ZHAVELOCK)) - rw_exit(&dzp->z_name_lock); - return (SET_ERROR(ENOENT)); - } - for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { - if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, - U8_UNICODE_LATEST, &error) == 0) || error != 0) - break; - } - if (error != 0) { - mutex_exit(&dzp->z_lock); - if (!(flag & ZHAVELOCK)) - rw_exit(&dzp->z_name_lock); - return (SET_ERROR(ENOENT)); - } - if (dl == NULL) { - size_t namesize; - - /* - * Allocate a new dirlock and add it to the list. - */ - namesize = strlen(name) + 1; - dl = kmem_alloc(sizeof (zfs_dirlock_t) + namesize, - KM_SLEEP); - cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); - dl->dl_name = (char *)(dl + 1); - bcopy(name, dl->dl_name, namesize); - dl->dl_sharecnt = 0; - dl->dl_namelock = 0; - dl->dl_namesize = namesize; - dl->dl_dzp = dzp; - dl->dl_next = dzp->z_dirlocks; - dzp->z_dirlocks = dl; - break; - } - if ((flag & ZSHARED) && dl->dl_sharecnt != 0) - break; - cv_wait(&dl->dl_cv, &dzp->z_lock); - } - - /* - * If the z_name_lock was NOT held for this dirlock record it. - */ - if (flag & ZHAVELOCK) - dl->dl_namelock = 1; - - if (flag & ZSHARED) - dl->dl_sharecnt++; - - mutex_exit(&dzp->z_lock); - - /* - * We have a dirlock on the name. (Note that it is the dirlock, - * not the dzp's z_lock, that protects the name in the zap object.) - * See if there's an object by this name; if so, put a hold on it. - */ + if (dzp->z_unlinked && !(flag & ZXATTR)) + return (ENOENT); if (flag & ZXATTR) { error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, sizeof (zoid)); if (error == 0) error = (zoid == 0 ? ENOENT : 0); } else { - if (update) - vp = dnlc_lookup(ZTOV(dzp), name); - if (vp == DNLC_NO_VNODE) { - VN_RELE(vp); - error = SET_ERROR(ENOENT); - } else if (vp) { - if (flag & ZNEW) { - zfs_dirent_unlock(dl); - VN_RELE(vp); - return (SET_ERROR(EEXIST)); - } - *dlpp = dl; - *zpp = VTOZ(vp); - return (0); - } else { - error = zfs_match_find(zfsvfs, dzp, name, exact, - update, direntflags, realpnp, &zoid); - } + error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid); } if (error) { if (error != ENOENT || (flag & ZEXISTS)) { - zfs_dirent_unlock(dl); return (error); } } else { if (flag & ZNEW) { - zfs_dirent_unlock(dl); return (SET_ERROR(EEXIST)); } error = zfs_zget(zfsvfs, zoid, zpp); - if (error) { - zfs_dirent_unlock(dl); + if (error) return (error); - } - if (!(flag & ZXATTR) && update) - dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); + ASSERT(!(*zpp)->z_unlinked); } - *dlpp = dl; - return (0); } -/* - * Unlock this directory entry and wake anyone who was waiting for it. - */ -void -zfs_dirent_unlock(zfs_dirlock_t *dl) +static int +zfs_dd_lookup(znode_t *dzp, znode_t **zpp) { - znode_t *dzp = dl->dl_dzp; - zfs_dirlock_t **prev_dl, *cur_dl; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + znode_t *zp; + vnode_t *vp; + uint64_t parent; + int error; - mutex_enter(&dzp->z_lock); + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); - if (!dl->dl_namelock) - rw_exit(&dzp->z_name_lock); + if (dzp->z_unlinked) + return (ENOENT); - if (dl->dl_sharecnt > 1) { - dl->dl_sharecnt--; - mutex_exit(&dzp->z_lock); - return; - } - prev_dl = &dzp->z_dirlocks; - while ((cur_dl = *prev_dl) != dl) - prev_dl = &cur_dl->dl_next; - *prev_dl = dl->dl_next; - cv_broadcast(&dl->dl_cv); - mutex_exit(&dzp->z_lock); + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); - cv_destroy(&dl->dl_cv); - kmem_free(dl, sizeof (*dl) + dl->dl_namesize); + /* + * If we are a snapshot mounted under .zfs, return + * the snapshot directory. + */ + if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { + error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, + "snapshot", &vp, NULL, 0, NULL, kcred, + NULL, NULL, NULL); + if (error == 0) + zp = VTOZ(vp); + } else { + error = zfs_zget(zfsvfs, parent, &zp); + } + if (error == 0) + *zpp = zp; + return (error); } -/* - * Look up an entry in a directory. - * - * NOTE: '.' and '..' are handled as special cases because - * no directory entries are actually stored for them. If this is - * the root of a filesystem, then '.zfs' is also treated as a - * special pseudo-directory. - */ int -zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags, - int *deflg, pathname_t *rpnp) +zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) { - zfs_dirlock_t *dl; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; int error = 0; - uint64_t parent; - int unlinked; - - if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - mutex_enter(&dzp->z_lock); - unlinked = dzp->z_unlinked; - mutex_exit(&dzp->z_lock); - if (unlinked) - return (ENOENT); - - *vpp = ZTOV(dzp); - VN_HOLD(*vpp); - } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - /* - * If we are a snapshot mounted under .zfs, return - * the vp for the snapshot directory. - */ - if ((error = sa_lookup(dzp->z_sa_hdl, - SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) - return (error); - if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { - error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, - "snapshot", vpp, NULL, 0, NULL, kcred, - NULL, NULL, NULL); - return (error); - } + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); - mutex_enter(&dzp->z_lock); - unlinked = dzp->z_unlinked; - mutex_exit(&dzp->z_lock); - if (unlinked) - return (ENOENT); + if (dzp->z_unlinked) + return (SET_ERROR(ENOENT)); - rw_enter(&dzp->z_parent_lock, RW_READER); - error = zfs_zget(zfsvfs, parent, &zp); - if (error == 0) - *vpp = ZTOV(zp); - rw_exit(&dzp->z_parent_lock); + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *zpp = dzp; + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + error = zfs_dd_lookup(dzp, zpp); } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { - *vpp = zfsctl_root(dzp); + *zpp = VTOZ(zfsctl_root(dzp)); } else { - int zf; - - zf = ZEXISTS | ZSHARED; - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); + error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); if (error == 0) { - *vpp = ZTOV(zp); - zfs_dirent_unlock(dl); dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + *zpp = zp; } - rpnp = NULL; } - - if ((flags & FIGNORECASE) && rpnp && !error) - (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); - return (error); } @@ -510,8 +306,9 @@ if (error != 0) continue; + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); zp->z_unlinked = B_TRUE; - VN_RELE(ZTOV(zp)); + vput(ZTOV(zp)); } zap_cursor_fini(&zc); } @@ -535,7 +332,6 @@ znode_t *xzp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_dirlock_t dl; int skipped = 0; int error; @@ -549,6 +345,7 @@ continue; } + vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); ASSERT((ZTOV(xzp)->v_type == VREG) || (ZTOV(xzp)->v_type == VLNK)); @@ -563,20 +360,17 @@ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - VN_RELE(ZTOV(xzp)); + vput(ZTOV(xzp)); skipped += 1; continue; } - bzero(&dl, sizeof (dl)); - dl.dl_dzp = dzp; - dl.dl_name = zap.za_name; - error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); if (error) skipped += 1; dmu_tx_commit(tx); - VN_RELE(ZTOV(xzp)); + vput(ZTOV(xzp)); } zap_cursor_fini(&zc); if (error != ENOENT) @@ -596,6 +390,7 @@ int error; ASSERT(zp->z_links == 0); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); /* * If this is an attribute directory, purge its contents. @@ -640,7 +435,8 @@ &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT(error == 0); + ASSERT3S(error, ==, 0); + vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); } acl_obj = zfs_external_acl(zp); @@ -674,12 +470,10 @@ if (xzp) { ASSERT(error == 0); - mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ xzp->z_links = 0; /* no more links to it */ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &xzp->z_links, sizeof (xzp->z_links), tx)); - mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); } @@ -692,7 +486,7 @@ dmu_tx_commit(tx); out: if (xzp) - VN_RELE(ZTOV(xzp)); + vput(ZTOV(xzp)); } static uint64_t @@ -706,12 +500,12 @@ } /* - * Link zp into dl. Can only fail if zp has been unlinked. + * Link zp into dzp. Can only fail if zp has been unlinked. */ int -zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) { - znode_t *dzp = dl->dl_dzp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); uint64_t value; @@ -721,18 +515,32 @@ int count = 0; int error; - mutex_enter(&zp->z_lock); - + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); +#if 0 + if (zp_is_dir) { + error = 0; + if (dzp->z_links >= LINK_MAX) + error = SET_ERROR(EMLINK); + return (error); + } +#endif if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); - mutex_exit(&zp->z_lock); return (SET_ERROR(ENOENT)); } +#if 0 + if (zp->z_links >= LINK_MAX) { + return (SET_ERROR(EMLINK)); + } +#endif zp->z_links++; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); + } else { + ASSERT(zp->z_unlinked == 0); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &dzp->z_id, sizeof (dzp->z_id)); @@ -746,11 +554,8 @@ ctime, B_TRUE); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); + ASSERT0(error); - mutex_exit(&zp->z_lock); - - mutex_enter(&dzp->z_lock); dzp->z_size++; dzp->z_links += zp_is_dir; count = 0; @@ -766,55 +571,48 @@ &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - mutex_exit(&dzp->z_lock); + ASSERT0(error); value = zfs_dirent(zp, zp->z_mode); - error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, 8, 1, &value, tx); - ASSERT(error == 0); - - dnlc_update(ZTOV(dzp), dl->dl_name, vp); + VERIFY0(error); return (0); } static int -zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, +zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { int error; if (zp->z_zfsvfs->z_norm) { - if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && - (flag & ZCIEXACT)) || - ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) && - !(flag & ZCILOOK))) + if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_EXACT, tx); + dzp->z_id, name, MT_EXACT, tx); else error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_FIRST, tx); + dzp->z_id, name, MT_FIRST, tx); } else { error = zap_remove(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, tx); + dzp->z_id, name, tx); } return (error); } /* - * Unlink zp from dl, and mark zp for deletion if this was the last link. + * Unlink zp from dzp, and mark zp for deletion if this was the last link. * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. * If it's non-NULL, we use it to indicate whether the znode needs deletion, * and it's the caller's job to do it. */ int -zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, - boolean_t *unlinkedp) +zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag, boolean_t *unlinkedp) { - znode_t *dzp = dl->dl_dzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; vnode_t *vp = ZTOV(zp); int zp_is_dir = (vp->v_type == VDIR); @@ -824,22 +622,12 @@ int count = 0; int error; - dnlc_remove(ZTOV(dzp), dl->dl_name); + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (!(flag & ZRENAMING)) { - if (vn_vfswlock(vp)) /* prevent new mounts on zp */ - return (SET_ERROR(EBUSY)); - - if (vn_ismntpt(vp)) { /* don't remove mount point */ - vn_vfsunlock(vp); - return (SET_ERROR(EBUSY)); - } - - mutex_enter(&zp->z_lock); if (zp_is_dir && !zfs_dirempty(zp)) { - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); #ifdef illumos return (SET_ERROR(EEXIST)); #else @@ -852,10 +640,8 @@ * First try removing the name from the directory; if that * fails, return the error. */ - error = zfs_dropname(dl, zp, dzp, tx, flag); + error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) { - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); return (error); } @@ -882,16 +668,14 @@ NULL, &zp->z_links, sizeof (zp->z_links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); count = 0; - ASSERT(error == 0); - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); + ASSERT0(error); } else { - error = zfs_dropname(dl, zp, dzp, tx, flag); + ASSERT(zp->z_unlinked == 0); + error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) return (error); } - mutex_enter(&dzp->z_lock); dzp->z_size--; /* one dirent removed */ dzp->z_links -= zp_is_dir; /* ".." link from zp */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), @@ -906,8 +690,7 @@ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - mutex_exit(&dzp->z_lock); + ASSERT0(error); if (unlinkedp != NULL) *unlinkedp = unlinked; @@ -918,14 +701,12 @@ } /* - * Indicate whether the directory is empty. Works with or without z_lock - * held, but can only be consider a hint in the latter case. Returns true - * if only "." and ".." remain and there's no work in progress. + * Indicate whether the directory is empty. */ boolean_t zfs_dirempty(znode_t *dzp) { - return (dzp->z_size == 2 && dzp->z_dirlocks == 0); + return (dzp->z_size == 2); } int @@ -1019,23 +800,20 @@ { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; - zfs_dirlock_t *dl; vattr_t va; int error; top: - error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); + error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); if (error) return (error); if (xzp != NULL) { *xvpp = ZTOV(xzp); - zfs_dirent_unlock(dl); return (0); } if (!(flags & CREATE_XATTR_DIR)) { - zfs_dirent_unlock(dl); #ifdef illumos return (SET_ERROR(ENOENT)); #else @@ -1044,7 +822,6 @@ } if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - zfs_dirent_unlock(dl); return (SET_ERROR(EROFS)); } @@ -1064,7 +841,6 @@ zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); error = zfs_make_xattrdir(zp, &va, xvpp, cr); - zfs_dirent_unlock(dl); if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c @@ -124,7 +124,7 @@ zfsvfs_t *zfsvfs = zp->z_zfsvfs; xoptattr_t *xoap; - ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); if (zp->z_is_sa) { if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), @@ -158,7 +158,7 @@ zfsvfs_t *zfsvfs = zp->z_zfsvfs; xoptattr_t *xoap; - ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); if (zp->z_is_sa) VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), @@ -205,7 +205,6 @@ uint64_t crtime[2], mtime[2], ctime[2]; zfs_acl_phys_t znode_acl; char scanstamp[AV_SCANSTAMP_SZ]; - boolean_t drop_lock = B_FALSE; /* * No upgrade if ACL isn't cached @@ -217,20 +216,16 @@ return; /* - * If the z_lock is held and we aren't the owner - * the just return since we don't want to deadlock + * If the vnode lock is held and we aren't the owner + * then just return since we don't want to deadlock * trying to update the status of z_is_sa. This * file can then be upgraded at a later time. * * Otherwise, we know we are doing the * sa_update() that caused us to enter this function. */ - if (mutex_owner(&zp->z_lock) != curthread) { - if (mutex_tryenter(&zp->z_lock) == 0) + if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0) return; - else - drop_lock = B_TRUE; - } /* First do a bulk query of the attributes that aren't cached */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); @@ -311,8 +306,7 @@ zp->z_is_sa = B_TRUE; done: - if (drop_lock) - mutex_exit(&zp->z_lock); + VOP_UNLOCK(ZTOV(zp), 0); } void Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -956,6 +956,18 @@ else if (error != 0) return (error); + /* + * Only use the name cache if we are looking for a + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name (which is always the case on + * FreeBSD). + */ + zfsvfs->z_use_namecache = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); + return (0); } @@ -996,7 +1008,11 @@ mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); +#ifdef DIAGNOSTIC + rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); +#else rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); +#endif rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) @@ -2043,7 +2059,7 @@ ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { - VN_RELE(ZTOV(zp)); + vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) @@ -2144,7 +2160,7 @@ VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else { - VN_HOLD(*vpp); + vref(*vpp); } ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags); @@ -2167,7 +2183,7 @@ zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); - VN_RELE(ZTOV(zp)); + vrele(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -66,7 +66,6 @@ #include #include #include -#include #include #include #include @@ -147,7 +146,7 @@ * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: - * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) + * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify @@ -1433,26 +1432,81 @@ return (error); } -/* - * If vnode is for a device return a specfs vnode instead. - */ static int -specvp_check(vnode_t **vpp, cred_t *cr) +zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { - int error = 0; - - if (IS_DEVVP(*vpp)) { - struct vnode *svp; + int error; - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) - error = SET_ERROR(ENOSYS); - *vpp = svp; - } + *vpp = arg; + error = vn_lock(*vpp, lkflags); + if (error != 0) + vrele(*vpp); return (error); } +static int +zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) +{ + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error; + int ltype; + + ASSERT_VOP_LOCKED(dvp, __func__); +#ifdef DIAGNOSTIC + ASSERT(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); +#endif + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + ASSERT3P(dvp, ==, vp); + vref(dvp); + ltype = lkflags & LK_TYPE_MASK; + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* + * Relock for the "." case could leave us with + * reclaimed vnode. + */ + if (dvp->v_iflag & VI_DOOMED) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + /* + * Note that in this case, dvp is the child vnode, and we + * are looking up the parent vnode - exactly reverse from + * normal operation. Unlocking dvp requires some rather + * tricky unlock/relock dance to prevent mp from being freed; + * use vn_vget_ino_gen() which takes care of all that. + * + * XXX Note that there is a time window when both vnodes are + * unlocked. It is possible, although highly unlikely, that + * during that window the parent-child relationship between + * the vnodes may change, for example, get reversed. + * In that case we would have a wrong lock order for the vnodes. + * All other filesystems seem to ignore this problem, so we + * do the same here. + * A potential solution could be implemented as follows: + * - using LK_NOWAIT when locking the second vnode and retrying + * if necessary + * - checking that the parent-child relationship still holds + * after locking both vnodes and retrying if it doesn't + */ + error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); + return (error); + } else { + error = vn_lock(vp, lkflags); + if (error != 0) + vrele(vp); + return (error); + } +} /* * Lookup an entry in a directory, or an extended attribute directory. @@ -1465,8 +1519,6 @@ * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context - * direntflags - directory lookup flags - * realpnp - returned pathname. * * OUT: vpp - vnode of located entry, NULL if not found. * @@ -1481,46 +1533,17 @@ int nameiop, cred_t *cr, kthread_t *td, int flags) { znode_t *zdp = VTOZ(dvp); + znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; - int *direntflags = NULL; - void *realpnp = NULL; - - /* fast path */ - if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + /* fast path (should be redundant with vfs namecache) */ + if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } - - if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { - error = zfs_fastaccesschk_execute(zdp, cr); - if (!error) { - *vpp = dvp; - VN_HOLD(*vpp); - return (0); - } - return (error); - } else { - vnode_t *tvp = dnlc_lookup(dvp, nm); - - if (tvp) { - error = zfs_fastaccesschk_execute(zdp, cr); - if (error) { - VN_RELE(tvp); - return (error); - } - if (tvp == DNLC_NO_VNODE) { - VN_RELE(tvp); - return (SET_ERROR(ENOENT)); - } else { - *vpp = tvp; - return (specvp_check(vpp, cr)); - } - } - } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); @@ -1558,10 +1581,9 @@ /* * Do we have permission to get into attribute directory? */ - if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { - VN_RELE(*vpp); + vrele(*vpp); *vpp = NULL; } @@ -1569,15 +1591,9 @@ return (error); } - if (dvp->v_type != VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTDIR)); - } - /* * Check accessibility of directory. */ - if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); @@ -1589,9 +1605,56 @@ return (SET_ERROR(EILSEQ)); } - error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); - if (error == 0) - error = specvp_check(vpp, cr); + /* + * The loop is retry the lookup if the parent-child relationship + * changes during the dot-dot locking complexities. + */ + for (;;) { + uint64_t parent; + + error = zfs_dirlook(zdp, nm, &zp); + if (error == 0) + *vpp = ZTOV(zp); + + ZFS_EXIT(zfsvfs); + if (error != 0) + break; + + error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); + if (error != 0) { + /* + * If we've got a locking error, then the vnode + * got reclaimed because of a force unmount. + * We never enter doomed vnodes into the name cache. + */ + *vpp = NULL; + return (error); + } + + if ((cnp->cn_flags & ISDOTDOT) == 0) + break; + + ZFS_ENTER(zfsvfs); + if (zdp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + } else { + error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + } + if (error != 0) { + ZFS_EXIT(zfsvfs); + vput(ZTOV(zp)); + break; + } + if (zp->z_id == parent) { + ZFS_EXIT(zfsvfs); + break; + } + vput(ZTOV(zp)); + } + + if (error != 0) + *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { @@ -1610,42 +1673,20 @@ break; } } - if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { - int ltype = 0; - - if (cnp->cn_flags & ISDOTDOT) { - ltype = VOP_ISLOCKED(dvp); - VOP_UNLOCK(dvp, 0); - } - ZFS_EXIT(zfsvfs); - error = vn_lock(*vpp, cnp->cn_lkflags); - if (cnp->cn_flags & ISDOTDOT) - vn_lock(dvp, ltype | LK_RETRY); - if (error != 0) { - VN_RELE(*vpp); - *vpp = NULL; - return (error); - } - } else { - ZFS_EXIT(zfsvfs); - } -#ifdef FREEBSD_NAMECACHE - /* - * Insert name into cache (as non-existent) if appropriate. - */ - if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) - cache_enter(dvp, *vpp, cnp); - /* - * Insert name into cache if appropriate. - */ - if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { + /* Insert name into cache (as non-existent) if appropriate. */ + if (zfsvfs->z_use_namecache && + error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(dvp, NULL, cnp); + + /* Insert name into cache if appropriate. */ + if (zfsvfs->z_use_namecache && + error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } -#endif return (error); } @@ -1683,7 +1724,6 @@ zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; - zfs_dirlock_t *dl; dmu_tx_t *tx; int error; ksid_t *ksid; @@ -1691,10 +1731,9 @@ gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; - boolean_t have_acl = B_FALSE; - boolean_t waited = B_FALSE; void *vsecp = NULL; int flag = 0; + uint64_t txtype; /* * If we have an ephemeral id, ACL, or XVATTR then @@ -1731,182 +1770,89 @@ } } - getnewvnode_reserve(1); - -top: *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; - if (*name == '\0') { - /* - * Null component name refers to the directory itself. - */ - VN_HOLD(dvp); - zp = dzp; - dl = NULL; - error = 0; - } else { - /* possible VN_HOLD(zp) */ - int zflg = 0; - - if (flag & FIGNORECASE) - zflg |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL); - if (error) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - if (strcmp(name, "..") == 0) - error = SET_ERROR(EISDIR); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); } + ASSERT3P(zp, ==, NULL); - if (zp == NULL) { - uint64_t txtype; - - /* - * Create a new file object and update the directory - * to reference it. - */ - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - goto out; - } - - /* - * We only support the creation of regular files in - * extended attribute directories. - */ + /* + * Create a new file object and update the directory + * to reference it. + */ + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + goto out; + } - if ((dzp->z_pflags & ZFS_XATTR) && - (vap->va_type != VREG)) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EINVAL); - goto out; - } + /* + * We only support the creation of regular files in + * extended attribute directories. + */ - if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - have_acl = B_TRUE; + if ((dzp->z_pflags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = SET_ERROR(EINVAL); + goto out; + } - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } + if ((error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; - tx = dmu_tx_create(os); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); + getnewvnode_reserve(1); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + tx = dmu_tx_create(os); - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); - (void) zfs_link_create(dl, zp, tx, ZNEW); - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); - if (flag & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, acl_ids.z_fuidp, vap); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - } else { - int aflags = (flag & FAPPEND) ? V_APPEND : 0; - - if (have_acl) - zfs_acl_ids_free(&acl_ids); - have_acl = B_FALSE; + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - /* - * A directory entry already exists for this name. - */ - /* - * Can't truncate an existing file if in exclusive mode. - */ - if (excl == EXCL) { - error = SET_ERROR(EEXIST); - goto out; - } - /* - * Can't open a directory for writing. - */ - if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { - error = SET_ERROR(EISDIR); - goto out; - } - /* - * Verify requested access to file. - */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { - goto out; - } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); - mutex_enter(&dzp->z_lock); - dzp->z_seq++; - mutex_exit(&dzp->z_lock); + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); - /* - * Truncate regular files if requested. - */ - if ((ZTOV(zp)->v_type == VREG) && - (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { - /* we can't hold any locks when calling zfs_freesp() */ - zfs_dirent_unlock(dl); - dl = NULL; - error = zfs_freesp(zp, 0, 0, mode, TRUE); - if (error == 0) { - vnevent_create(ZTOV(zp), ct); - } - } - } -out: getnewvnode_drop_reserve(); - if (dl) - zfs_dirent_unlock(dl); - if (error) { - if (zp) - VN_RELE(ZTOV(zp)); - } else { +out: + if (error == 0) { *vpp = ZTOV(zp); - error = specvp_check(vpp, cr); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) @@ -1932,57 +1878,30 @@ * vp - ctime (if nlink > 0) */ -uint64_t null_xattr = 0; - /*ARGSUSED*/ static int -zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, - int flags) +zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { - znode_t *zp, *dzp = VTOZ(dvp); + znode_t *dzp = VTOZ(dvp); + znode_t *zp = VTOZ(vp); znode_t *xzp; - vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; - uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; - zfs_dirlock_t *dl; dmu_tx_t *tx; - boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; - pathname_t *realnmp = NULL; - pathname_t realnm; int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; + zp = VTOZ(vp); - if (flags & FIGNORECASE) { - zflg |= ZCILOOK; - pn_alloc(&realnm); - realnmp = &realnm; - } - -top: xattr_obj = 0; xzp = NULL; - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, realnmp)) { - if (realnmp) - pn_free(realnmp); - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; @@ -1998,14 +1917,15 @@ vnevent_remove(vp, dvp, name, ct); - if (realnmp) - dnlc_remove(dvp, realnmp->pn_buf); - else - dnlc_remove(dvp, name); + obj = zp->z_id; - VI_LOCK(vp); - may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); - VI_UNLOCK(vp); + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + } /* * We may delete the znode now, or we may put it in the unlinked set; @@ -2013,35 +1933,17 @@ * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ - obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); - if (may_delete_now) { - toobig = - zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; - /* if the file is too big, only hold_free a token amount */ - dmu_tx_hold_free(tx, zp->z_id, 0, - (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); - } - /* are there any extended attributes? */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error == 0 && xattr_obj) { - error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT0(error); + if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } - mutex_enter(&zp->z_lock); - if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - mutex_exit(&zp->z_lock); - /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); @@ -2050,20 +1952,8 @@ */ dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (xzp) - VN_RELE(ZTOV(xzp)); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - if (realnmp) - pn_free(realnmp); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); @@ -2072,7 +1962,7 @@ /* * Remove the directory entry. */ - error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); @@ -2080,76 +1970,18 @@ } if (unlinked) { - /* - * Hold z_lock so that we can make sure that the ACL obj - * hasn't changed. Could have been deleted due to - * zfs_sa_upgrade(). - */ - mutex_enter(&zp->z_lock); - VI_LOCK(vp); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); - delete_now = may_delete_now && !toobig && - vp->v_count == 1 && !vn_has_cached_data(vp) && - xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == - acl_obj; - VI_UNLOCK(vp); - } - - if (delete_now) { -#ifdef __FreeBSD__ - panic("zfs_remove: delete_now branch taken"); -#endif - if (xattr_obj_unlinked) { - ASSERT3U(xzp->z_links, ==, 2); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = 1; - xzp->z_links = 0; - error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &xzp->z_links, sizeof (xzp->z_links), tx); - ASSERT3U(error, ==, 0); - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - - if (zp->z_is_sa) - error = sa_remove(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), tx); - else - error = sa_update(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), &null_xattr, - sizeof (uint64_t), tx); - ASSERT0(error); - } - VI_LOCK(vp); - vp->v_count--; - ASSERT0(vp->v_count); - VI_UNLOCK(vp); - mutex_exit(&zp->z_lock); - zfs_znode_delete(zp, tx); - } else if (unlinked) { - mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); -#ifdef __FreeBSD__ vp->v_vflag |= VV_NOSYNC; -#endif } txtype = TX_REMOVE; - if (flags & FIGNORECASE) - txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: - if (realnmp) - pn_free(realnmp); - zfs_dirent_unlock(dl); - - if (!delete_now) - VN_RELE(vp); if (xzp) - VN_RELE(ZTOV(xzp)); + vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -2180,23 +2012,19 @@ */ /*ARGSUSED*/ static int -zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, - caller_context_t *ct, int flags, vsecattr_t *vsecp) +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; - int zf = ZNEW; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; - boolean_t waited = B_FALSE; ASSERT(vap->va_type == VDIR); @@ -2211,7 +2039,7 @@ else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || (vap->va_mask & AT_XVATTR) || + ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); @@ -2229,8 +2057,6 @@ ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } - if (flags & FIGNORECASE) - zf |= ZCILOOK; if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, @@ -2241,13 +2067,11 @@ } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - vsecp, &acl_ids)) != 0) { + NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } - getnewvnode_reserve(1); - /* * First make sure the new directory doesn't exist. * @@ -2255,29 +2079,23 @@ * EACCES instead of EEXIST which can cause some applications * to fail. */ -top: *vpp = NULL; - if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, - NULL, NULL)) { + if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { zfs_acl_ids_free(&acl_ids); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } + ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } @@ -2285,6 +2103,7 @@ /* * Add a new entry to the directory. */ + getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); @@ -2299,15 +2118,8 @@ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); @@ -2326,14 +2138,12 @@ /* * Now put new name in parent dir. */ - (void) zfs_link_create(dl, zp, tx, ZNEW); + (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); - txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); @@ -2342,8 +2152,6 @@ getnewvnode_drop_reserve(); - zfs_dirent_unlock(dl); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -2370,39 +2178,20 @@ */ /*ARGSUSED*/ static int -zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, - caller_context_t *ct, int flags) +zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); - znode_t *zp; - vnode_t *vp; + znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; - if (flags & FIGNORECASE) - zflg |= ZCILOOK; -top: - zp = NULL; - - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; @@ -2413,25 +2202,8 @@ goto out; } - if (vp == cwd) { - error = SET_ERROR(EINVAL); - goto out; - } - vnevent_rmdir(vp, dvp, name, ct); - /* - * Grab a lock on the directory to make sure that noone is - * trying to add (or lookup) entries while we are removing it. - */ - rw_enter(&zp->z_name_lock, RW_WRITER); - - /* - * Grab a lock on the parent pointer to make sure we play well - * with the treewalk and directory rename code. - */ - rw_enter(&zp->z_parent_lock, RW_WRITER); - tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); @@ -2439,48 +2211,26 @@ zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } -#ifdef FREEBSD_NAMECACHE cache_purge(dvp); -#endif - error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; - if (flags & FIGNORECASE) - txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); -#ifdef FREEBSD_NAMECACHE cache_purge(vp); -#endif out: - zfs_dirent_unlock(dl); - - VN_RELE(vp); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -2705,10 +2455,10 @@ if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { - VN_RELE(ZTOV(ezp)); + vrele(ZTOV(ezp)); goto skip_entry; } - VN_RELE(ZTOV(ezp)); + vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) @@ -2905,7 +2655,6 @@ * than to determine whether we were asked the question. */ - mutex_enter(&zp->z_lock); vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef illumos @@ -3042,7 +2791,6 @@ ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); - mutex_exit(&zp->z_lock); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; @@ -3178,7 +2926,6 @@ } } -top: attrzp = NULL; aclp = NULL; @@ -3267,7 +3014,6 @@ } } - mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { @@ -3341,7 +3087,6 @@ } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } @@ -3353,8 +3098,6 @@ } } - mutex_exit(&zp->z_lock); - if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, @@ -3429,7 +3172,7 @@ if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) - VN_RELE(ZTOV(attrzp)); + vrele(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } @@ -3441,7 +3184,7 @@ if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) - VN_RELE(ZTOV(attrzp)); + vrele(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } @@ -3463,7 +3206,6 @@ if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; - mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format @@ -3484,7 +3226,6 @@ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } - mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && @@ -3517,10 +3258,8 @@ * updated as a side-effect of calling this function. */ - if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); @@ -3528,7 +3267,6 @@ if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); - mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); @@ -3662,14 +3400,12 @@ if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - mutex_exit(&zp->z_lock); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); - mutex_exit(&attrzp->z_lock); } out: if (err == 0 && attrzp) { @@ -3679,7 +3415,7 @@ } if (attrzp) - VN_RELE(ZTOV(attrzp)); + vrele(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); @@ -3691,8 +3427,6 @@ if (err) { dmu_tx_abort(tx); - if (err == ERESTART) - goto top; } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); @@ -3706,101 +3440,236 @@ return (err); } -typedef struct zfs_zlock { - krwlock_t *zl_rwlock; /* lock we acquired */ - znode_t *zl_znode; /* znode we held */ - struct zfs_zlock *zl_next; /* next in list */ -} zfs_zlock_t; - /* - * Drop locks and release vnodes that were held by zfs_rename_lock(). + * We acquire all but fdvp locks using non-blocking acquisitions. If we + * fail to acquire any lock in the path we will drop all held locks, + * acquire the new lock in a blocking fashion, and then release it and + * restart the rename. This acquire/release step ensures that we do not + * spin on a lock waiting for release. On error release all vnode locks + * and decrement references the way tmpfs_rename() would do. */ -static void -zfs_rename_unlock(zfs_zlock_t **zlpp) +static int +zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, + struct vnode *tdvp, struct vnode **tvpp, + const struct componentname *scnp, const struct componentname *tcnp) { - zfs_zlock_t *zl; + zfsvfs_t *zfsvfs; + struct vnode *nvp, *svp, *tvp; + znode_t *sdzp, *tdzp, *szp, *tzp; + const char *snm = scnp->cn_nameptr; + const char *tnm = tcnp->cn_nameptr; + int error; + + VOP_UNLOCK(tdvp, 0); + if (*tvpp != NULL && *tvpp != tdvp) + VOP_UNLOCK(*tvpp, 0); + +relock: + error = vn_lock(sdvp, LK_EXCLUSIVE); + if (error) + goto out; + sdzp = VTOZ(sdvp); - while ((zl = *zlpp) != NULL) { - if (zl->zl_znode != NULL) - VN_RELE(ZTOV(zl->zl_znode)); - rw_exit(zl->zl_rwlock); - *zlpp = zl->zl_next; - kmem_free(zl, sizeof (*zl)); + error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + if (error != EBUSY) + goto out; + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto out; + VOP_UNLOCK(tdvp, 0); + goto relock; } -} + tdzp = VTOZ(tdvp); -/* - * Search back through the directory tree, using the ".." entries. - * Lock each directory in the chain to prevent concurrent renames. - * Fail any attempt to move a directory into one of its own descendants. - * XXX - z_parent_lock can overlap with map or grow locks - */ -static int -zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - znode_t *zp = tdzp; - uint64_t rootid = zp->z_zfsvfs->z_root; - uint64_t oidp = zp->z_id; - krwlock_t *rwlp = &szp->z_parent_lock; - krw_t rw = RW_WRITER; + /* + * Before using sdzp and tdzp we must ensure that they are live. + * As a porting legacy from illumos we have two things to worry + * about. One is typical for FreeBSD and it is that the vnode is + * not reclaimed (doomed). The other is that the znode is live. + * The current code can invalidate the znode without acquiring the + * corresponding vnode lock if the object represented by the znode + * and vnode is no longer valid after a rollback or receive operation. + * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock + * that protects the znodes from the invalidation. + */ + zfsvfs = sdzp->z_zfsvfs; + ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); + ZFS_ENTER(zfsvfs); /* - * First pass write-locks szp and compares to zp->z_id. - * Later passes read-lock zp and compare to zp->z_parent. + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. */ - do { - if (!rw_tryenter(rwlp, rw)) { - /* - * Another thread is renaming in this path. - * Note that if we are a WRITER, we don't have any - * parent_locks held yet. - */ - if (rw == RW_READER && zp->z_id > szp->z_id) { - /* - * Drop our locks and restart - */ - zfs_rename_unlock(&zl); - *zlpp = NULL; - zp = tdzp; - oidp = zp->z_id; - rwlp = &szp->z_parent_lock; - rw = RW_WRITER; - continue; - } else { - /* - * Wait for other thread to drop its locks - */ - rw_enter(rwlp, rw); + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + error = SET_ERROR(EIO); + goto out; + } + + /* + * Re-resolve svp to be certain it still exists and fetch the + * correct vnode. + */ + error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); + if (error != 0) { + /* Source entry invalid or not there. */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + if ((scnp->cn_flags & ISDOTDOT) != 0 || + (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) + error = SET_ERROR(EINVAL); + goto out; + } + svp = ZTOV(szp); + + /* + * Re-resolve tvp, if it disappeared we just carry on. + */ + error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); + if (error != 0) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + vrele(svp); + if ((tcnp->cn_flags & ISDOTDOT) != 0) + error = SET_ERROR(EINVAL); + goto out; + } + if (tzp != NULL) + tvp = ZTOV(tzp); + else + tvp = NULL; + + /* + * At present the vnode locks must be acquired before z_teardown_lock, + * although it would be more logical to use the opposite order. + */ + ZFS_EXIT(zfsvfs); + + /* + * Now try acquire locks on svp and tvp. + */ + nvp = svp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + if (tvp != NULL) + vrele(tvp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + VOP_UNLOCK(nvp, 0); + /* + * Concurrent rename race. + * XXX ? + */ + if (nvp == tdvp) { + vrele(nvp); + error = SET_ERROR(EINVAL); + goto out; + } + vrele(*svpp); + *svpp = nvp; + goto relock; + } + vrele(*svpp); + *svpp = nvp; + + if (*tvpp != NULL) + vrele(*tvpp); + *tvpp = NULL; + if (tvp != NULL) { + nvp = tvp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(*svpp, 0); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; } + vput(nvp); + goto relock; } + *tvpp = nvp; + } - zl = kmem_alloc(sizeof (*zl), KM_SLEEP); - zl->zl_rwlock = rwlp; - zl->zl_znode = NULL; - zl->zl_next = *zlpp; - *zlpp = zl; + return (0); - if (oidp == szp->z_id) /* We're a descendant of szp */ - return (SET_ERROR(EINVAL)); +out: + return (error); +} - if (oidp == rootid) /* We've hit the top */ - return (0); +/* + * Note that we must use VRELE_ASYNC in this function as it walks + * up the directory tree and vrele may need to acquire an exclusive + * lock if a last reference to a vnode is dropped. + */ +static int +zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) +{ + zfsvfs_t *zfsvfs; + znode_t *zp, *zp1; + uint64_t parent; + int error; - if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); - if (error) - return (error); - zl->zl_znode = zp; + zfsvfs = tdzp->z_zfsvfs; + if (tdzp == szp) + return (SET_ERROR(EINVAL)); + if (tdzp == sdzp) + return (0); + if (tdzp->z_id == zfsvfs->z_root) + return (0); + zp = tdzp; + for (;;) { + ASSERT(!zp->z_unlinked); + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + break; + + if (parent == szp->z_id) { + error = SET_ERROR(EINVAL); + break; } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), - &oidp, sizeof (oidp)); - rwlp = &zp->z_parent_lock; - rw = RW_READER; + if (parent == zfsvfs->z_root) + break; + if (parent == sdzp->z_id) + break; + + error = zfs_zget(zfsvfs, parent, &zp1); + if (error != 0) + break; - } while (zp->z_id != sdzp->z_id); + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + zp = zp1; + } - return (0); + if (error == ENOTDIR) + panic("checkpath: .. not a directory\n"); + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + return (error); } /* @@ -3822,187 +3691,93 @@ */ /*ARGSUSED*/ static int -zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, - caller_context_t *ct, int flags) +zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, + vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, + cred_t *cr) { - znode_t *tdzp, *sdzp, *szp, *tzp; - zfsvfs_t *zfsvfs; - zilog_t *zilog; - vnode_t *realvp; - zfs_dirlock_t *sdl, *tdl; + zfsvfs_t *zfsvfs; + znode_t *sdzp, *tdzp, *szp, *tzp; + zilog_t *zilog = NULL; dmu_tx_t *tx; - zfs_zlock_t *zl; - int cmp, serr, terr; + char *snm = scnp->cn_nameptr; + char *tnm = tcnp->cn_nameptr; int error = 0; - int zflg = 0; - boolean_t waited = B_FALSE; - tdzp = VTOZ(tdvp); - ZFS_VERIFY_ZP(tdzp); - zfsvfs = tdzp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - zilog = zfsvfs->z_log; - sdzp = VTOZ(sdvp); + /* Reject renames across filesystems. */ + if ((*svpp)->v_mount != tdvp->v_mount || + ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { + error = SET_ERROR(EXDEV); + goto out; + } + + if (zfsctl_is_node(tdvp)) { + error = SET_ERROR(EXDEV); + goto out; + } /* - * In case sdzp is not valid, let's be sure to exit from the right - * zfsvfs_t. + * Lock all four vnodes to ensure safety and semantics of renaming. */ - if (sdzp->z_sa_hdl == NULL) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EIO)); + error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); + if (error != 0) { + /* no vnodes are locked in the case of error here */ + return (error); } + tdzp = VTOZ(tdvp); + sdzp = VTOZ(sdvp); + zfsvfs = tdzp->z_zfsvfs; + zilog = zfsvfs->z_log; + /* - * We check z_zfsvfs rather than v_vfsp here, because snapshots and the - * ctldir appear to have the same v_vfsp. + * After we re-enter ZFS_ENTER() we will have to revalidate all + * znodes involved. */ - if (sdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } + ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); + error = SET_ERROR(EILSEQ); + goto unlockout; } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - -top: - szp = NULL; - tzp = NULL; - zl = NULL; - - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. - * See the comment in zfs_link() for why this is considered bad. - */ - if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); + /* If source and target are the same file, there is nothing to do. */ + if ((*svpp) == (*tvpp)) { + error = 0; + goto unlockout; } - /* - * Lock source and target directory entries. To prevent deadlock, - * a lock ordering must be defined. We lock the directory with - * the smallest object id first, or if it's a tie, the one with - * the lexically first name. - */ - if (sdzp->z_id < tdzp->z_id) { - cmp = -1; - } else if (sdzp->z_id > tdzp->z_id) { - cmp = 1; - } else { - /* - * First compare the two name arguments without - * considering any case folding. - */ - int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); - - cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); - ASSERT(error == 0 || !zfsvfs->z_utf8); - if (cmp == 0) { - /* - * POSIX: "If the old argument and the new argument - * both refer to links to the same existing file, - * the rename() function shall return successfully - * and perform no other action." - */ - ZFS_EXIT(zfsvfs); - return (0); - } - /* - * If the file system is case-folding, then we may - * have some more checking to do. A case-folding file - * system is either supporting mixed case sensitivity - * access or is completely case-insensitive. Note - * that the file system is always case preserving. - * - * In mixed sensitivity mode case sensitive behavior - * is the default. FIGNORECASE must be used to - * explicitly request case insensitive behavior. - * - * If the source and target names provided differ only - * by case (e.g., a request to rename 'tim' to 'Tim'), - * we will treat this as a special case in the - * case-insensitive mode: as long as the source name - * is an exact match, we will allow this to proceed as - * a name-change request. - */ - if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - (zfsvfs->z_case == ZFS_CASE_MIXED && - flags & FIGNORECASE)) && - u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, - &error) == 0) { - /* - * case preserving rename request, require exact - * name matches - */ - zflg |= ZCIEXACT; - zflg &= ~ZCILOOK; - } + if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || + ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && + (*tvpp)->v_mountedhere != NULL)) { + error = SET_ERROR(EXDEV); + goto unlockout; } /* - * If the source and destination directories are the same, we should - * grab the z_name_lock of that directory only once. + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. */ - if (sdzp == tdzp) { - zflg |= ZHAVELOCK; - rw_enter(&sdzp->z_name_lock, RW_READER); - } - - if (cmp < 0) { - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, - ZEXISTS | zflg, NULL, NULL); - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); - } else { - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, zflg, NULL, NULL); - serr = zfs_dirent_lock(&sdl, - sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, - NULL, NULL); + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + goto unlockout; } - if (serr) { - /* - * Source entry invalid or not there. - */ - if (!terr) { - zfs_dirent_unlock(tdl); - if (tzp) - VN_RELE(ZTOV(tzp)); - } - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - /* - * FreeBSD: In OpenSolaris they only check if rename source is - * ".." here, because "." is handled in their lookup. This is - * not the case for FreeBSD, so we check for "." explicitly. - */ - if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) - serr = SET_ERROR(EINVAL); - ZFS_EXIT(zfsvfs); - return (serr); + szp = VTOZ(*svpp); + tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); + if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { + error = SET_ERROR(EIO); + goto unlockout; } - if (terr) { - zfs_dirent_unlock(sdl); - VN_RELE(ZTOV(szp)); - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (strcmp(tnm, "..") == 0) - terr = SET_ERROR(EINVAL); - ZFS_EXIT(zfsvfs); - return (terr); + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + error = SET_ERROR(EINVAL); + goto unlockout; } /* @@ -4011,17 +3786,26 @@ * Note that if target and source are the same, this can be * done in a single check. */ - if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) - goto out; + goto unlockout; + + if ((*svpp)->v_type == VDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || + sdzp == szp || + (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { + error = EINVAL; + goto unlockout; + } - if (ZTOV(szp)->v_type == VDIR) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ - if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) - goto out; + if (error = zfs_rename_check(szp, sdzp, tdzp)) + goto unlockout; } /* @@ -4031,31 +3815,26 @@ /* * Source and target must be the same type. */ - if (ZTOV(szp)->v_type == VDIR) { - if (ZTOV(tzp)->v_type != VDIR) { + if ((*svpp)->v_type == VDIR) { + if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); - goto out; + goto unlockout; + } else { + cache_purge(tdvp); + if (sdvp != tdvp) + cache_purge(sdvp); } } else { - if (ZTOV(tzp)->v_type == VDIR) { + if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); - goto out; + goto unlockout; } } - /* - * POSIX dictates that when the source and target - * entries refer to the same file object, rename - * must do nothing and exit without error. - */ - if (szp->z_id == tzp->z_id) { - error = 0; - goto out; - } } - vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); + vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) - vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same @@ -4081,35 +3860,18 @@ zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - if (zl != NULL) - zfs_rename_unlock(&zl); - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); + goto unlockout; } + if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); + error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; @@ -4117,17 +3879,16 @@ (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, + NULL); if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); + zfs_log_rename(zilog, tx, TX_RENAME, sdzp, + snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ - vn_renamepath(tdvp, ZTOV(szp), tnm, - strlen(tnm)); + vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created @@ -4141,42 +3902,33 @@ * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ - VERIFY3U(zfs_link_destroy(tdl, szp, tx, + VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } -#ifdef FREEBSD_NAMECACHE if (error == 0) { - cache_purge(sdvp); - cache_purge(tdvp); - cache_purge(ZTOV(szp)); - if (tzp) - cache_purge(ZTOV(tzp)); + cache_purge(*svpp); + if (*tvpp != NULL) + cache_purge(*tvpp); + cache_purge_negative(tdvp); } -#endif } dmu_tx_commit(tx); -out: - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); +unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(*svpp, 0); + VOP_UNLOCK(sdvp, 0); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) +out: /* original two vnodes are locked */ + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS && error == 0) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); - + if (*tvpp != NULL) + VOP_UNLOCK(*tvpp, 0); + if (tdvp != *tvpp) + VOP_UNLOCK(tdvp, 0); return (error); } @@ -4201,17 +3953,14 @@ cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); - zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; - int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; - boolean_t waited = B_FALSE; int flags = 0; ASSERT(vap->va_type == VLNK); @@ -4225,8 +3974,6 @@ ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); @@ -4239,35 +3986,29 @@ return (error); } - getnewvnode_reserve(1); - -top: /* * Attempt to lock directory; fail if entry already exists. */ - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } + + getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); @@ -4281,15 +4022,8 @@ } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); @@ -4306,13 +4040,11 @@ if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); - mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), @@ -4320,10 +4052,8 @@ /* * Insert the new object into the directory. */ - (void) zfs_link_create(dl, zp, tx, ZNEW); + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); - if (flags & FIGNORECASE) - txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); @@ -4333,8 +4063,6 @@ getnewvnode_drop_reserve(); - zfs_dirent_unlock(dl); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -4369,13 +4097,11 @@ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); - mutex_exit(&zp->z_lock); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); @@ -4407,14 +4133,10 @@ znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - zfs_dirlock_t *dl; dmu_tx_t *tx; - vnode_t *realvp; int error; - int zf = ZNEW; uint64_t parent; uid_t owner; - boolean_t waited = B_FALSE; ASSERT(tdvp->v_type == VDIR); @@ -4422,9 +4144,6 @@ ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; - if (VOP_REALVP(svp, &realvp, ct) == 0) - svp = realvp; - /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. @@ -4442,819 +4161,135 @@ return (SET_ERROR(EPERM)); } - /* - * We check z_zfsvfs rather than v_vfsp here, because snapshots and the - * ctldir appear to have the same v_vfsp. - */ - if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); - return (error); - } - if (parent == zfsvfs->z_shares_dir) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (zfsvfs->z_utf8 && u8_validate(name, - strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - /* - * We do not support links between attributes and non-attributes - * because of the potential security risk of creating links - * into "normal" file space in order to circumvent restrictions - * imposed in attribute space. - */ - if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - - owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); - if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - -top: - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - zfs_sa_upgrade_txholds(tx, szp); - zfs_sa_upgrade_txholds(tx, dzp); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - error = zfs_link_create(dl, szp, tx, 0); - - if (error == 0) { - uint64_t txtype = TX_LINK; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_link(zilog, tx, txtype, dzp, szp, name); - } - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (error == 0) { - vnevent_link(svp, ct); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -#ifdef illumos -/* - * zfs_null_putapage() is used when the file system has been force - * unmounted. It just drops the pages. - */ -/* ARGSUSED */ -static int -zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, - size_t *lenp, int flags, cred_t *cr) -{ - pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); - return (0); -} - -/* - * Push a page out to disk, klustering if possible. - * - * IN: vp - file to push page to. - * pp - page to push. - * flags - additional flags. - * cr - credentials of caller. - * - * OUT: offp - start of range pushed. - * lenp - len of range pushed. - * - * RETURN: 0 on success, error code on failure. - * - * NOTE: callers must have locked the page to be pushed. On - * exit, the page (and all other pages in the kluster) must be - * unlocked. - */ -/* ARGSUSED */ -static int -zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, - size_t *lenp, int flags, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_tx_t *tx; - u_offset_t off, koff; - size_t len, klen; - int err; - - off = pp->p_offset; - len = PAGESIZE; - /* - * If our blocksize is bigger than the page size, try to kluster - * multiple pages so that we write a full block (thus avoiding - * a read-modify-write). - */ - if (off < zp->z_size && zp->z_blksz > PAGESIZE) { - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; - ASSERT(koff <= zp->z_size); - if (koff + klen > zp->z_size) - klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); - pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); - } - ASSERT3U(btop(len), ==, btopr(len)); - - /* - * Can't push pages past end-of-file. - */ - if (off >= zp->z_size) { - /* ignore all pages */ - err = 0; - goto out; - } else if (off + len > zp->z_size) { - int npages = btopr(zp->z_size - off); - page_t *trunc; - - page_list_break(&pp, &trunc, npages); - /* ignore pages past end of file */ - if (trunc) - pvn_write_done(trunc, flags); - len = zp->z_size - off; - } - - if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || - zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { - err = SET_ERROR(EDQUOT); - goto out; - } - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, zp->z_id, off, len); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - goto out; - } - - if (zp->z_blksz <= PAGESIZE) { - caddr_t va = zfs_map_page(pp, S_READ); - ASSERT3U(len, <=, PAGESIZE); - dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); - zfs_unmap_page(pp, va); - } else { - err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); - } - - if (err == 0) { - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int count = 0; - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, - B_TRUE); - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); - } - dmu_tx_commit(tx); - -out: - pvn_write_done(pp, (err ? B_ERROR : 0) | flags); - if (offp) - *offp = off; - if (lenp) - *lenp = len; - - return (err); -} - -/* - * Copy the portion of the file indicated from pages into the file. - * The pages are stored in a page list attached to the files vnode. - * - * IN: vp - vnode of file to push page data to. - * off - position in file to put data. - * len - amount of data to write. - * flags - flags to control the operation. - * cr - credentials of caller. - * ct - caller context. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp; - size_t io_len; - u_offset_t io_off; - uint_t blksz; - rl_t *rl; - int error = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * Align this request to the file block size in case we kluster. - * XXX - this can result in pretty aggresive locking, which can - * impact simultanious read/write access. One option might be - * to break up long requests (len == 0) into block-by-block - * operations to get narrower locking. - */ - blksz = zp->z_blksz; - if (ISP2(blksz)) - io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); - else - io_off = 0; - if (len > 0 && ISP2(blksz)) - io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); - else - io_len = 0; - - if (io_len == 0) { - /* - * Search the entire vp list for pages >= io_off. - */ - rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); - error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); - goto out; - } - rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); - - if (off > zp->z_size) { - /* past end of file */ - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (0); - } - - len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); - - for (off = io_off; io_off < off + len; io_off += io_len) { - if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { - pp = page_lookup(vp, io_off, - (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); - } else { - pp = page_lookup_nowait(vp, io_off, - (flags & B_FREE) ? SE_EXCL : SE_SHARED); - } - - if (pp != NULL && pvn_getdirty(pp, flags)) { - int err; - - /* - * Found a dirty page to push - */ - err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); - if (err) - error = err; - } else { - io_len = PAGESIZE; - } - } -out: - zfs_range_unlock(rl); - if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - return (error); -} -#endif /* illumos */ - -/*ARGSUSED*/ -void -zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - if (zp->z_sa_hdl == NULL) { - /* - * The fs has been unmounted, or we did a - * suspend/resume and this file no longer exists. - */ - rw_exit(&zfsvfs->z_teardown_inactive_lock); - vrecycle(vp); - return; - } - - mutex_enter(&zp->z_lock); - if (zp->z_unlinked) { - /* - * Fast path to recycle a vnode of a removed file. - */ - mutex_exit(&zp->z_lock); - rw_exit(&zfsvfs->z_teardown_inactive_lock); - vrecycle(vp); - return; - } - mutex_exit(&zp->z_lock); - - if (zp->z_atime_dirty && zp->z_unlinked == 0) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - mutex_enter(&zp->z_lock); - (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), - (void *)&zp->z_atime, sizeof (zp->z_atime), tx); - zp->z_atime_dirty = 0; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); - } - } - rw_exit(&zfsvfs->z_teardown_inactive_lock); -} - -#ifdef illumos -/* - * Bounds-check the seek operation. - * - * IN: vp - vnode seeking within - * ooff - old file offset - * noffp - pointer to new file offset - * ct - caller context - * - * RETURN: 0 on success, EINVAL if new offset invalid. - */ -/* ARGSUSED */ -static int -zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, - caller_context_t *ct) -{ - if (vp->v_type == VDIR) - return (0); - return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); -} - -/* - * Pre-filter the generic locking function to trap attempts to place - * a mandatory lock on a memory mapped file. - */ -static int -zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, - flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * We are following the UFS semantics with respect to mapcnt - * here: If we see that the file is mapped already, then we will - * return an error, but we don't worry about races between this - * function and zfs_map(). - */ - if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EAGAIN)); - } - ZFS_EXIT(zfsvfs); - return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); -} - -/* - * If we can't find a page in the cache, we will create a new page - * and fill it with file data. For efficiency, we may try to fill - * multiple pages at once (klustering) to fill up the supplied page - * list. Note that the pages to be filled are held with an exclusive - * lock to prevent access by other threads while they are being filled. - */ -static int -zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, - caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) -{ - znode_t *zp = VTOZ(vp); - page_t *pp, *cur_pp; - objset_t *os = zp->z_zfsvfs->z_os; - u_offset_t io_off, total; - size_t io_len; - int err; - - if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { - /* - * We only have a single page, don't bother klustering - */ - io_off = off; - io_len = PAGESIZE; - pp = page_create_va(vp, io_off, io_len, - PG_EXCL | PG_WAIT, seg, addr); - } else { - /* - * Try to find enough pages to fill the page list - */ - pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, off, plsz, 0); - } - if (pp == NULL) { - /* - * The page already exists, nothing to do here. - */ - *pl = NULL; - return (0); - } - - /* - * Fill the pages in the kluster. - */ - cur_pp = pp; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; - - ASSERT3U(io_off, ==, cur_pp->p_offset); - va = zfs_map_page(cur_pp, S_WRITE); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - zfs_unmap_page(cur_pp, va); - if (err) { - /* On error, toss the entire kluster */ - pvn_read_done(pp, B_ERROR); - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } - cur_pp = cur_pp->p_next; - } - - /* - * Fill in the page list array from the kluster starting - * from the desired offset `off'. - * NOTE: the page list will always be null terminated. - */ - pvn_plist_init(pp, pl, plsz, off, io_len, rw); - ASSERT(pl == NULL || (*pl)->p_offset == off); - - return (0); -} - -/* - * Return pointers to the pages for the file region [off, off + len] - * in the pl array. If plsz is greater than len, this function may - * also return page pointers from after the specified region - * (i.e. the region [off, off + plsz]). These additional pages are - * only returned if they are already in the cache, or were created as - * part of a klustered read. - * - * IN: vp - vnode of file to get data from. - * off - position in file to get data from. - * len - amount of data to retrieve. - * plsz - length of provided page list. - * seg - segment to obtain pages for. - * addr - virtual address of fault. - * rw - mode of created pages. - * cr - credentials of caller. - * ct - caller context. - * - * OUT: protp - protection mode of created pages. - * pl - list of pages created. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -static int -zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, - page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, - enum seg_rw rw, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t **pl0 = pl; - int err = 0; - - /* we do our own caching, faultahead is unnecessary */ - if (pl == NULL) - return (0); - else if (len > plsz) - len = plsz; - else - len = P2ROUNDUP(len, PAGESIZE); - ASSERT(plsz >= len); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (protp) - *protp = PROT_ALL; - - /* - * Loop through the requested range [off, off + len) looking - * for pages. If we don't find a page, we will need to create - * a new page and fill it with data from the file. - */ - while (len > 0) { - if (*pl = page_lookup(vp, off, SE_SHARED)) - *(pl+1) = NULL; - else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) - goto out; - while (*pl) { - ASSERT3U((*pl)->p_offset, ==, off); - off += PAGESIZE; - addr += PAGESIZE; - if (len > 0) { - ASSERT3U(len, >=, PAGESIZE); - len -= PAGESIZE; - } - ASSERT3U(plsz, >=, PAGESIZE); - plsz -= PAGESIZE; - pl++; - } - } - - /* - * Fill out the page array with any pages already in the cache. - */ - while (plsz > 0 && - (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { - off += PAGESIZE; - plsz -= PAGESIZE; - } -out: - if (err) { - /* - * Release any pages we have previously locked. - */ - while (pl > pl0) - page_unlock(*--pl); - } else { - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - } - - *pl = NULL; - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Request a memory map for a section of a file. This code interacts - * with common code and the VM system as follows: - * - * - common code calls mmap(), which ends up in smmap_common() - * - this calls VOP_MAP(), which takes you into (say) zfs - * - zfs_map() calls as_map(), passing segvn_create() as the callback - * - segvn_create() creates the new segment and calls VOP_ADDMAP() - * - zfs_addmap() updates z_mapcnt - */ -/*ARGSUSED*/ -static int -zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, - size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - segvn_crargs_t vn_a; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((prot & PROT_WRITE) && (zp->z_pflags & - (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if ((prot & (PROT_READ | PROT_EXEC)) && - (zp->z_pflags & ZFS_AV_QUARANTINED)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - if (vp->v_flag & VNOMAP) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOSYS)); - } - - if (off < 0 || len > MAXOFFSET_T - off) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENXIO)); - } - - if (vp->v_type != VREG) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENODEV)); - } - - /* - * If file is locked, disallow mapping. - */ - if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EAGAIN)); - } - - as_rangelock(as); - error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); - if (error != 0) { - as_rangeunlock(as); - ZFS_EXIT(zfsvfs); - return (error); - } - - vn_a.vp = vp; - vn_a.offset = (u_offset_t)off; - vn_a.type = flags & MAP_TYPE; - vn_a.prot = prot; - vn_a.maxprot = maxprot; - vn_a.cred = cr; - vn_a.amp = NULL; - vn_a.flags = flags & ~MAP_TYPE; - vn_a.szc = 0; - vn_a.lgrp_mem_policy_flags = 0; - - error = as_map(as, *addrp, len, segvn_create, &vn_a); - - as_rangeunlock(as); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* ARGSUSED */ -static int -zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, - size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - uint64_t pages = btopr(len); - - atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); - return (0); -} - -/* - * The reason we push dirty pages as part of zfs_delmap() is so that we get a - * more accurate mtime for the associated file. Since we don't have a way of - * detecting when the data was actually modified, we have to resort to - * heuristics. If an explicit msync() is done, then we mark the mtime when the - * last page is pushed. The problem occurs when the msync() call is omitted, - * which by far the most common case: - * - * open() - * mmap() - * - * munmap() - * close() - *