diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c index 82b32d1cc3fa..8ad5454b5a78 100644 --- a/module/os/linux/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -1,1225 +1,1222 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups * of names after deciding which is the appropriate lookup interface. */ static int zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) { boolean_t conflict = B_FALSE; int error; if (zfsvfs->z_norm) { size_t bufsz = 0; char *buf = NULL; if (rpnp) { buf = rpnp->pn_buf; bufsz = rpnp->pn_bufsize; } /* * In the non-mixed case we only expect there would ever * be one match, but we need to use the normalizing lookup. */ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid, mt, buf, bufsz, &conflict); } else { error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (error == EOVERFLOW) error = 0; if (zfsvfs->z_norm && !error && deflags) *deflags = conflict ? ED_CASE_CONFLICT : 0; *zoid = ZFS_DIRENT_OBJ(*zoid); return (error); } /* * Lock a directory entry. A dirlock on protects that name * in dzp's directory zap object. As long as you hold a dirlock, you can * assume two things: (1) dzp cannot be reaped, and (2) no other thread * can change the zap entry for (i.e. link or unlink) this name. * * Input arguments: * dzp - znode for directory * name - name of entry to lock * flag - ZNEW: if the entry already exists, fail with EEXIST. * ZEXISTS: if the entry does not exist, fail with ENOENT. * ZSHARED: allow concurrent access with other ZSHARED callers. * ZXATTR: we want dzp's xattr directory * ZCILOOK: On a mixed sensitivity file system, * this lookup should be case-insensitive. * ZCIEXACT: On a purely case-insensitive file system, * this lookup should be case-sensitive. * ZRENAMING: we are locking for renaming, force narrow locks * ZHAVELOCK: Don't grab the z_name_lock for this call. The * current thread already holds it. * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) * dlpp - pointer to the dirlock for this entry (NULL on error) * direntflags - (case-insensitive lookup only) * flags if multiple case-sensitive matches exist in directory * realpnp - (case-insensitive lookup only) * actual name matched within the directory * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. * NOTE: For case-insensitive file systems we take wide locks (see below), * but return znode pointers to a single match. */ int zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(dzp); zfs_dirlock_t *dl; boolean_t update; matchtype_t mt = 0; uint64_t zoid; int error = 0; int cmpflags; *zpp = NULL; *dlpp = NULL; /* * Verify that we are not trying to lock '.', '..', or '.zfs' */ if ((name[0] == '.' && (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) return (SET_ERROR(EEXIST)); /* * Case sensitivity and normalization preferences are set when * the file system is created. These are stored in the * zfsvfs->z_case and zfsvfs->z_norm fields. These choices * affect what vnodes can be cached in the DNLC, how we * perform zap lookups, and the "width" of our dirlocks. * * A normal dirlock locks a single name. Note that with * normalization a name can be composed multiple ways, but * when normalized, these names all compare equal. A wide * dirlock locks multiple names. We need these when the file * system is supporting mixed-mode access. It is sometimes * necessary to lock all case permutations of file name at * once so that simultaneous case-insensitive/case-sensitive * behaves as rationally as possible. */ /* * When matching we may need to normalize & change case according to * FS settings. * * Note that a normalized match is necessary for a case insensitive * filesystem when the lookup request is not exact because normalization * can fold case independent of normalizing code point sequences. * * See the table above zfs_dropname(). */ if (zfsvfs->z_norm != 0) { mt = MT_NORMALIZE; /* * Determine if the match needs to honor the case specified in * lookup, and if so keep track of that so that during * normalization we don't fold case. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE && (flag & ZCIEXACT)) || (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { mt |= MT_MATCH_CASE; } } /* * Only look in or update the DNLC if we are looking for the * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name. * * Maybe can add TO-UPPERed version of name to dnlc in ci-only * case for performance improvement? */ update = !zfsvfs->z_norm || (zfsvfs->z_case == ZFS_CASE_MIXED && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); /* * ZRENAMING indicates we are in a situation where we should * take narrow locks regardless of the file system's * preferences for normalizing and case folding. This will * prevent us deadlocking trying to grab the same wide lock * twice if the two names happen to be case-insensitive * matches. */ if (flag & ZRENAMING) cmpflags = 0; else cmpflags = zfsvfs->z_norm; /* * Wait until there are no locks on this name. * * Don't grab the lock if it is already held. However, cannot * have both ZSHARED and ZHAVELOCK together. */ ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); if (!(flag & ZHAVELOCK)) rw_enter(&dzp->z_name_lock, RW_READER); mutex_enter(&dzp->z_lock); for (;;) { if (dzp->z_unlinked && !(flag & ZXATTR)) { mutex_exit(&dzp->z_lock); if (!(flag & ZHAVELOCK)) rw_exit(&dzp->z_name_lock); return (SET_ERROR(ENOENT)); } for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, U8_UNICODE_LATEST, &error) == 0) || error != 0) break; } if (error != 0) { mutex_exit(&dzp->z_lock); if (!(flag & ZHAVELOCK)) rw_exit(&dzp->z_name_lock); return (SET_ERROR(ENOENT)); } if (dl == NULL) { /* * Allocate a new dirlock and add it to the list. */ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); dl->dl_name = name; dl->dl_sharecnt = 0; dl->dl_namelock = 0; dl->dl_namesize = 0; dl->dl_dzp = dzp; dl->dl_next = dzp->z_dirlocks; dzp->z_dirlocks = dl; break; } if ((flag & ZSHARED) && dl->dl_sharecnt != 0) break; cv_wait(&dl->dl_cv, &dzp->z_lock); } /* * If the z_name_lock was NOT held for this dirlock record it. */ if (flag & ZHAVELOCK) dl->dl_namelock = 1; if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { /* * We're the second shared reference to dl. Make a copy of * dl_name in case the first thread goes away before we do. * Note that we initialize the new name before storing its * pointer into dl_name, because the first thread may load * dl->dl_name at any time. It'll either see the old value, * which belongs to it, or the new shared copy; either is OK. */ dl->dl_namesize = strlen(dl->dl_name) + 1; name = kmem_alloc(dl->dl_namesize, KM_SLEEP); bcopy(dl->dl_name, name, dl->dl_namesize); dl->dl_name = name; } mutex_exit(&dzp->z_lock); /* * We have a dirlock on the name. (Note that it is the dirlock, * not the dzp's z_lock, that protects the name in the zap object.) * See if there's an object by this name; if so, put a hold on it. */ if (flag & ZXATTR) { error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, sizeof (zoid)); if (error == 0) error = (zoid == 0 ? SET_ERROR(ENOENT) : 0); } else { error = zfs_match_find(zfsvfs, dzp, name, mt, update, direntflags, realpnp, &zoid); } if (error) { if (error != ENOENT || (flag & ZEXISTS)) { zfs_dirent_unlock(dl); return (error); } } else { if (flag & ZNEW) { zfs_dirent_unlock(dl); return (SET_ERROR(EEXIST)); } error = zfs_zget(zfsvfs, zoid, zpp); if (error) { zfs_dirent_unlock(dl); return (error); } } *dlpp = dl; return (0); } /* * Unlock this directory entry and wake anyone who was waiting for it. */ void zfs_dirent_unlock(zfs_dirlock_t *dl) { znode_t *dzp = dl->dl_dzp; zfs_dirlock_t **prev_dl, *cur_dl; mutex_enter(&dzp->z_lock); if (!dl->dl_namelock) rw_exit(&dzp->z_name_lock); if (dl->dl_sharecnt > 1) { dl->dl_sharecnt--; mutex_exit(&dzp->z_lock); return; } prev_dl = &dzp->z_dirlocks; while ((cur_dl = *prev_dl) != dl) prev_dl = &cur_dl->dl_next; *prev_dl = dl->dl_next; cv_broadcast(&dl->dl_cv); mutex_exit(&dzp->z_lock); if (dl->dl_namesize != 0) kmem_free(dl->dl_name, dl->dl_namesize); cv_destroy(&dl->dl_cv); kmem_free(dl, sizeof (*dl)); } /* * Look up an entry in a directory. * * NOTE: '.' and '..' are handled as special cases because * no directory entries are actually stored for them. If this is * the root of a filesystem, then '.zfs' is also treated as a * special pseudo-directory. */ int zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags, int *deflg, pathname_t *rpnp) { zfs_dirlock_t *dl; znode_t *zp; struct inode *ip; int error = 0; uint64_t parent; if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { *zpp = dzp; zhold(*zpp); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { zfsvfs_t *zfsvfs = ZTOZSB(dzp); /* * If we are a snapshot mounted under .zfs, return * the inode pointer for the snapshot directory. */ if ((error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) return (error); if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, "snapshot", &ip, 0, kcred, NULL, NULL); *zpp = ITOZ(ip); return (error); } rw_enter(&dzp->z_parent_lock, RW_READER); error = zfs_zget(zfsvfs, parent, &zp); if (error == 0) *zpp = zp; rw_exit(&dzp->z_parent_lock); } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { ip = zfsctl_root(dzp); *zpp = ITOZ(ip); } else { int zf; zf = ZEXISTS | ZSHARED; if (flags & FIGNORECASE) zf |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); if (error == 0) { *zpp = zp; zfs_dirent_unlock(dl); dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ } rpnp = NULL; } if ((flags & FIGNORECASE) && rpnp && !error) (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); return (error); } /* * unlinked Set (formerly known as the "delete queue") Error Handling * * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we * don't specify the name of the entry that we will be manipulating. We * also fib and say that we won't be adding any new entries to the * unlinked set, even though we might (this is to lower the minimum file * size that can be deleted in a full filesystem). So on the small * chance that the nlink list is using a fat zap (ie. has more than * 2000 entries), we *may* not pre-read a block that's needed. * Therefore it is remotely possible for some of the assertions * regarding the unlinked set below to fail due to i/o error. On a * nondebug system, this will result in the space being leaked. */ void zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = ZTOZSB(zp); ASSERT(zp->z_unlinked); ASSERT(ZTOI(zp)->i_nlink == 0); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); } /* * Clean up any znodes that had no links when we either crashed or * (force) umounted the file system. */ static void zfs_unlinked_drain_task(void *arg) { zfsvfs_t *zfsvfs = arg; zap_cursor_t zc; zap_attribute_t zap; dmu_object_info_t doi; znode_t *zp; int error; ASSERT3B(zfsvfs->z_draining, ==, B_TRUE); /* * Iterate over the contents of the unlinked set. */ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel; zap_cursor_advance(&zc)) { /* * See what kind of object we have in list */ error = dmu_object_info(zfsvfs->z_os, zap.za_first_integer, &doi); if (error != 0) continue; ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); /* * We need to re-mark these list entries for deletion, * so we pull them back into core and set zp->z_unlinked. */ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); /* * We may pick up znodes that are already marked for deletion. * This could happen during the purge of an extended attribute * directory. All we need to do is skip over them, since they * are already in the system marked z_unlinked. */ if (error != 0) continue; zp->z_unlinked = B_TRUE; /* * zrele() decrements the znode's ref count and may cause * it to be synchronously freed. We interrupt freeing * of this znode by checking the return value of * dmu_objset_zfs_unmounting() in dmu_free_long_range() * when an unmount is requested. */ zrele(zp); ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); } zap_cursor_fini(&zc); zfsvfs->z_draining = B_FALSE; zfsvfs->z_drain_task = TASKQID_INVALID; } /* * Sets z_draining then tries to dispatch async unlinked drain. * If that fails executes synchronous unlinked drain. */ void zfs_unlinked_drain(zfsvfs_t *zfsvfs) { ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); ASSERT3B(zfsvfs->z_draining, ==, B_FALSE); zfsvfs->z_draining = B_TRUE; zfsvfs->z_drain_cancel = B_FALSE; zfsvfs->z_drain_task = taskq_dispatch( dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)), zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP); if (zfsvfs->z_drain_task == TASKQID_INVALID) { zfs_dbgmsg("async zfs_unlinked_drain dispatch failed"); zfs_unlinked_drain_task(zfsvfs); } } /* * Wait for the unlinked drain taskq task to stop. This will interrupt the * unlinked set processing if it is in progress. */ void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) { ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); if (zfsvfs->z_draining) { zfsvfs->z_drain_cancel = B_TRUE; taskq_cancel_id(dsl_pool_unlinked_drain_taskq( dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task); zfsvfs->z_drain_task = TASKQID_INVALID; zfsvfs->z_draining = B_FALSE; } } /* * Delete the entire contents of a directory. Return a count * of the number of entries that could not be deleted. If we encounter * an error, return a count of at least one so that the directory stays * in the unlinked set. * * NOTE: this function assumes that the directory is inactive, * so there is no need to lock its entries before deletion. * Also, it assumes the directory contents is *only* regular * files. */ static int zfs_purgedir(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; znode_t *xzp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zfs_dirlock_t dl; int skipped = 0; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { error = zfs_zget(zfsvfs, ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); if (error) { skipped += 1; continue; } ASSERT(S_ISREG(ZTOI(xzp)->i_mode) || S_ISLNK(ZTOI(xzp)->i_mode)); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* Is this really needed ? */ zfs_sa_upgrade_txholds(tx, xzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_zrele_async(xzp); skipped += 1; continue; } bzero(&dl, sizeof (dl)); dl.dl_dzp = dzp; dl.dl_name = zap.za_name; error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); if (error) skipped += 1; dmu_tx_commit(tx); zfs_zrele_async(xzp); } zap_cursor_fini(&zc); if (error != ENOENT) skipped += 1; return (skipped); } void zfs_rmnode(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; znode_t *xzp = NULL; dmu_tx_t *tx; uint64_t acl_obj; uint64_t xattr_obj; uint64_t links; int error; ASSERT(ZTOI(zp)->i_nlink == 0); ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); /* * If this is an attribute directory, purge its contents. */ if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. * Leave it in the unlinked set. */ zfs_znode_dmu_fini(zp); return; } } /* * Free up all the data in the file. We don't do this for directories * because we need truncate and remove to be in the same tx, like in * zfs_znode_delete(). Otherwise, if we crash here we'll end up with * an inconsistent truncated zap object in the delete queue. Note a * truncated file is harmless since it only contains user data. */ if (S_ISREG(ZTOI(zp)->i_mode)) { error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); if (error) { /* * Not enough space or we were interrupted by unmount. * Leave the file in the unlinked set. */ zfs_znode_dmu_fini(zp); return; } } /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT(error == 0); } acl_obj = zfs_external_acl(zp); /* * Set up the final transaction. */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xzp) { dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* * Not enough space to delete the file. Leave it in the * unlinked set, leaking it until the fs is remounted (at * which point we'll call zfs_unlinked_drain() to process it). */ dmu_tx_abort(tx); zfs_znode_dmu_fini(zp); goto out; } if (xzp) { ASSERT(error == 0); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ clear_nlink(ZTOI(xzp)); /* no more links to it */ links = 0; VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx)); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); } mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); /* * Remove this znode from the unlinked set. If a has rollback has * occurred while a file is open and unlinked. Then when the file * is closed post rollback it will not exist in the rolled back * version of the unlinked object. */ error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx); VERIFY(error == 0 || error == ENOENT); uint64_t count; if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); } mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); out: if (xzp) zfs_zrele_async(xzp); } static uint64_t zfs_dirent(znode_t *zp, uint64_t mode) { uint64_t de = zp->z_id; if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE) de |= IFTODT(mode) << 60; return (de); } /* * Link zp into dl. Can fail in the following cases : * - if zp has been unlinked. * - if the number of entries with the same hash (aka. colliding entries) * exceed the capacity of a leaf-block of fatzap and splitting of the * leaf-block does not help. */ int zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) { znode_t *dzp = dl->dl_dzp; zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t value; int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; uint64_t links; int count = 0; int error; mutex_enter(&zp->z_lock); if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); mutex_exit(&zp->z_lock); return (SET_ERROR(ENOENT)); } if (!(flag & ZNEW)) { /* * ZNEW nodes come from zfs_mknode() where the link * count has already been initialised */ inc_nlink(ZTOI(zp)); links = ZTOI(zp)->i_nlink; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); } } value = zfs_dirent(zp, zp->z_mode); error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, &value, tx); /* * zap_add could fail to add the entry if it exceeds the capacity of the * leaf-block and zap_leaf_split() failed to help. * The caller of this routine is responsible for failing the transaction * which will rollback the SA updates done above. */ if (error != 0) { if (!(flag & ZRENAMING) && !(flag & ZNEW)) drop_nlink(ZTOI(zp)); mutex_exit(&zp->z_lock); return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &dzp->z_id, sizeof (dzp->z_id)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (!(flag & ZNEW)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); mutex_exit(&zp->z_lock); mutex_enter(&dzp->z_lock); dzp->z_size++; if (zp_is_dir) inc_nlink(ZTOI(dzp)); links = ZTOI(dzp)->i_nlink; count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); mutex_exit(&dzp->z_lock); return (0); } /* * The match type in the code for this function should conform to: * * ------------------------------------------------------------------------ * fs type | z_norm | lookup type | match type * ---------|-------------|-------------|---------------------------------- * CS !norm | 0 | 0 | 0 (exact) * CS norm | formX | 0 | MT_NORMALIZE * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | ZCILOOK | MT_NORMALIZE * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE * * Abbreviations: * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) * formX = unicode normalization form set on fs creation */ static int zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, int flag) { int error; if (ZTOZSB(zp)->z_norm) { matchtype_t mt = MT_NORMALIZE; if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE && (flag & ZCIEXACT)) || (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { mt |= MT_MATCH_CASE; } error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, mt, tx); } else { error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, tx); } return (error); } /* * Unlink zp from dl, and mark zp for deletion if this was the last link. Can * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. * If it's non-NULL, we use it to indicate whether the znode needs deletion, * and it's the caller's job to do it. */ int zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, boolean_t *unlinkedp) { znode_t *dzp = dl->dl_dzp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); boolean_t unlinked = B_FALSE; sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; uint64_t links; int count = 0; int error; if (!(flag & ZRENAMING)) { mutex_enter(&zp->z_lock); if (zp_is_dir && !zfs_dirempty(zp)) { mutex_exit(&zp->z_lock); return (SET_ERROR(ENOTEMPTY)); } /* * If we get here, we are going to try to remove the object. * First try removing the name from the directory; if that * fails, return the error. */ error = zfs_dropname(dl, zp, dzp, tx, flag); if (error != 0) { mutex_exit(&zp->z_lock); return (error); } if (ZTOI(zp)->i_nlink <= zp_is_dir) { zfs_panic_recover("zfs: link count on %lu is %u, " "should be at least %u", zp->z_id, (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); set_nlink(ZTOI(zp), zp_is_dir + 1); } drop_nlink(ZTOI(zp)); if (ZTOI(zp)->i_nlink == zp_is_dir) { zp->z_unlinked = B_TRUE; clear_nlink(ZTOI(zp)); unlinked = B_TRUE; } else { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); } links = ZTOI(zp)->i_nlink; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); count = 0; ASSERT(error == 0); mutex_exit(&zp->z_lock); } else { error = zfs_dropname(dl, zp, dzp, tx, flag); if (error != 0) return (error); } mutex_enter(&dzp->z_lock); dzp->z_size--; /* one dirent removed */ if (zp_is_dir) drop_nlink(ZTOI(dzp)); /* ".." link from zp */ links = ZTOI(dzp)->i_nlink; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); mutex_exit(&dzp->z_lock); if (unlinkedp != NULL) *unlinkedp = unlinked; else if (unlinked) zfs_unlinked_add(zp, tx); return (0); } /* * Indicate whether the directory is empty. Works with or without z_lock * held, but can only be consider a hint in the latter case. Returns true * if only "." and ".." remain and there's no work in progress. * * The internal ZAP size, rather than zp->z_size, needs to be checked since * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE. */ boolean_t zfs_dirempty(znode_t *dzp) { zfsvfs_t *zfsvfs = ZTOZSB(dzp); uint64_t count; int error; if (dzp->z_dirlocks != NULL) return (B_FALSE); error = zap_count(zfsvfs->z_os, dzp->z_id, &count); if (error != 0 || count != 0) return (B_FALSE); return (B_TRUE); } int zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); znode_t *xzp; dmu_tx_t *tx; int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; #ifdef ZFS_DEBUG uint64_t parent; #endif *xzpp = NULL; - if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) - return (error); - if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, &acl_ids)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { zfs_acl_ids_free(&acl_ids); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); return (error); } zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); #ifdef ZFS_DEBUG error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); ASSERT(error == 0 && parent == zp->z_id); #endif VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, sizeof (xzp->z_id), tx)); if (!zp->z_unlinked) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); *xzpp = xzp; return (0); } /* * Return a znode for the extended attribute directory for zp. * ** If the directory does not already exist, it is created ** * * IN: zp - znode to obtain attribute directory from * cr - credentials of caller * flags - flags from the VOP_LOOKUP call * * OUT: xipp - pointer to extended attribute znode * * RETURN: 0 on success * error number on failure */ int zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ZTOZSB(zp); znode_t *xzp; zfs_dirlock_t *dl; vattr_t va; int error; top: error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); if (error) return (error); if (xzp != NULL) { *xzpp = xzp; zfs_dirent_unlock(dl); return (0); } if (!(flags & CREATE_XATTR_DIR)) { zfs_dirent_unlock(dl); return (SET_ERROR(ENOENT)); } if (zfs_is_readonly(zfsvfs)) { zfs_dirent_unlock(dl); return (SET_ERROR(EROFS)); } /* * The ability to 'create' files in an attribute * directory comes from the write_xattr permission on the base file. * * The ability to 'search' an attribute directory requires * read_xattr permission on the base file. * * Once in a directory the ability to read/write attributes * is controlled by the permissions on the attribute file. */ va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID; va.va_mode = S_IFDIR | S_ISVTX | 0777; zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); va.va_dentry = NULL; error = zfs_make_xattrdir(zp, &va, xzpp, cr); zfs_dirent_unlock(dl); if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } return (error); } /* * Decide whether it is okay to remove within a sticky directory. * * In sticky directories, write access is not sufficient; * you can remove entries from a directory only if: * * you own the directory, * you own the entry, * you have write access to the entry, * or you are privileged (checked in secpolicy...). * * The function returns 0 if remove access is granted. */ int zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) { uid_t uid; uid_t downer; uid_t fowner; zfsvfs_t *zfsvfs = ZTOZSB(zdp); if (zfsvfs->z_replay) return (0); if ((zdp->z_mode & S_ISVTX) == 0) return (0); downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid), cr, ZFS_OWNER); fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER); if ((uid = crgetuid(cr)) == downer || uid == fowner || zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0) return (0); else return (secpolicy_vnode_remove(cr)); } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 796a86c25ab7..ae0401e60e54 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1,4036 +1,4037 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() (if * necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the * last reference, the vnode/znode can be freed, so the zp may point to * freed memory. Second, the last reference will call zfs_zinactive(), * which may induce a lot of work -- pushing cached pages (which acquires * range locks) and syncing out cached atime changes. Third, * zfs_zinactive() may require a new tx, which could deadlock the system * if you were already holding one. This deadlock occurs because the tx * currently being operated on prevents a txg from syncing, which * prevents the new tx from progressing, resulting in a deadlock. If you * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() * is a synonym for zrele(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* * Virus scanning is unsupported. It would be possible to add a hook * here to performance the required virus scan. This could be done * entirely in the kernel or potentially as an update to invoke a * scanning utility. */ static int zfs_vscan(struct inode *ip, cred_t *cr, int async) { return (0); } /* ARGSUSED */ int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Honor ZFS_APPENDONLY file attribute */ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Virus scan eligible files on open */ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (zfs_vscan(ip, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ int zfs_close(struct inode *ip, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(zfs_vscan(ip, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } #if defined(_KERNEL) /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; uint64_t nbytes; int64_t off; void *pb; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { nbytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); (void) dmu_read(os, zp->z_id, start + off, nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; int64_t start, off; uint64_t bytes; int len = nbytes; int error = 0; void *pb; start = uio->uio_loffset; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { bytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { ASSERT(PageUptodate(pp)); unlock_page(pp); pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } #endif /* _KERNEL */ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Write the bytes to a file. * * IN: zp - znode of file to be written to * data - bytes to write * len - number of bytes to write * pos - offset to start writing at * * OUT: resid - remaining bytes to write * * RETURN: 0 if success * positive error code if failure. EIO is returned * for a short write when residp isn't provided. * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *residp) { fstrans_cookie_t cookie; int error; struct iovec iov; iov.iov_base = (void *)data; iov.iov_len = len; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); cookie = spl_fstrans_mark(); error = zfs_write(zp, &uio, 0, kcred); spl_fstrans_unmark(cookie); if (error == 0) { if (residp != NULL) *residp = zfs_uio_resid(&uio); else if (zfs_uio_resid(&uio) != 0) error = SET_ERROR(EIO); } return (error); } static void zfs_rele_async_task(void *arg) { iput(arg); } void zfs_zrele_async(znode_t *zp) { struct inode *ip = ZTOI(zp); objset_t *os = ITOZSB(ip)->z_os; ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); /* * If decrementing the count would put us at 0, we can't do it inline * here, because that would be synchronous. Instead, dispatch an iput * to run later. * * For more information on the dangers of a synchronous iput, see the * header comment of this file. */ if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held inode reference for it. * * IN: zdp - znode of directory to search. * nm - name of entry to lookup. * flags - LOOKUP_XATTR set if looking for an attribute. * cr - credentials of caller. * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: zpp - znode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ int zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(zdp); int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (!S_ISDIR(ZTOI(zdp)->i_mode)) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *zpp = zdp; zhold(*zpp); return (0); } return (error); } } ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *zpp = NULL; if (flags & LOOKUP_XATTR) { /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, B_FALSE, cr))) { zrele(*zpp); *zpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); if ((error == 0) && (*zpp)) zfs_znode_update_vfs(*zpp); ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the ip of the created or trunc'd file. * * IN: dzp - znode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set * * OUT: zpp - znode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated if new entry created * zp - ctime|mtime always, atime if new */ /* ARGSUSED */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; + boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *zpp = NULL; if (*name == '\0') { /* * Null component name refers to the directory itself. */ zhold(dzp); zp = dzp; dl = NULL; error = 0; } else { /* possible igrab(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); ZFS_EXIT(zfsvfs); return (error); } } if (zp == NULL) { uint64_t txtype; uint64_t projid = ZFS_DEFAULT_PROJID; /* * Create a new file object and update the directory * to reference it. */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & O_APPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); have_acl = B_FALSE; /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if (S_ISREG(ZTOI(zp)->i_mode) && (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ if (dl) { zfs_dirent_unlock(dl); dl = NULL; } error = zfs_freesp(zp, 0, 0, mode, TRUE); } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp = NULL, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); objset_t *os; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; uint64_t projid = ZFS_DEFAULT_PROJID; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *ipp = NULL; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* Add to unlinked set */ zp->z_unlinked = B_TRUE; zfs_unlinked_add(zp, tx); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); out: if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *ipp = ZTOI(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dzp - znode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime * ip - ctime (if nlink > 0) */ uint64_t null_xattr = 0; /*ARGSUSED*/ int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) { znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; uint64_t links; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp))) { if (realnmp) pn_free(realnmp); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } /* * Need to use rmdir for removing directories. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EPERM); goto out; } mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !(zp->z_is_mapped); mutex_exit(&zp->z_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the inode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; clear_nlink(ZTOI(xzp)); links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } /* * Add to the unlinked set because a new reference could be * taken concurrently resulting in a deferred destruction. */ zfs_unlinked_add(zp, tx); mutex_exit(&zp->z_lock); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); if (delete_now) zrele(zp); else zfs_zrele_async(zp); if (xzp) { zfs_znode_update_vfs(xzp); zfs_zrele_async(xzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dzp using the name * provided. Return a pointer to the inserted directory. * * IN: dzp - znode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set * * OUT: zpp - znode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime updated * zpp - ctime|mtime|atime updated */ /*ARGSUSED*/ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(S_ISDIR(vap->va_mode)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (dirname == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *zpp = NULL; if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (error != 0) { zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dzp - znode of directory to remove from. * name - name of directory to be removed. * cwd - inode of current working directory. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated */ /*ARGSUSED*/ int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } if (!S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } if (zp == cwd) { error = SET_ERROR(EINVAL); goto out; } /* * Grab a lock on the directory to make sure that no one is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); goto top; } dmu_tx_abort(tx); zrele(zp); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); zrele(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read directory entries from the given directory cursor position and emit * name and position for each entry. * * IN: ip - inode of directory to read. * ctx - directory entry context. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; zap_cursor_t zc; zap_attribute_t zap; int error; uint8_t prefetch; uint8_t type; int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) goto out; /* * Quit if directory has been removed (posix) */ if (zp->z_unlinked) goto out; error = 0; os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Transform to file-system independent format */ while (!done) { uint64_t objnum; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if (error == ENOENT) break; else goto update; } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (zap.za_integer_length != 8 || zap.za_num_integers == 0) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld, " "length = %d, num = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset, zap.za_integer_length, (u_longlong_t)zap.za_num_integers); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); type = ZFS_DIRENT_TYPE(zap.za_first_integer); } done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), objnum, type); if (done) break; /* Prefetch znode */ if (prefetch) { dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); } /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } ctx->pos = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ update: zap_cursor_fini(&zc); if (error == ENOENT) error = 0; out: ZFS_EXIT(zfsvfs); return (error); } /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source * for most of the attributes. However, the znode currently has the * authoritative atime, blksize, and block count. * * IN: ip - inode of file. * * OUT: sp - kstat values. * * RETURN: 0 (always succeeds) */ /* ARGSUSED */ int zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, struct kstat *sp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); zpl_generic_fillattr(user_ns, ip, sp); /* * +1 link count for root inode with visible '.zfs' directory. */ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) if (sp->nlink < ZFS_LINK_MAX) sp->nlink++; sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); sp->blksize = blksize; sp->blocks = nblocks; if (unlikely(zp->z_blksz == 0)) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ sp->blksize = zfsvfs->z_max_blksz; } mutex_exit(&zp->z_lock); /* * Required to prevent NFS client from detecting different inode * numbers of snapshot root dentry before and after snapshot mount. */ if (zfsvfs->z_issnap) { if (ip->i_sb->s_root->d_inode == ip) sp->ino = ZFSCTL_INO_SNAPDIRS - dmu_objset_id(zfsvfs->z_os); } ZFS_EXIT(zfsvfs); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { struct inode *dxip = ZTOI(dzp); struct inode *xip = NULL; zfsvfs_t *zfsvfs = ZTOZSB(dzp); objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t zap; zfs_dirlock_t *dl; znode_t *zp = NULL; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { count = 0; if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, ZEXISTS, NULL, NULL); if (err == ENOENT) goto next; if (err) break; xip = ZTOI(zp); if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, TXG_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { xip->i_uid = dxip->i_uid; uid = zfs_uid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); } if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { xip->i_gid = dxip->i_gid; gid = zfs_gid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); } if (zp->z_projid != dzp->z_projid) { if (!(zp->z_pflags & ZFS_PROJID)) { zp->z_pflags |= ZFS_PROJID; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); } zp->z_projid = dzp->z_projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } tx = NULL; if (err != 0 && err != ENOENT) break; next: if (zp) { zrele(zp); zp = NULL; zfs_dirent_unlock(dl); } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (zp) { zrele(zp); zfs_dirent_unlock(dl); } zap_cursor_fini(&zc); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If ATTR_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t *tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2], atime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2 = 0; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t *bulk, *xattr_bulk; int count = 0, xattr_count = 0, bulks = 8; if (mask == 0) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); ip = ZTOI(zp); /* * If this is a xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); if (xoap != NULL && (mask & ATTR_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } } zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); xva_init(tmpxvattr); bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { err = SET_ERROR(EPERM); goto out3; } if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { err = SET_ERROR(EPERM); goto out3; } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (ATTR_ATIME | ATTR_MTIME)) { if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { err = SET_ERROR(EOVERFLOW); goto out3; } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfs_is_readonly(zfsvfs)) { err = SET_ERROR(EROFS); goto out3; } /* * First validate permissions */ if (mask & ATTR_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); if (err) goto out3; /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) goto out3; } if (mask & (ATTR_ATIME|ATTR_MTIME) || ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & ATTR_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (ATTR_UID|ATTR_GID)) && take_owner && take_group) || ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ (void) secpolicy_setid_clear(vap, cr); trim_mask = (mask & (ATTR_UID|ATTR_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & ATTR_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((!S_ISREG(ip->i_mode) && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); err = SET_ERROR(EPERM); goto out3; } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, &oldva, cr); if (err) goto out3; trim_mask |= ATTR_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) goto out3; if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); if (err) goto out2; } if (mask & ATTR_UID) { new_kuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & ATTR_GID) { new_kgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) zrele(attrzp); err = EDQUOT; goto out2; } } tx = dmu_tx_create(os); if (mask & ATTR_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = EPERM; goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (ATTR_UID|ATTR_GID)) { if (mask & ATTR_UID) { ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); new_uid = zfs_uid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); } } if (mask & ATTR_GID) { ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); new_gid = zfs_gid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); } } if (!(mask & ATTR_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & ATTR_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = ZTOI(zp)->i_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; ZFS_TIME_ENCODE(&ip->i_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( vap->va_mtime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } if (attrzp && mask) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & ATTR_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(S_ISREG(ip->i_mode)); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (attrzp) zrele(attrzp); if (err == ERESTART) goto top; } else { if (count > 0) err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) err2 = zfs_setattr_dir(attrzp); zrele(attrzp); } zfs_znode_update_vfs(zp); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); ZFS_EXIT(zfsvfs); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) zfs_zrele_async(zl->zl_znode); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = ZTOZSB(zp)->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdzp - Source directory containing the "old entry". * snm - Old entry name. * tdzp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdzp,tdzp - ctime|mtime updated */ /*ARGSUSED*/ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); zilog_t *zilog; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(sdzp); zilog = zfsvfs->z_log; ZFS_VERIFY_ZP(tdzp); /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ ZFS_EXIT(zfsvfs); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) zrele(tzp); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); return (serr); } if (terr) { zfs_dirent_unlock(sdl); zrele(szp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; ZFS_EXIT(zfsvfs); return (terr); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if (S_ISDIR(ZTOI(szp)->i_mode)) { if (!S_ISDIR(ZTOI(tzp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } } else { if (S_ISDIR(ZTOI(tzp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); goto top; } dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); ZFS_EXIT(zfsvfs); return (error); } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; if (tdzp->z_pflags & ZFS_PROJINHERIT) szp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL), ==, 0); } } else { /* * If we had removed the existing target, subsequent * call to zfs_link_create() to add back the same entry * but, the new dnode (szp) should not fail. */ ASSERT(tzp == NULL); } } dmu_tx_commit(tx); out: if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zrele(szp); if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dzp - Directory to contain new symbolic link. * name - Name of directory entry in dip. * vap - Attributes of new entry. * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags * * OUT: zpp - Znode for new symbolic link. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ /*ARGSUSED*/ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags) { znode_t *zp; zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(S_ISLNK(vap->va_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } top: *zpp = NULL; /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datasets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); } else { zrele(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by ip. * * IN: ip - inode of symbolic link * uio - structure to contain the link path. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated */ /* ARGSUSED */ int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdzp referencing szp. * * IN: tdzp - Directory to contain new entry. * szp - znode of new entry. * name - name of new entry. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdzp - ctime|mtime updated * szp - ctime updated */ /* ARGSUSED */ int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags) { struct inode *sip = ZTOI(szp); znode_t *tzp; zfsvfs_t *zfsvfs = ZTOZSB(tdzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; boolean_t is_tmpfile = 0; uint64_t txg; #ifdef HAVE_TMPFILE is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); #endif ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(tdzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } ZFS_VERIFY_ZP(szp); /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); if (is_tmpfile) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ if (is_tmpfile) szp->z_unlinked = B_FALSE; error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; /* * tmpfile is created to be in z_unlinkedobj, so remove it. * Also, we don't log in ZIL, because all previous file * operation on the tmpfile are ignored by ZIL. Instead we * always wait for txg to sync to make sure all previous * operation are sync safe. */ if (is_tmpfile) { VERIFY(zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } } else if (is_tmpfile) { /* restore z_unlinked since when linking failed */ szp->z_unlinked = B_TRUE; } txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); ZFS_EXIT(zfsvfs); return (error); } static void zfs_putpage_commit_cb(void *arg) { struct page *pp = arg; ClearPageError(pp); end_page_writeback(pp); } /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * * IN: ip - page mapped for inode. * pp - page to push (page is locked) * wbc - writeback control data * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated */ /* ARGSUSED */ int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t offset; loff_t pgoff; unsigned int pglen; dmu_tx_t *tx; caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); ASSERT(PageLocked(pp)); pgoff = page_offset(pp); /* Page byte-offset in file */ offset = i_size_read(ip); /* File length in bytes */ pglen = MIN(PAGE_SIZE, /* Page length in bytes */ P2ROUNDUP(offset, PAGE_SIZE)-pgoff); /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); ZFS_EXIT(zfsvfs); return (0); } /* Truncate page length to end of file */ if (pgoff + pglen > offset) pglen = offset - pgoff; #if 0 /* * FIXME: Allow mmap writes past its quota. The correct fix * is to register a page_mkwrite() handler to count the page * against its quota when it is about to be dirtied. */ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { err = EDQUOT; } #endif /* * The ordering here is critical and must adhere to the following * rules in order to avoid deadlocking in either zfs_read() or * zfs_free_range() due to a lock inversion. * * 1) The page must be unlocked prior to acquiring the range lock. * This is critical because zfs_read() calls find_lock_page() * which may block on the page lock while holding the range lock. * * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. * * This presents a problem because upon entering this function the * page lock is already held. To safely acquire the range lock the * page lock must be dropped. This creates a window where another * process could truncate, invalidate, dirty, or write out the page. * * Therefore, after successfully reacquiring the range and page locks * the current page state is checked. In the common case everything * will be as is expected and it can be written out. However, if * the page state has changed it must be handled accordingly. */ mapping = pp->mapping; redirty_page_for_writepage(wbc, pp); unlock_page(pp); zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, pgoff, pglen, RL_WRITER); lock_page(pp); /* Page mapping changed or it was no longer dirty, we're done */ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } /* Another process started write block if required */ if (PageWriteback(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); #else wait_on_page_bit(pp, PG_writeback); #endif } ZFS_EXIT(zfsvfs); return (0); } /* Clear the dirty flag the required locks are held */ if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; set_page_writeback(pp); unlock_page(pp); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); #else __set_page_dirty_nobuffers(pp); #endif ClearPageError(pp); end_page_writeback(pp); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (err); } va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, zfs_putpage_commit_cb, pp); dmu_tx_commit(tx); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ zil_commit(zfsvfs->z_log, zp->z_id); } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); ZFS_EXIT(zfsvfs); return (err); } /* * Update the system attributes when the inode has been dirtied. For the * moment we only update the mode, atime, mtime, and ctime. */ int zfs_dirty_inode(struct inode *ip, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); #ifdef I_DIRTY_TIME /* * This is the lazytime semantic introduced in Linux 4.0 * This flag will only be called from update_time when lazytime is set. * (Note, I_DIRTY_SYNC will also set if not lazytime) * Fortunately mtime and ctime are managed within ZFS itself, so we * only need to dirty atime. */ if (flags == I_DIRTY_TIME) { zp->z_atime_dirty = B_TRUE; goto out; } #endif tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } mutex_enter(&zp->z_lock); zp->z_atime_dirty = B_FALSE; SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_atime, atime); ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); mode = ip->i_mode; zp->z_mode = mode; error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); mutex_exit(&zp->z_lock); dmu_tx_commit(tx); out: ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. rollback */ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { need_unlock = 1; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); return; } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { ZFS_TIME_ENCODE(&ip->i_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); zp->z_atime_dirty = B_FALSE; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Fill pages with data from the disk. */ static int zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; struct page *cur_pp; u_offset_t io_off, total; size_t io_len; loff_t i_size; unsigned page_idx; int err; os = zfsvfs->z_os; io_len = nr_pages << PAGE_SHIFT; i_size = i_size_read(ip); io_off = page_offset(pl[0]); if (io_off + io_len > i_size) io_len = i_size - io_off; /* * Iterate over list of pages and read each page individually. */ page_idx = 0; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { caddr_t va; cur_pp = pl[page_idx++]; va = kmap(cur_pp); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, DMU_READ_PREFETCH); kunmap(cur_pp); if (err) { /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } } return (0); } /* * Uses zfs_fillpage to read data from the file and fill the pages. * * IN: ip - inode of file to get data from. * pl - list of pages to read * nr_pages - number of pages to read * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int err; if (pl == NULL) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); err = zfs_fillpage(ip, pl, nr_pages); dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE); ZFS_EXIT(zfsvfs); return (err); } /* * Check ZFS specific permissions to memory map a section of a file. * * IN: ip - inode of the file to mmap * off - file offset * addrp - start address in memory region * len - length of memory region * vm_flags- address flags * * RETURN: 0 if success * error code if failure */ /*ARGSUSED*/ int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((vm_flags & VM_WRITE) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENXIO)); } ZFS_EXIT(zfsvfs); return (0); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: zp - znode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * zp - ctime|mtime updated */ /* ARGSUSED */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (cmd != F_FREESP) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); ZFS_EXIT(zfsvfs); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); EXPORT_SYMBOL(zfs_symlink); EXPORT_SYMBOL(zfs_readlink); EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); /* BEGIN CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); /* END CSTYLED */ #endif diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index bd5d5803fb00..9d758d2d0e9d 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1,1517 +1,1517 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * * Extended attributes (xattr) on Solaris are implemented as files * which exist in a hidden xattr directory. These extended attributes * can be accessed using the attropen() system call which opens * the extended attribute. It can then be manipulated just like * a standard file descriptor. This has a couple advantages such * as practically no size limit on the file, and the extended * attributes permissions may differ from those of the parent file. * This interface is really quite clever, but it's also completely * different than what is supported on Linux. It also comes with a * steep performance penalty when accessing small xattrs because they * are not stored with the parent file. * * Under Linux extended attributes are manipulated by the system * calls getxattr(2), setxattr(2), and listxattr(2). They consider * extended attributes to be name/value pairs where the name is a * NULL terminated string. The name must also include one of the * following namespace prefixes: * * user - No restrictions and is available to user applications. * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use. * system - Used for access control lists (system.nfs4_acl, etc). * security - Used by SELinux to store a files security context. * * The value under Linux to limited to 65536 bytes of binary data. * In practice, individual xattrs tend to be much smaller than this * and are typically less than 100 bytes. A good example of this * are the security.selinux xattrs which are less than 100 bytes and * exist for every file when xattr labeling is enabled. * * The Linux xattr implementation has been written to take advantage of * this typical usage. When the dataset property 'xattr=sa' is set, * then xattrs will be preferentially stored as System Attributes (SA). * This allows tiny xattrs (~100 bytes) to be stored with the dnode and * up to 64k of xattrs to be stored in the spill block. If additional * xattr space is required, which is unlikely under Linux, they will * be stored using the traditional directory approach. * * This optimization results in roughly a 3x performance improvement * when accessing xattrs because it avoids the need to perform a seek * for every xattr value. When multiple xattrs are stored per-file * the performance improvements are even greater because all of the * xattrs stored in the spill block will be cached. * * However, by default SA based xattrs are disabled in the Linux port * to maximize compatibility with other implementations. If you do * enable SA based xattrs then they will not be visible on platforms * which do not support this feature. * * NOTE: One additional consequence of the xattr directory implementation * is that when an extended attribute is manipulated an inode is created. * This inode will exist in the Linux inode cache but there will be no * associated entry in the dentry cache which references it. This is * safe but it may result in some confusion. Enabling SA based xattrs * largely avoids the issue except in the overflow case. */ #include #include #include #include #include #include typedef struct xattr_filldir { size_t size; size_t offset; char *buf; struct dentry *dentry; } xattr_filldir_t; static const struct xattr_handler *zpl_xattr_handler(const char *); static int zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len) { static const struct xattr_handler *handler; struct dentry *d = xf->dentry; handler = zpl_xattr_handler(name); if (!handler) return (0); if (handler->list) { #if defined(HAVE_XATTR_LIST_SIMPLE) if (!handler->list(d)) return (0); #elif defined(HAVE_XATTR_LIST_DENTRY) if (!handler->list(d, NULL, 0, name, name_len, 0)) return (0); #elif defined(HAVE_XATTR_LIST_HANDLER) if (!handler->list(handler, d, NULL, 0, name, name_len)) return (0); #endif } return (1); } /* * Determine is a given xattr name should be visible and if so copy it * in to the provided buffer (xf->buf). */ static int zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) { /* Check permissions using the per-namespace list xattr handler. */ if (!zpl_xattr_permission(xf, name, name_len)) return (0); /* When xf->buf is NULL only calculate the required size. */ if (xf->buf) { if (xf->offset + name_len + 1 > xf->size) return (-ERANGE); memcpy(xf->buf + xf->offset, name, name_len); xf->buf[xf->offset + name_len] = '\0'; } xf->offset += (name_len + 1); return (0); } /* * Read as many directory entry names as will fit in to the provided buffer, * or when no buffer is provided calculate the required buffer size. */ static int zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf) { zap_cursor_t zc; zap_attribute_t zap; int error; zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id); while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) { if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { error = -ENXIO; break; } error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name)); if (error) break; zap_cursor_advance(&zc); } zap_cursor_fini(&zc); if (error == -ENOENT) error = 0; return (error); } static ssize_t zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) { struct inode *ip = xf->dentry->d_inode; struct inode *dxip = NULL; znode_t *dxzp; int error; /* Lookup the xattr directory */ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, cr, NULL, NULL); if (error) { if (error == -ENOENT) error = 0; return (error); } dxip = ZTOI(dxzp); error = zpl_xattr_readdir(dxip, xf); iput(dxip); return (error); } static ssize_t zpl_xattr_list_sa(xattr_filldir_t *xf) { znode_t *zp = ITOZ(xf->dentry->d_inode); nvpair_t *nvp = NULL; int error = 0; mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = -zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) return (error); ASSERT(zp->z_xattr_cached); while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); error = zpl_xattr_filldir(xf, nvpair_name(nvp), strlen(nvpair_name(nvp))); if (error) return (error); } return (0); } ssize_t zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { znode_t *zp = ITOZ(dentry->d_inode); zfsvfs_t *zfsvfs = ZTOZSB(zp); xattr_filldir_t xf = { buffer_size, 0, buffer, dentry }; cred_t *cr = CRED(); fstrans_cookie_t cookie; int error = 0; crhold(cr); cookie = spl_fstrans_mark(); ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_READER); if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_list_sa(&xf); if (error) goto out; } error = zpl_xattr_list_dir(&xf, cr); if (error) goto out; error = xf.offset; out: rw_exit(&zp->z_xattr_lock); ZPL_EXIT(zfsvfs); spl_fstrans_unmark(cookie); crfree(cr); return (error); } static int zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, size_t size, cred_t *cr) { fstrans_cookie_t cookie; struct inode *xip = NULL; znode_t *dxzp = NULL; znode_t *xzp = NULL; int error; /* Lookup the xattr directory */ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, cr, NULL, NULL); if (error) goto out; /* Lookup a specific xattr name in the directory */ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); if (error) goto out; xip = ZTOI(xzp); if (!size) { error = i_size_read(xip); goto out; } if (size < i_size_read(xip)) { error = -ERANGE; goto out; } struct iovec iov; iov.iov_base = (void *)value; iov.iov_len = size; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0); cookie = spl_fstrans_mark(); error = -zfs_read(ITOZ(xip), &uio, 0, cr); spl_fstrans_unmark(cookie); if (error == 0) error = size - zfs_uio_resid(&uio); out: if (xzp) zrele(xzp); if (dxzp) zrele(dxzp); return (error); } static int zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size) { znode_t *zp = ITOZ(ip); uchar_t *nv_value; uint_t nv_size; int error = 0; ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = -zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) return (error); ASSERT(zp->z_xattr_cached); error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name, &nv_value, &nv_size); if (error) return (error); if (size == 0 || value == NULL) return (nv_size); if (size < nv_size) return (-ERANGE); memcpy(value, nv_value, nv_size); return (nv_size); } static int __zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_get_sa(ip, name, value, size); if (error != -ENOENT) goto out; } error = zpl_xattr_get_dir(ip, name, value, size, cr); out: if (error == -ENOENT) error = -ENODATA; return (error); } #define XATTR_NOENT 0x0 #define XATTR_IN_SA 0x1 #define XATTR_IN_DIR 0x2 /* check where the xattr resides */ static int __zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ASSERT(where); ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); *where = XATTR_NOENT; if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_get_sa(ip, name, NULL, 0); if (error >= 0) *where |= XATTR_IN_SA; else if (error != -ENOENT) return (error); } error = zpl_xattr_get_dir(ip, name, NULL, 0, cr); if (error >= 0) *where |= XATTR_IN_DIR; else if (error != -ENOENT) return (error); if (*where == (XATTR_IN_SA|XATTR_IN_DIR)) cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\"" " in both SA and dir", ip, name); if (*where == XATTR_NOENT) error = -ENODATA; else error = 0; return (error); } static int zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); cred_t *cr = CRED(); fstrans_cookie_t cookie; int error; crhold(cr); cookie = spl_fstrans_mark(); ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_READER); error = __zpl_xattr_get(ip, name, value, size, cr); rw_exit(&zp->z_xattr_lock); ZPL_EXIT(zfsvfs); spl_fstrans_unmark(cookie); crfree(cr); return (error); } static int zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, size_t size, int flags, cred_t *cr) { znode_t *dxzp = NULL; znode_t *xzp = NULL; vattr_t *vap = NULL; int lookup_flags, error; const int xattr_mode = S_IFREG | 0644; loff_t pos = 0; /* * Lookup the xattr directory. When we're adding an entry pass * CREATE_XATTR_DIR to ensure the xattr directory is created. * When removing an entry this flag is not passed to avoid * unnecessarily creating a new xattr directory. */ lookup_flags = LOOKUP_XATTR; if (value != NULL) lookup_flags |= CREATE_XATTR_DIR; error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, lookup_flags, cr, NULL, NULL); if (error) goto out; /* Lookup a specific xattr name in the directory */ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); if (error && (error != -ENOENT)) goto out; error = 0; /* Remove a specific name xattr when value is set to NULL. */ if (value == NULL) { if (xzp) error = -zfs_remove(dxzp, (char *)name, cr, 0); goto out; } /* Lookup failed create a new xattr. */ if (xzp == NULL) { vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); vap->va_mode = xattr_mode; vap->va_mask = ATTR_MODE; vap->va_uid = crgetuid(cr); vap->va_gid = crgetgid(cr); error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp, - cr, 0, NULL); + cr, ATTR_NOACLCHECK, NULL); if (error) goto out; } ASSERT(xzp != NULL); error = -zfs_freesp(xzp, 0, 0, xattr_mode, TRUE); if (error) goto out; error = -zfs_write_simple(xzp, value, size, pos, NULL); out: if (error == 0) { ip->i_ctime = current_time(ip); zfs_mark_inode_dirty(ip); } if (vap) kmem_free(vap, sizeof (vattr_t)); if (xzp) zrele(xzp); if (dxzp) zrele(dxzp); if (error == -ENOENT) error = -ENODATA; ASSERT3S(error, <=, 0); return (error); } static int zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, size_t size, int flags, cred_t *cr) { znode_t *zp = ITOZ(ip); nvlist_t *nvl; size_t sa_size; int error = 0; mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = -zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) return (error); ASSERT(zp->z_xattr_cached); nvl = zp->z_xattr_cached; if (value == NULL) { error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); if (error == -ENOENT) error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr); } else { /* Limited to 32k to keep nvpair memory allocations small */ if (size > DXATTR_MAX_ENTRY_SIZE) return (-EFBIG); /* Prevent the DXATTR SA from consuming the entire SA region */ error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); if (error) return (error); if (sa_size > DXATTR_MAX_SA_SIZE) return (-EFBIG); error = -nvlist_add_byte_array(nvl, name, (uchar_t *)value, size); } /* * Update the SA for additions, modifications, and removals. On * error drop the inconsistent cached version of the nvlist, it * will be reconstructed from the ARC when next accessed. */ if (error == 0) error = -zfs_sa_set_xattr(zp); if (error) { nvlist_free(nvl); zp->z_xattr_cached = NULL; } ASSERT3S(error, <=, 0); return (error); } static int zpl_xattr_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); cred_t *cr = CRED(); fstrans_cookie_t cookie; int where; int error; crhold(cr); cookie = spl_fstrans_mark(); ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER); /* * Before setting the xattr check to see if it already exists. * This is done to ensure the following optional flags are honored. * * XATTR_CREATE: fail if xattr already exists * XATTR_REPLACE: fail if xattr does not exist * * We also want to know if it resides in sa or dir, so we can make * sure we don't end up with duplicate in both places. */ error = __zpl_xattr_where(ip, name, &where, cr); if (error < 0) { if (error != -ENODATA) goto out; if (flags & XATTR_REPLACE) goto out; /* The xattr to be removed already doesn't exist */ error = 0; if (value == NULL) goto out; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto out; } /* Preferentially store the xattr as a SA for better performance */ if (zfsvfs->z_use_sa && zp->z_is_sa && (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) { error = zpl_xattr_set_sa(ip, name, value, size, flags, cr); if (error == 0) { /* * Successfully put into SA, we need to clear the one * in dir. */ if (where & XATTR_IN_DIR) zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr); goto out; } } error = zpl_xattr_set_dir(ip, name, value, size, flags, cr); /* * Successfully put into dir, we need to clear the one in SA. */ if (error == 0 && (where & XATTR_IN_SA)) zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); out: rw_exit(&ITOZ(ip)->z_xattr_lock); ZPL_EXIT(zfsvfs); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } /* * Extended user attributes * * "Extended user attributes may be assigned to files and directories for * storing arbitrary additional information such as the mime type, * character set or encoding of a file. The access permissions for user * attributes are defined by the file permission bits: read permission * is required to retrieve the attribute value, and writer permission is * required to change it. * * The file permission bits of regular files and directories are * interpreted differently from the file permission bits of special * files and symbolic links. For regular files and directories the file * permission bits define access to the file's contents, while for * device special files they define access to the device described by * the special file. The file permissions of symbolic links are not * used in access checks. These differences would allow users to * consume filesystem resources in a way not controllable by disk quotas * for group or world writable special files and directories. * * For this reason, extended user attributes are allowed only for * regular files and directories, and access to extended user attributes * is restricted to the owner and to users with appropriate capabilities * for directories with the sticky bit set (see the chmod(1) manual page * for an explanation of the sticky bit)." - xattr(7) * * ZFS allows extended user attributes to be disabled administratively * by setting the 'xattr=off' property on the dataset. */ static int __zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { return (ITOZSB(ip)->z_flags & ZSB_XATTR); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list); static int __zpl_xattr_user_get(struct inode *ip, const char *name, void *value, size_t size) { char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) return (-EOPNOTSUPP); xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); static int __zpl_xattr_user_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) return (-EOPNOTSUPP); xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set); xattr_handler_t zpl_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = zpl_xattr_user_list, .get = zpl_xattr_user_get, .set = zpl_xattr_user_set, }; /* * Trusted extended attributes * * "Trusted extended attributes are visible and accessible only to * processes that have the CAP_SYS_ADMIN capability. Attributes in this * class are used to implement mechanisms in user space (i.e., outside * the kernel) which keep information in extended attributes to which * ordinary processes should not have access." - xattr(7) */ static int __zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { return (capable(CAP_SYS_ADMIN)); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list); static int __zpl_xattr_trusted_get(struct inode *ip, const char *name, void *value, size_t size) { char *xattr_name; int error; if (!capable(CAP_SYS_ADMIN)) return (-EACCES); /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); static int __zpl_xattr_trusted_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { char *xattr_name; int error; if (!capable(CAP_SYS_ADMIN)) return (-EACCES); /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set); xattr_handler_t zpl_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = zpl_xattr_trusted_list, .get = zpl_xattr_trusted_get, .set = zpl_xattr_trusted_set, }; /* * Extended security attributes * * "The security attribute namespace is used by kernel security modules, * such as Security Enhanced Linux, and also to implement file * capabilities (see capabilities(7)). Read and write access * permissions to security attributes depend on the policy implemented * for each security attribute by the security module. When no security * module is loaded, all processes have read access to extended security * attributes, and write access is limited to processes that have the * CAP_SYS_ADMIN capability." - xattr(7) */ static int __zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { return (1); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list); static int __zpl_xattr_security_get(struct inode *ip, const char *name, void *value, size_t size) { char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); static int __zpl_xattr_security_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set); static int zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs, void *fs_info) { const struct xattr *xattr; int error = 0; for (xattr = xattrs; xattr->name != NULL; xattr++) { error = __zpl_xattr_security_set(ip, xattr->name, xattr->value, xattr->value_len, 0); if (error < 0) break; } return (error); } int zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr) { return security_inode_init_security(ip, dip, qstr, &zpl_xattr_security_init_impl, NULL); } /* * Security xattr namespace handlers. */ xattr_handler_t zpl_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = zpl_xattr_security_list, .get = zpl_xattr_security_get, .set = zpl_xattr_security_set, }; /* * Extended system attributes * * "Extended system attributes are used by the kernel to store system * objects such as Access Control Lists. Read and write access permissions * to system attributes depend on the policy implemented for each system * attribute implemented by filesystems in the kernel." - xattr(7) */ #ifdef CONFIG_FS_POSIX_ACL static int zpl_set_acl_impl(struct inode *ip, struct posix_acl *acl, int type) { char *name, *value = NULL; int error = 0; size_t size = 0; if (S_ISLNK(ip->i_mode)) return (-EOPNOTSUPP); switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { umode_t mode = ip->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error < 0) { return (error); } else { /* * The mode bits will have been set by * ->zfs_setattr()->zfs_acl_chmod_setattr() * using the ZFS ACL conversion. If they * differ from the Posix ACL conversion dirty * the inode to write the Posix mode bits. */ if (ip->i_mode != mode) { ip->i_mode = mode; ip->i_ctime = current_time(ip); zfs_mark_inode_dirty(ip); } if (error == 0) acl = NULL; } } break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(ip->i_mode)) return (acl ? -EACCES : 0); break; default: return (-EINVAL); } if (acl) { size = posix_acl_xattr_size(acl->a_count); value = kmem_alloc(size, KM_SLEEP); error = zpl_acl_to_xattr(acl, value, size); if (error < 0) { kmem_free(value, size); return (error); } } error = zpl_xattr_set(ip, name, value, size, 0); if (value) kmem_free(value, size); if (!error) { if (acl) zpl_set_cached_acl(ip, type, acl); else zpl_forget_cached_acl(ip, type); } return (error); } #ifdef HAVE_SET_ACL int #ifdef HAVE_SET_ACL_USERNS zpl_set_acl(struct user_namespace *userns, struct inode *ip, struct posix_acl *acl, int type) #else zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) #endif /* HAVE_SET_ACL_USERNS */ { return (zpl_set_acl_impl(ip, acl, type)); } #endif /* HAVE_SET_ACL */ static struct posix_acl * zpl_get_acl_impl(struct inode *ip, int type) { struct posix_acl *acl; void *value = NULL; char *name; /* * As of Linux 3.14, the kernel get_acl will check this for us. * Also as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong * as the kernel get_acl will set it to temporary sentinel value. */ #ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE acl = get_cached_acl(ip, type); if (acl != ACL_NOT_CACHED) return (acl); #endif switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return (ERR_PTR(-EINVAL)); } int size = zpl_xattr_get(ip, name, NULL, 0); if (size > 0) { value = kmem_alloc(size, KM_SLEEP); size = zpl_xattr_get(ip, name, value, size); } if (size > 0) { acl = zpl_acl_from_xattr(value, size); } else if (size == -ENODATA || size == -ENOSYS) { acl = NULL; } else { acl = ERR_PTR(-EIO); } if (size > 0) kmem_free(value, size); /* As of Linux 4.7, the kernel get_acl will set this for us */ #ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE if (!IS_ERR(acl)) zpl_set_cached_acl(ip, type, acl); #endif return (acl); } #if defined(HAVE_GET_ACL_RCU) struct posix_acl * zpl_get_acl(struct inode *ip, int type, bool rcu) { if (rcu) return (ERR_PTR(-ECHILD)); return (zpl_get_acl_impl(ip, type)); } #elif defined(HAVE_GET_ACL) struct posix_acl * zpl_get_acl(struct inode *ip, int type) { return (zpl_get_acl_impl(ip, type)); } #else #error "Unsupported iops->get_acl() implementation" #endif /* HAVE_GET_ACL_RCU */ int zpl_init_acl(struct inode *ip, struct inode *dir) { struct posix_acl *acl = NULL; int error = 0; if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (!S_ISLNK(ip->i_mode)) { acl = zpl_get_acl_impl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (!acl) { ip->i_mode &= ~current_umask(); ip->i_ctime = current_time(ip); zfs_mark_inode_dirty(ip); return (0); } } if (acl) { umode_t mode; if (S_ISDIR(ip->i_mode)) { error = zpl_set_acl_impl(ip, acl, ACL_TYPE_DEFAULT); if (error) goto out; } mode = ip->i_mode; error = __posix_acl_create(&acl, GFP_KERNEL, &mode); if (error >= 0) { ip->i_mode = mode; zfs_mark_inode_dirty(ip); if (error > 0) { error = zpl_set_acl_impl(ip, acl, ACL_TYPE_ACCESS); } } } out: zpl_posix_acl_release(acl); return (error); } int zpl_chmod_acl(struct inode *ip) { struct posix_acl *acl; int error; if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (S_ISLNK(ip->i_mode)) return (-EOPNOTSUPP); acl = zpl_get_acl_impl(ip, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return (PTR_ERR(acl)); error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); if (!error) error = zpl_set_acl_impl(ip, acl, ACL_TYPE_ACCESS); zpl_posix_acl_release(acl); return (error); } static int __zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS); if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) memcpy(list, xattr_name, xattr_size); return (xattr_size); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access); static int __zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT); if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) memcpy(list, xattr_name, xattr_size); return (xattr_size); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default); static int __zpl_xattr_acl_get_access(struct inode *ip, const char *name, void *buffer, size_t size) { struct posix_acl *acl; int type = ACL_TYPE_ACCESS; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); acl = zpl_get_acl_impl(ip, type); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (acl == NULL) return (-ENODATA); error = zpl_acl_to_xattr(acl, buffer, size); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access); static int __zpl_xattr_acl_get_default(struct inode *ip, const char *name, void *buffer, size_t size) { struct posix_acl *acl; int type = ACL_TYPE_DEFAULT; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); acl = zpl_get_acl_impl(ip, type); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (acl == NULL) return (-ENODATA); error = zpl_acl_to_xattr(acl, buffer, size); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default); static int __zpl_xattr_acl_set_access(struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; int type = ACL_TYPE_ACCESS; int error = 0; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); if (value) { acl = zpl_acl_from_xattr(value, size); if (IS_ERR(acl)) return (PTR_ERR(acl)); else if (acl) { error = zpl_posix_acl_valid(ip, acl); if (error) { zpl_posix_acl_release(acl); return (error); } } } else { acl = NULL; } error = zpl_set_acl_impl(ip, acl, type); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access); static int __zpl_xattr_acl_set_default(struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; int type = ACL_TYPE_DEFAULT; int error = 0; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); if (value) { acl = zpl_acl_from_xattr(value, size); if (IS_ERR(acl)) return (PTR_ERR(acl)); else if (acl) { error = zpl_posix_acl_valid(ip, acl); if (error) { zpl_posix_acl_release(acl); return (error); } } } else { acl = NULL; } error = zpl_set_acl_impl(ip, acl, type); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default); /* * ACL access xattr namespace handlers. * * Use .name instead of .prefix when available. xattr_resolve_name will match * whole name and reject anything that has .name only as prefix. */ xattr_handler_t zpl_xattr_acl_access_handler = { #ifdef HAVE_XATTR_HANDLER_NAME .name = XATTR_NAME_POSIX_ACL_ACCESS, #else .prefix = XATTR_NAME_POSIX_ACL_ACCESS, #endif .list = zpl_xattr_acl_list_access, .get = zpl_xattr_acl_get_access, .set = zpl_xattr_acl_set_access, #if defined(HAVE_XATTR_LIST_SIMPLE) || \ defined(HAVE_XATTR_LIST_DENTRY) || \ defined(HAVE_XATTR_LIST_HANDLER) .flags = ACL_TYPE_ACCESS, #endif }; /* * ACL default xattr namespace handlers. * * Use .name instead of .prefix when available. xattr_resolve_name will match * whole name and reject anything that has .name only as prefix. */ xattr_handler_t zpl_xattr_acl_default_handler = { #ifdef HAVE_XATTR_HANDLER_NAME .name = XATTR_NAME_POSIX_ACL_DEFAULT, #else .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, #endif .list = zpl_xattr_acl_list_default, .get = zpl_xattr_acl_get_default, .set = zpl_xattr_acl_set_default, #if defined(HAVE_XATTR_LIST_SIMPLE) || \ defined(HAVE_XATTR_LIST_DENTRY) || \ defined(HAVE_XATTR_LIST_HANDLER) .flags = ACL_TYPE_DEFAULT, #endif }; #endif /* CONFIG_FS_POSIX_ACL */ xattr_handler_t *zpl_xattr_handlers[] = { &zpl_xattr_security_handler, &zpl_xattr_trusted_handler, &zpl_xattr_user_handler, #ifdef CONFIG_FS_POSIX_ACL &zpl_xattr_acl_access_handler, &zpl_xattr_acl_default_handler, #endif /* CONFIG_FS_POSIX_ACL */ NULL }; static const struct xattr_handler * zpl_xattr_handler(const char *name) { if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) return (&zpl_xattr_user_handler); if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) return (&zpl_xattr_trusted_handler); if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) return (&zpl_xattr_security_handler); #ifdef CONFIG_FS_POSIX_ACL if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0) return (&zpl_xattr_acl_access_handler); if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) return (&zpl_xattr_acl_default_handler); #endif /* CONFIG_FS_POSIX_ACL */ return (NULL); } #if defined(CONFIG_FS_POSIX_ACL) && \ (!defined(HAVE_POSIX_ACL_RELEASE) || \ defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)) struct acl_rel_struct { struct acl_rel_struct *next; struct posix_acl *acl; clock_t time; }; #define ACL_REL_GRACE (60*HZ) #define ACL_REL_WINDOW (1*HZ) #define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW) /* * Lockless multi-producer single-consumer fifo list. * Nodes are added to tail and removed from head. Tail pointer is our * synchronization point. It always points to the next pointer of the last * node, or head if list is empty. */ static struct acl_rel_struct *acl_rel_head = NULL; static struct acl_rel_struct **acl_rel_tail = &acl_rel_head; static void zpl_posix_acl_free(void *arg) { struct acl_rel_struct *freelist = NULL; struct acl_rel_struct *a; clock_t new_time; boolean_t refire = B_FALSE; ASSERT3P(acl_rel_head, !=, NULL); while (acl_rel_head) { a = acl_rel_head; if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) { /* * If a is the last node we need to reset tail, but we * need to use cmpxchg to make sure it is still the * last node. */ if (acl_rel_tail == &a->next) { acl_rel_head = NULL; if (cmpxchg(&acl_rel_tail, &a->next, &acl_rel_head) == &a->next) { ASSERT3P(a->next, ==, NULL); a->next = freelist; freelist = a; break; } } /* * a is not last node, make sure next pointer is set * by the adder and advance the head. */ while (READ_ONCE(a->next) == NULL) cpu_relax(); acl_rel_head = a->next; a->next = freelist; freelist = a; } else { /* * a is still in grace period. We are responsible to * reschedule the free task, since adder will only do * so if list is empty. */ new_time = a->time + ACL_REL_SCHED; refire = B_TRUE; break; } } if (refire) taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, NULL, TQ_SLEEP, new_time); while (freelist) { a = freelist; freelist = a->next; kfree(a->acl); kmem_free(a, sizeof (struct acl_rel_struct)); } } void zpl_posix_acl_release_impl(struct posix_acl *acl) { struct acl_rel_struct *a, **prev; a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP); a->next = NULL; a->acl = acl; a->time = ddi_get_lbolt(); /* atomically points tail to us and get the previous tail */ prev = xchg(&acl_rel_tail, &a->next); ASSERT3P(*prev, ==, NULL); *prev = a; /* if it was empty before, schedule the free task */ if (prev == &acl_rel_head) taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED); } #endif diff --git a/tests/zfs-tests/tests/functional/acl/posix/posix_004_pos.ksh b/tests/zfs-tests/tests/functional/acl/posix/posix_004_pos.ksh index 6c6b592fbb9e..8aa2cf496040 100755 --- a/tests/zfs-tests/tests/functional/acl/posix/posix_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/acl/posix/posix_004_pos.ksh @@ -1,49 +1,52 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Portions Copyright 2020 iXsystems, Inc. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/acl/acl_common.kshlib # # DESCRIPTION: # Verify chown works with POSIX ACLs. # Regression test for https://github.com/openzfs/zfs/issues/10043 # # STRATEGY: # 1. Prepare an appropriate ACL on the test directory # 2. Change the owner of the directory +# 3. Reset and set the ACLs for test directory owned by the user # verify_runnable "both" log_assert "Verify chown works with POSIX ACLs" log_must setfacl -d -m u:$ZFS_ACL_STAFF1:rwx $TESTDIR log_must setfacl -b $TESTDIR log_must chown $ZFS_ACL_STAFF1 $TESTDIR +log_must setfacl -b $TESTDIR +log_must setfacl -d -m u:$ZFS_ACL_STAFF1:rwx $TESTDIR log_must chown 0 $TESTDIR log_pass "chown works with POSIX ACLs"