diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h index febfdff97f25..31c9c6d7f74b 100644 --- a/include/sys/zfs_vfsops.h +++ b/include/sys/zfs_vfsops.h @@ -1,226 +1,227 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H #include #include #include #include #include #include #include +#include #include #ifdef __cplusplus extern "C" { #endif typedef struct zfsvfs zfsvfs_t; struct znode; /* * This structure emulates the vfs_t from other platforms. It's purpose * is to faciliate the handling of mount options and minimize structural * differences between the platforms. 
*/ typedef struct vfs { struct zfsvfs *vfs_data; char *vfs_mntpoint; /* Primary mount point */ uint64_t vfs_xattr; boolean_t vfs_readonly; boolean_t vfs_do_readonly; boolean_t vfs_setuid; boolean_t vfs_do_setuid; boolean_t vfs_exec; boolean_t vfs_do_exec; boolean_t vfs_devices; boolean_t vfs_do_devices; boolean_t vfs_do_xattr; boolean_t vfs_atime; boolean_t vfs_do_atime; boolean_t vfs_relatime; boolean_t vfs_do_relatime; boolean_t vfs_nbmand; boolean_t vfs_do_nbmand; } vfs_t; typedef struct zfs_mnt { const char *mnt_osname; /* Objset name */ char *mnt_data; /* Raw mount options */ } zfs_mnt_t; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ struct super_block *z_sb; /* generic super_block */ struct zfsvfs *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_flags; /* super_block flags */ uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_dirty; /* need to sync fuid table ? 
*/ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_inherit; /* acl inheritance behavior */ uint_t z_acl_type; /* type of ACL usable on this FS */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_atime; /* enable atimes mount option */ boolean_t z_relatime; /* enable relatime mount option */ boolean_t z_unmounted; /* unmounted */ rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all znodes in the fs */ uint64_t z_nr_znodes; /* number of znodes in the fs */ unsigned long z_rollback_time; /* last online rollback time */ unsigned long z_snap_defer_time; /* last snapshot unmount deferal */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ arc_prune_t *z_arc_prune; /* called by ARC to prune caches */ struct inode *z_ctldir; /* .zfs directory inode */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; uint64_t z_userobjquota_obj; uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ uint64_t z_hold_size; /* znode hold array size */ avl_tree_t *z_hold_trees; /* znode hold trees */ kmutex_t *z_hold_locks; /* znode hold locks */ }; #define ZSB_XATTR 0x0001 /* Enable user xattrs */ /* * Allow a maximum number of links. 
While ZFS does not internally limit * this the inode->i_nlink member is defined as an unsigned int. To be * safe we use 2^31-1 as the limit. */ #define ZFS_LINK_MAX ((1U << 31) - 1U) /* * Normal filesystems (those not under .zfs/snapshot) have a total * file ID size limited to 12 bytes (including the length field) due to * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical * reasons, this same limit is being imposed by the Solaris NFSv3 implementation * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It * is not possible to expand beyond 12 bytes without abandoning support * of NFSv2. * * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ typedef struct zfid_short { uint16_t zf_len; uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; /* * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes * (including the length field). This makes files under .zfs/snapshot * accessible by NFSv3 and NFSv4, but not NFSv2. * * For files under .zfs/snapshot, we partition up the available space * as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * 6 bytes objset id (48 bits) * 4 bytes currently just zero (32 bits) * * We reserve only 48 bits for the object number and objset id, as these are * the limits currently defined and imposed by the DMU. 
*/ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_long_t; #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern uint_t zfs_fsyncer_key; extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valuep); extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota); extern boolean_t zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id); extern boolean_t zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id); extern boolean_t zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, boolean_t readony, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent); extern void zfs_preumount(struct super_block *sb); extern int zfs_umount(struct super_block *sb); extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm); extern int zfs_statvfs(struct dentry *dentry, struct kstatfs *statp); extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp); extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects); #ifdef __cplusplus } #endif #endif /* 
_SYS_FS_ZFS_VFSOPS_H */ diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c index fc10271b787f..5c426b0a9fb2 100644 --- a/module/zfs/zpl_super.c +++ b/module/zfs/zpl_super.c @@ -1,395 +1,404 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. */ #include #include #include #include #include static struct inode * zpl_inode_alloc(struct super_block *sb) { struct inode *ip; VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0); inode_set_iversion(ip, 1); return (ip); } static void zpl_inode_destroy(struct inode *ip) { ASSERT(atomic_read(&ip->i_count) == 0); zfs_inode_destroy(ip); } /* * Called from __mark_inode_dirty() to reflect that something in the * inode has changed. We use it to ensure the znode system attributes * are always strictly update to date with respect to the inode. 
*/ #ifdef HAVE_DIRTY_INODE_WITH_FLAGS static void zpl_dirty_inode(struct inode *ip, int flags) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); zfs_dirty_inode(ip, flags); spl_fstrans_unmark(cookie); } #else static void zpl_dirty_inode(struct inode *ip) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); zfs_dirty_inode(ip, 0); spl_fstrans_unmark(cookie); } #endif /* HAVE_DIRTY_INODE_WITH_FLAGS */ /* * When ->drop_inode() is called its return value indicates if the * inode should be evicted from the inode cache. If the inode is * unhashed and has no links the default policy is to evict it * immediately. * * Prior to 2.6.36 this eviction was accomplished by the vfs calling * ->delete_inode(). It was ->delete_inode()'s responsibility to * truncate the inode pages and call clear_inode(). The call to * clear_inode() synchronously invalidates all the buffers and * calls ->clear_inode(). It was ->clear_inode()'s responsibility * to cleanup and filesystem specific data before freeing the inode. * * This elaborate mechanism was replaced by ->evict_inode() which * does the job of both ->delete_inode() and ->clear_inode(). It * will be called exactly once, and when it returns the inode must * be in a state where it can simply be freed.i * * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the * truncate in evict(). For earlier kernels clear_inode() maps to * end_writeback() which is responsible for completing all outstanding * write back. In either case, once this is done it is safe to cleanup * any remaining inode specific data via zfs_inactive(). * remaining filesystem specific data. 
*/ #ifdef HAVE_EVICT_INODE static void zpl_evict_inode(struct inode *ip) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); truncate_setsize(ip, 0); clear_inode(ip); zfs_inactive(ip); spl_fstrans_unmark(cookie); } #else static void zpl_drop_inode(struct inode *ip) { generic_delete_inode(ip); } static void zpl_clear_inode(struct inode *ip) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); zfs_inactive(ip); spl_fstrans_unmark(cookie); } static void zpl_inode_delete(struct inode *ip) { truncate_setsize(ip, 0); clear_inode(ip); } #endif /* HAVE_EVICT_INODE */ static void zpl_put_super(struct super_block *sb) { fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_umount(sb); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); } static int zpl_sync_fs(struct super_block *sb, int wait) { fstrans_cookie_t cookie; cred_t *cr = CRED(); int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_sync(sb, wait, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_statfs(struct dentry *dentry, struct kstatfs *statp) { fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_statvfs(dentry, statp); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } static int zpl_remount_fs(struct super_block *sb, int *flags, char *data) { zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data }; fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_remount(sb, flags, &zm); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } static int __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) { seq_printf(seq, ",%s", zfsvfs->z_flags & ZSB_XATTR ? 
"xattr" : "noxattr"); #ifdef CONFIG_FS_POSIX_ACL switch (zfsvfs->z_acl_type) { case ZFS_ACLTYPE_POSIXACL: seq_puts(seq, ",posixacl"); break; default: seq_puts(seq, ",noacl"); break; } #endif /* CONFIG_FS_POSIX_ACL */ return (0); } #ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY static int zpl_show_options(struct seq_file *seq, struct dentry *root) { return (__zpl_show_options(seq, root->d_sb->s_fs_info)); } #else static int zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp) { return (__zpl_show_options(seq, vfsp->mnt_sb->s_fs_info)); } #endif /* HAVE_SHOW_OPTIONS_WITH_DENTRY */ static int zpl_fill_super(struct super_block *sb, void *data, int silent) { zfs_mnt_t *zm = (zfs_mnt_t *)data; fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_domount(sb, zm, silent); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } static int zpl_test_super(struct super_block *s, void *data) { zfsvfs_t *zfsvfs = s->s_fs_info; objset_t *os = data; if (zfsvfs == NULL) return (0); return (os == zfsvfs->z_os); } static struct super_block * zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) { struct super_block *s; objset_t *os; int err; err = dmu_objset_hold(zm->mnt_osname, FTAG, &os); if (err) return (ERR_PTR(-err)); + /* + * The dsl pool lock must be released prior to calling sget(). + * It is possible sget() may block on the lock in grab_super() + * while deactivate_super() holds that same lock and waits for + * a txg sync. If the dsl_pool lock is held over sget() + * this can prevent the pool sync and cause a deadlock. + */ + dsl_pool_rele(dmu_objset_pool(os), FTAG); s = zpl_sget(fs_type, zpl_test_super, set_anon_super, flags, os); - dmu_objset_rele(os, FTAG); + dsl_dataset_rele(dmu_objset_ds(os), FTAG); + if (IS_ERR(s)) return (ERR_CAST(s)); if (s->s_root == NULL) { err = zpl_fill_super(s, zm, flags & SB_SILENT ? 
1 : 0); if (err) { deactivate_locked_super(s); return (ERR_PTR(err)); } s->s_flags |= SB_ACTIVE; } else if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return (ERR_PTR(-EBUSY)); } return (s); } #ifdef HAVE_FST_MOUNT static struct dentry * zpl_mount(struct file_system_type *fs_type, int flags, const char *osname, void *data) { zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); if (IS_ERR(sb)) return (ERR_CAST(sb)); return (dget(sb->s_root)); } #else static int zpl_get_sb(struct file_system_type *fs_type, int flags, const char *osname, void *data, struct vfsmount *mnt) { zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); if (IS_ERR(sb)) return (PTR_ERR(sb)); (void) simple_set_mnt(mnt, sb); return (0); } #endif /* HAVE_FST_MOUNT */ static void zpl_kill_sb(struct super_block *sb) { zfs_preumount(sb); kill_anon_super(sb); #ifdef HAVE_S_INSTANCES_LIST_HEAD sb->s_instances.next = &(zpl_fs_type.fs_supers); #endif /* HAVE_S_INSTANCES_LIST_HEAD */ } void zpl_prune_sb(int64_t nr_to_scan, void *arg) { struct super_block *sb = (struct super_block *)arg; int objects = 0; (void) -zfs_prune(sb, nr_to_scan, &objects); } #ifdef HAVE_NR_CACHED_OBJECTS static int zpl_nr_cached_objects(struct super_block *sb) { return (0); } #endif /* HAVE_NR_CACHED_OBJECTS */ #ifdef HAVE_FREE_CACHED_OBJECTS static void zpl_free_cached_objects(struct super_block *sb, int nr_to_scan) { /* noop */ } #endif /* HAVE_FREE_CACHED_OBJECTS */ const struct super_operations zpl_super_operations = { .alloc_inode = zpl_inode_alloc, .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, #ifdef HAVE_EVICT_INODE .evict_inode = zpl_evict_inode, #else .drop_inode = zpl_drop_inode, .clear_inode = zpl_clear_inode, .delete_inode = zpl_inode_delete, #endif /* HAVE_EVICT_INODE */ .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, 
.statfs = zpl_statfs, .remount_fs = zpl_remount_fs, .show_options = zpl_show_options, .show_stats = NULL, #ifdef HAVE_NR_CACHED_OBJECTS .nr_cached_objects = zpl_nr_cached_objects, #endif /* HAVE_NR_CACHED_OBJECTS */ #ifdef HAVE_FREE_CACHED_OBJECTS .free_cached_objects = zpl_free_cached_objects, #endif /* HAVE_FREE_CACHED_OBJECTS */ }; struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, #ifdef HAVE_FST_MOUNT .mount = zpl_mount, #else .get_sb = zpl_get_sb, #endif /* HAVE_FST_MOUNT */ .kill_sb = zpl_kill_sb, };