diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 281f58249ad5..27f179d826b9 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1,5256 +1,5257 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) iput() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call iput() within a tx then use zfs_iput_async(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * iput(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * iput(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* * Virus scanning is unsupported. It would be possible to add a hook * here to performance the required virus scan. This could be done * entirely in the kernel or potentially as an update to invoke a * scanning utility. */ static int zfs_vscan(struct inode *ip, cred_t *cr, int async) { return (0); } /* ARGSUSED */ int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Honor ZFS_APPENDONLY file attribute */ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Virus scan eligible files on open */ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (zfs_vscan(ip, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ int zfs_close(struct inode *ip, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(zfs_vscan(ip, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } #if defined(SEEK_HOLE) && defined(SEEK_DATA) /* * Lseek support for finding holes (cmd == SEEK_HOLE) and * data (cmd == SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey_common(struct inode *ip, int cmd, loff_t *off) { znode_t *zp = ITOZ(ip); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* file was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; return (0); } /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } int zfs_holey(struct inode *ip, int cmd, loff_t *off) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_holey_common(ip, cmd, off); ZFS_EXIT(zfsvfs); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ #if defined(_KERNEL) /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(struct inode *ip, int64_t start, int len, objset_t *os, uint64_t oid) { struct address_space *mp = ip->i_mapping; struct page *pp; uint64_t nbytes; int64_t off; void *pb; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { nbytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); (void) dmu_read(os, oid, start+off, nbytes, pb+off, DMU_READ_PREFETCH); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(struct inode *ip, int nbytes, uio_t *uio) { struct address_space *mp = ip->i_mapping; struct page *pp; znode_t *zp = ITOZ(ip); int64_t start, off; uint64_t bytes; int len = nbytes; int error = 0; void *pb; start = uio->uio_loffset; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { bytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { ASSERT(PageUptodate(pp)); unlock_page(pp); pb = kmap(pp); error = uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } #endif /* _KERNEL */ unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Read bytes from specified file into supplied buffer. * * IN: ip - inode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - FSYNC flags; used to provide FRSYNC semantics. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * inode - atime updated if byte count > 0 */ /* ARGSUSED */ int zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) { int error = 0; boolean_t frsync = B_FALSE; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } #ifdef FRSYNC /* * If we're in FRSYNC mode, sync out this znode before reading it. * Only do this for non-snapshots. * * Some platforms do not support FRSYNC and instead map it * to FSYNC, which results in unnecessary calls to zil_commit. We * only honor FRSYNC requests on platforms which support it. */ frsync = !!(ioflag & FRSYNC); #endif if (zfsvfs->z_log && (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ locked_range_t *lr = rangelock_enter(&zp->z_rangelock, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); ssize_t start_resid = n; #ifdef HAVE_UIO_ZEROCOPY xuio_t *xuio = NULL; if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(ip)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } #endif /* HAVE_UIO_ZEROCOPY */ while (n > 0) { ssize_t nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); if (zp->z_is_mapped && !(ioflag & O_DIRECT)) { error = mappedread(ip, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } int64_t nread = start_resid - n; dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); task_io_account_read(nread); out: rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: ip - inode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND flag set if in append mode. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ int zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) { int error = 0; ssize_t start_resid = uio->uio_resid; /* * Fasttrack empty write */ ssize_t n = start_resid; if (n == 0) return (0); rlim64_t limit = uio->uio_limit; if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); sa_bulk_attr_t bulk[4]; int count = 0; uint64_t mtime[2], ctime[2]; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM */ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Validate file offset */ offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } int max_blksz = zfsvfs->z_max_blksz; xuio_t *xuio = NULL; /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ #ifdef HAVE_UIO_ZEROCOPY if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else #endif if (uio_prefaultpages(MIN(n, max_blksz), uio)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EFAULT)); } /* * If in append mode, set the io offset pointer to eof. */ locked_range_t *lr; if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (woff >= limit) { rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ int write_eof = (woff + n > zp->z_size); uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; #ifdef HAVE_UIO_ZEROCOPY int i_iov = 0; const iovec_t *iovp = uio->uio_iov; ASSERTV(int iovcnt = uio->uio_iovcnt); #endif /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { woff = uio->uio_loffset; if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { error = SET_ERROR(EDQUOT); break; } arc_buf_t *abuf = NULL; const iovec_t *aiov = NULL; if (xuio) { #ifdef HAVE_UIO_ZEROCOPY ASSERT(i_iov < iovcnt); ASSERT3U(uio->uio_segflg, !=, UIO_BVEC); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; #endif } else if (n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if ((error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); } /* * Start a transaction. */ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); rangelock_reduce(lr, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = uio->uio_resid; uio->uio_fault_disable = B_TRUE; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); + uio->uio_fault_disable = B_FALSE; if (error == EFAULT) { dmu_tx_commit(tx); if (uio_prefaultpages(MIN(n, max_blksz), uio)) { break; } continue; } else if (error != 0) { dmu_tx_commit(tx); break; } tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, /* cppcheck-suppress nullPointer */ aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); error = dmu_assign_arcbuf_by_dbuf( sa_get_db(zp->z_sa_hdl), woff, abuf, tx); if (error != 0) { dmu_return_arcbuf(abuf); dmu_tx_commit(tx); break; } } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) { update_pages(ip, woff, tx_bytes, zfsvfs->z_os, zp->z_id); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); uint32_t uid = KUID_TO_SUID(ip->i_uid); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); ip->i_mode = newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, NULL, NULL); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; if (!xuio && n > 0) { if (uio_prefaultpages(MIN(n, max_blksz), uio)) { error = EFAULT; break; } } } zfs_inode_update(zp); rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); int64_t nwritten = start_resid - uio->uio_resid; dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); ZFS_EXIT(zfsvfs); return (0); } /* * Drop a reference on the passed inode asynchronously. This ensures * that the caller will never drop the last reference on an inode in * the current context. Doing so while holding open a tx could result * in a deadlock if iput_final() re-enters the filesystem code. */ void zfs_iput_async(struct inode *ip) { objset_t *os = ITOZSB(ip)->z_os; ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); if (atomic_read(&ip->i_count) == 1) VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)), (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); else iput(ip); } /* ARGSUSED */ void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_iput_async(ZTOI(zp)); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_iput_async(ZTOI(zp)); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ int zfs_access(struct inode *ip, int mode, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held inode reference for it. * * IN: dip - inode of directory to search. * nm - name of entry to lookup. * flags - LOOKUP_XATTR set if looking for an attribute. * cr - credentials of caller. * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: ipp - inode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ int zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { znode_t *zdp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (!S_ISDIR(dip->i_mode)) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *ipp = dip; igrab(*ipp); return (0); } return (error); } } ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *ipp = NULL; if (flags & LOOKUP_XATTR) { /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0, B_FALSE, cr))) { iput(*ipp); *ipp = NULL; } ZFS_EXIT(zfsvfs); return (error); } if (!S_ISDIR(dip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp); if ((error == 0) && (*ipp)) zfs_inode_update(ITOZ(*ipp)); ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the ip of the created or trunc'd file. * * IN: dip - inode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set * * OUT: ipp - inode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated if new entry created * ip - ctime|mtime always, atime if new */ /* ARGSUSED */ int zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *ipp = NULL; if (*name == '\0') { /* * Null component name refers to the directory itself. */ igrab(dip); zp = dzp; dl = NULL; error = 0; } else { /* possible igrab(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); ZFS_EXIT(zfsvfs); return (error); } } if (zp == NULL) { uint64_t txtype; uint64_t projid = ZFS_DEFAULT_PROJID; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); have_acl = B_FALSE; /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if (S_ISREG(ZTOI(zp)->i_mode) && (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ if (dl) { zfs_dirent_unlock(dl); dl = NULL; } error = zfs_freesp(zp, 0, 0, mode, TRUE); } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) iput(ZTOI(zp)); } else { zfs_inode_update(dzp); zfs_inode_update(zp); *ipp = ZTOI(zp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp = NULL, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); objset_t *os; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; uint64_t projid = ZFS_DEFAULT_PROJID; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *ipp = NULL; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* Add to unlinked set */ zp->z_unlinked = 1; zfs_unlinked_add(zp, tx); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); out: if (error) { if (zp) iput(ZTOI(zp)); } else { zfs_inode_update(dzp); zfs_inode_update(zp); *ipp = ZTOI(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dip - inode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * dip - ctime|mtime * ip - ctime (if nlink > 0) */ uint64_t null_xattr = 0; /*ARGSUSED*/ int zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags) { znode_t *zp, *dzp = ITOZ(dip); znode_t *xzp; struct inode *ip; zfsvfs_t *zfsvfs = ITOZSB(dip); zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; uint64_t links; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp))) { if (realnmp) pn_free(realnmp); ZFS_EXIT(zfsvfs); return (error); } ip = ZTOI(zp); if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } /* * Need to use rmdir for removing directories. */ if (S_ISDIR(ip->i_mode)) { error = SET_ERROR(EPERM); goto out; } mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped); mutex_exit(&zp->z_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the inode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); iput(ip); if (xzp) iput(ZTOI(xzp)); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); iput(ip); if (xzp) iput(ZTOI(xzp)); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = 1; clear_nlink(ZTOI(xzp)); links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } /* * Add to the unlinked set because a new reference could be * taken concurrently resulting in a deferred destruction. */ zfs_unlinked_add(zp, tx); mutex_exit(&zp->z_lock); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); zfs_inode_update(dzp); zfs_inode_update(zp); if (delete_now) iput(ip); else zfs_iput_async(ip); if (xzp) { zfs_inode_update(xzp); zfs_iput_async(ZTOI(xzp)); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dip using the name * provided. Return a pointer to the inserted directory. * * IN: dip - inode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * vsecp - ACL to be set * * OUT: ipp - inode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dip - ctime|mtime updated * ipp - ctime|mtime|atime updated */ /*ARGSUSED*/ int zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, cred_t *cr, int flags, vsecattr_t *vsecp) { znode_t *zp, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(S_ISDIR(vap->va_mode)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (dirname == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *ipp = NULL; if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *ipp = ZTOI(zp); txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (error != 0) { iput(ZTOI(zp)); } else { zfs_inode_update(dzp); zfs_inode_update(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dip - inode of directory to remove from. * name - name of directory to be removed. * cwd - inode of current working directory. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ /*ARGSUSED*/ int zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr, int flags) { znode_t *dzp = ITOZ(dip); znode_t *zp; struct inode *ip; zfsvfs_t *zfsvfs = ITOZSB(dip); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { ZFS_EXIT(zfsvfs); return (error); } ip = ZTOI(zp); if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } if (!S_ISDIR(ip->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } if (ip == cwd) { error = SET_ERROR(EINVAL); goto out; } /* * Grab a lock on the directory to make sure that no one is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); iput(ip); goto top; } dmu_tx_abort(tx); iput(ip); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); zfs_inode_update(dzp); zfs_inode_update(zp); iput(ip); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * dirent buffer from the given directory cursor position. * * IN: ip - inode of directory to read. * dirent - buffer for directory entries. * * OUT: dirent - filler buffer of directory entries. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; zap_cursor_t zc; zap_attribute_t zap; int error; uint8_t prefetch; uint8_t type; int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) goto out; /* * Quit if directory has been removed (posix) */ if (zp->z_unlinked) goto out; error = 0; os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Transform to file-system independent format */ while (!done) { uint64_t objnum; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if (error == ENOENT) break; else goto update; } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (zap.za_integer_length != 8 || zap.za_num_integers == 0) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld, " "length = %d, num = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset, zap.za_integer_length, (u_longlong_t)zap.za_num_integers); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); type = ZFS_DIRENT_TYPE(zap.za_first_integer); } done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), objnum, type); if (done) break; /* Prefetch znode */ if (prefetch) { dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); } /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } ctx->pos = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ update: zap_cursor_fini(&zc); if (error == ENOENT) error = 0; out: ZFS_EXIT(zfsvfs); return (error); } ulong_t zfs_fsync_sync_cnt = 4; int zfs_fsync(struct inode *ip, int syncflag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } tsd_set(zfs_fsyncer_key, NULL); return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: ip - inode of file. * vap - va_mask identifies requested attributes. * If ATTR_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds) */ /* ARGSUSED */ int zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error = 0; uint64_t links; uint64_t atime[2], mtime[2], ctime[2]; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[3]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr))) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ mutex_enter(&zp->z_lock); vap->va_type = vn_mode_to_vtype(zp->z_mode); vap->va_mode = zp->z_mode; vap->va_fsid = ZTOI(zp)->i_sb->s_dev; vap->va_nodeid = zp->z_id; if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) links = ZTOI(zp)->i_nlink + 1; else links = ZTOI(zp)->i_nlink; vap->va_nlink = MIN(links, ZFS_LINK_MAX); vap->va_size = i_size_read(ip); vap->va_rdev = ip->i_rdev; vap->va_seq = ip->i_generation; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && S_ISREG(ip->i_mode)) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), times, sizeof (times)); ZFS_TIME_DECODE(&xoap->xoa_createtime, times); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = ip->i_generation; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { xoap->xoa_projinherit = ((zp->z_pflags & ZFS_PROJINHERIT) != 0); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { xoap->xoa_projid = zp->z_projid; XVA_SET_RTN(xvap, XAT_PROJID); } } ZFS_TIME_DECODE(&vap->va_atime, atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); mutex_exit(&zp->z_lock); sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source * for most of the attributes. However, the znode currently has the * authoritative atime, blksize, and block count. * * IN: ip - inode of file. * * OUT: sp - kstat values. * * RETURN: 0 (always succeeds) */ /* ARGSUSED */ int zfs_getattr_fast(struct inode *ip, struct kstat *sp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); generic_fillattr(ip, sp); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); sp->blksize = blksize; sp->blocks = nblocks; if (unlikely(zp->z_blksz == 0)) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ sp->blksize = zfsvfs->z_max_blksz; } mutex_exit(&zp->z_lock); /* * Required to prevent NFS client from detecting different inode * numbers of snapshot root dentry before and after snapshot mount. */ if (zfsvfs->z_issnap) { if (ip->i_sb->s_root->d_inode == ip) sp->ino = ZFSCTL_INO_SNAPDIRS - dmu_objset_id(zfsvfs->z_os); } ZFS_EXIT(zfsvfs); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { struct inode *dxip = ZTOI(dzp); struct inode *xip = NULL; zfsvfs_t *zfsvfs = ITOZSB(dxip); objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t zap; zfs_dirlock_t *dl; znode_t *zp; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { count = 0; if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, ZEXISTS, NULL, NULL); if (err == ENOENT) goto next; if (err) break; xip = ZTOI(zp); if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, TXG_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { xip->i_uid = dxip->i_uid; uid = zfs_uid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); } if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { xip->i_gid = dxip->i_gid; gid = zfs_gid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); } if (zp->z_projid != dzp->z_projid) { if (!(zp->z_pflags & ZFS_PROJID)) { zp->z_pflags |= ZFS_PROJID; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); } zp->z_projid = dzp->z_projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } tx = NULL; if (err != 0 && err != ENOENT) break; next: if (xip) { iput(xip); xip = NULL; zfs_dirent_unlock(dl); } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (xip) { iput(xip); zfs_dirent_unlock(dl); } zap_cursor_fini(&zc); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: ip - inode of file to be modified. * vap - new attribute values. * If ATTR_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ int zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os = zfsvfs->z_os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t *tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2], atime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2 = 0; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t *bulk, *xattr_bulk; int count = 0, xattr_count = 0, bulks = 8; if (mask == 0) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If this is a xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); if (xoap != NULL && (mask & ATTR_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } } zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); xva_init(tmpxvattr); bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { err = SET_ERROR(EPERM); goto out3; } if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { err = SET_ERROR(EPERM); goto out3; } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (ATTR_ATIME | ATTR_MTIME)) { if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { err = SET_ERROR(EOVERFLOW); goto out3; } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfs_is_readonly(zfsvfs)) { err = SET_ERROR(EROFS); goto out3; } /* * First validate permissions */ if (mask & ATTR_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); if (err) goto out3; /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) goto out3; } if (mask & (ATTR_ATIME|ATTR_MTIME) || ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & ATTR_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (ATTR_UID|ATTR_GID)) && take_owner && take_group) || ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ (void) secpolicy_setid_clear(vap, cr); trim_mask = (mask & (ATTR_UID|ATTR_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & ATTR_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((!S_ISREG(ip->i_mode) && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); err = SET_ERROR(EPERM); goto out3; } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, &oldva, cr); if (err) goto out3; trim_mask |= ATTR_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) goto out3; if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); if (err) goto out2; } if (mask & ATTR_UID) { new_kuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid)) { if (attrzp) iput(ZTOI(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & ATTR_GID) { new_kgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid)) { if (attrzp) iput(ZTOI(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) iput(ZTOI(attrzp)); err = EDQUOT; goto out2; } } tx = dmu_tx_create(os); if (mask & ATTR_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); zfs_acl_chmod_setattr(zp, &aclp, new_mode); mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (ATTR_UID|ATTR_GID)) { if (mask & ATTR_UID) { ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); new_uid = zfs_uid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); } } if (mask & ATTR_GID) { ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); new_gid = zfs_gid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); } } if (!(mask & ATTR_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & ATTR_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = ZTOI(zp)->i_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = 0; ZFS_TIME_ENCODE(&ip->i_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime, ZTOI(zp)->i_sb->s_time_gran); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime, ZTOI(zp)->i_sb->s_time_gran); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } if (attrzp && mask) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & ATTR_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(S_ISREG(ip->i_mode)); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (attrzp) iput(ZTOI(attrzp)); if (err == ERESTART) goto top; } else { if (count > 0) err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) err2 = zfs_setattr_dir(attrzp); iput(ZTOI(attrzp)); } zfs_inode_update(zp); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); ZFS_EXIT(zfsvfs); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) zfs_iput_async(ZTOI(zl->zl_znode)); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = ZTOZSB(zp)->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdip - Source directory containing the "old entry". * snm - Old entry name. * tdip - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdip,tdip - ctime|mtime updated */ /*ARGSUSED*/ int zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, cred_t *cr, int flags) { znode_t *tdzp, *szp, *tzp; znode_t *sdzp = ITOZ(sdip); zfsvfs_t *zfsvfs = ITOZSB(sdip); zilog_t *zilog; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(sdzp); zilog = zfsvfs->z_log; tdzp = ITOZ(tdip); ZFS_VERIFY_ZP(tdzp); /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ ZFS_EXIT(zfsvfs); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) iput(ZTOI(tzp)); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); return (serr); } if (terr) { zfs_dirent_unlock(sdl); iput(ZTOI(szp)); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; ZFS_EXIT(zfsvfs); return (terr); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if (S_ISDIR(ZTOI(szp)->i_mode)) { if (!S_ISDIR(ZTOI(tzp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } } else { if (S_ISDIR(ZTOI(tzp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); iput(ZTOI(szp)); if (tzp) iput(ZTOI(tzp)); goto top; } dmu_tx_abort(tx); iput(ZTOI(szp)); if (tzp) iput(ZTOI(tzp)); ZFS_EXIT(zfsvfs); return (error); } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; if (tdzp->z_pflags & ZFS_PROJINHERIT) szp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL), ==, 0); } } else { /* * If we had removed the existing target, subsequent * call to zfs_link_create() to add back the same entry * but, the new dnode (szp) should not fail. */ ASSERT(tzp == NULL); } } dmu_tx_commit(tx); out: if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); zfs_inode_update(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) zfs_inode_update(tdzp); zfs_inode_update(szp); iput(ZTOI(szp)); if (tzp) { zfs_inode_update(tzp); iput(ZTOI(tzp)); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dip - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * target - Target path of new symlink. * * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ /*ARGSUSED*/ int zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, struct inode **ipp, cred_t *cr, int flags) { znode_t *zp, *dzp = ITOZ(dip); zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ITOZSB(dip); zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(S_ISLNK(vap->va_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } top: *ipp = NULL; /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_inode_update(dzp); zfs_inode_update(zp); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { *ipp = ZTOI(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); } else { iput(ZTOI(zp)); } ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by ip. * * IN: ip - inode of symbolic link * uio - structure to contain the link path. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated */ /* ARGSUSED */ int zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdip referencing sip. * * IN: tdip - Directory to contain new entry. * sip - inode of new entry. * name - name of new entry. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdip - ctime|mtime updated * sip - ctime updated */ /* ARGSUSED */ int zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr, int flags) { znode_t *dzp = ITOZ(tdip); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = ITOZSB(tdip); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; boolean_t is_tmpfile = 0; uint64_t txg; #ifdef HAVE_TMPFILE is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); #endif ASSERT(S_ISDIR(tdip->i_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } szp = ITOZ(sip); ZFS_VERIFY_ZP(szp); /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (is_tmpfile) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ if (is_tmpfile) szp->z_unlinked = 0; error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; /* * tmpfile is created to be in z_unlinkedobj, so remove it. * Also, we don't log in ZIL, be cause all previous file * operation on the tmpfile are ignored by ZIL. Instead we * always wait for txg to sync to make sure all previous * operation are sync safe. */ if (is_tmpfile) { VERIFY(zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } } else if (is_tmpfile) { /* restore z_unlinked since when linking failed */ szp->z_unlinked = 1; } txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (is_tmpfile) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); zfs_inode_update(dzp); zfs_inode_update(szp); ZFS_EXIT(zfsvfs); return (error); } static void zfs_putpage_commit_cb(void *arg) { struct page *pp = arg; ClearPageError(pp); end_page_writeback(pp); } /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * * IN: ip - page mapped for inode. * pp - page to push (page is locked) * wbc - writeback control data * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated */ /* ARGSUSED */ int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t offset; loff_t pgoff; unsigned int pglen; dmu_tx_t *tx; caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); ASSERT(PageLocked(pp)); pgoff = page_offset(pp); /* Page byte-offset in file */ offset = i_size_read(ip); /* File length in bytes */ pglen = MIN(PAGE_SIZE, /* Page length in bytes */ P2ROUNDUP(offset, PAGE_SIZE)-pgoff); /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); ZFS_EXIT(zfsvfs); return (0); } /* Truncate page length to end of file */ if (pgoff + pglen > offset) pglen = offset - pgoff; #if 0 /* * FIXME: Allow mmap writes past its quota. The correct fix * is to register a page_mkwrite() handler to count the page * against its quota when it is about to be dirtied. */ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { err = EDQUOT; } #endif /* * The ordering here is critical and must adhere to the following * rules in order to avoid deadlocking in either zfs_read() or * zfs_free_range() due to a lock inversion. * * 1) The page must be unlocked prior to acquiring the range lock. * This is critical because zfs_read() calls find_lock_page() * which may block on the page lock while holding the range lock. * * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. * * This presents a problem because upon entering this function the * page lock is already held. To safely acquire the range lock the * page lock must be dropped. This creates a window where another * process could truncate, invalidate, dirty, or write out the page. * * Therefore, after successfully reacquiring the range and page locks * the current page state is checked. In the common case everything * will be as is expected and it can be written out. However, if * the page state has changed it must be handled accordingly. */ mapping = pp->mapping; redirty_page_for_writepage(wbc, pp); unlock_page(pp); locked_range_t *lr = rangelock_enter(&zp->z_rangelock, pgoff, pglen, RL_WRITER); lock_page(pp); /* Page mapping changed or it was no longer dirty, we're done */ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } /* Another process started write block if required */ if (PageWriteback(pp)) { unlock_page(pp); rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(pp); ZFS_EXIT(zfsvfs); return (0); } /* Clear the dirty flag the required locks are held */ if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; set_page_writeback(pp); unlock_page(pp); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); __set_page_dirty_nobuffers(pp); ClearPageError(pp); end_page_writeback(pp); rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (err); } va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); zp->z_atime_dirty = 0; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, zfs_putpage_commit_cb, pp); dmu_tx_commit(tx); rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ zil_commit(zfsvfs->z_log, zp->z_id); } ZFS_EXIT(zfsvfs); return (err); } /* * Update the system attributes when the inode has been dirtied. For the * moment we only update the mode, atime, mtime, and ctime. */ int zfs_dirty_inode(struct inode *ip, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); #ifdef I_DIRTY_TIME /* * This is the lazytime semantic indroduced in Linux 4.0 * This flag will only be called from update_time when lazytime is set. * (Note, I_DIRTY_SYNC will also set if not lazytime) * Fortunately mtime and ctime are managed within ZFS itself, so we * only need to dirty atime. */ if (flags == I_DIRTY_TIME) { zp->z_atime_dirty = 1; goto out; } #endif tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } mutex_enter(&zp->z_lock); zp->z_atime_dirty = 0; SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_atime, atime); ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); mode = ip->i_mode; zp->z_mode = mode; error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); mutex_exit(&zp->z_lock); dmu_tx_commit(tx); out: ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. rollback */ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { need_unlock = 1; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { ZFS_TIME_ENCODE(&ip->i_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); zp->z_atime_dirty = 0; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Bounds-check the seek operation. * * IN: ip - inode seeking within * ooff - old file offset * noffp - pointer to new file offset * ct - caller context * * RETURN: 0 if success * EINVAL if new offset invalid */ /* ARGSUSED */ int zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp) { if (S_ISDIR(ip->i_mode)) return (0); return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); } /* * Fill pages with data from the disk. */ static int zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; struct page *cur_pp; u_offset_t io_off, total; size_t io_len; loff_t i_size; unsigned page_idx; int err; os = zfsvfs->z_os; io_len = nr_pages << PAGE_SHIFT; i_size = i_size_read(ip); io_off = page_offset(pl[0]); if (io_off + io_len > i_size) io_len = i_size - io_off; /* * Iterate over list of pages and read each page individually. */ page_idx = 0; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { caddr_t va; cur_pp = pl[page_idx++]; va = kmap(cur_pp); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, DMU_READ_PREFETCH); kunmap(cur_pp); if (err) { /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } } return (0); } /* * Uses zfs_fillpage to read data from the file and fill the pages. * * IN: ip - inode of file to get data from. * pl - list of pages to read * nr_pages - number of pages to read * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int err; if (pl == NULL) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); err = zfs_fillpage(ip, pl, nr_pages); ZFS_EXIT(zfsvfs); return (err); } /* * Check ZFS specific permissions to memory map a section of a file. * * IN: ip - inode of the file to mmap * off - file offset * addrp - start address in memory region * len - length of memory region * vm_flags- address flags * * RETURN: 0 if success * error code if failure */ /*ARGSUSED*/ int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((vm_flags & VM_WRITE) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENXIO)); } ZFS_EXIT(zfsvfs); return (0); } /* * convoff - converts the given data (start, whence) to the * given whence. */ int convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) { vattr_t vap; int error; if ((lckdat->l_whence == SEEK_END) || (whence == SEEK_END)) { if ((error = zfs_getattr(ip, &vap, 0, CRED()))) return (error); } switch (lckdat->l_whence) { case SEEK_CUR: lckdat->l_start += offset; break; case SEEK_END: lckdat->l_start += vap.va_size; /* FALLTHRU */ case SEEK_SET: break; default: return (SET_ERROR(EINVAL)); } if (lckdat->l_start < 0) return (SET_ERROR(EINVAL)); switch (whence) { case SEEK_CUR: lckdat->l_start -= offset; break; case SEEK_END: lckdat->l_start -= vap.va_size; /* FALLTHRU */ case SEEK_SET: break; default: return (SET_ERROR(EINVAL)); } lckdat->l_whence = (short)whence; return (0); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: ip - inode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * ip - ctime|mtime updated */ /* ARGSUSED */ int zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t off, len; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (cmd != F_FREESP) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } if ((error = convoff(ip, bfp, SEEK_SET, offset))) { ZFS_EXIT(zfsvfs); return (error); } if (bfp->l_len < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); ZFS_EXIT(zfsvfs); return (0); } /*ARGSUSED*/ int zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } #ifdef HAVE_UIO_ZEROCOPY /* * Tunable, both must be a power of 2. * * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf * zcr_blksz_max: if set to less than the file block size, allow loaning out of * an arcbuf for a partial block read */ int zcr_blksz_min = (1 << 10); /* 1K */ int zcr_blksz_max = (1 << 17); /* 128K */ /*ARGSUSED*/ static int zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int max_blksz = zfsvfs->z_max_blksz; uio_t *uio = &xuio->xu_uio; ssize_t size = uio->uio_resid; offset_t offset = uio->uio_loffset; int blksz; int fullblk, i; arc_buf_t *abuf; ssize_t maxsize; int preamble, postamble; if (xuio->xu_type != UIOTYPE_ZEROCOPY) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); switch (ioflag) { case UIO_WRITE: /* * Loan out an arc_buf for write if write size is bigger than * max_blksz, and the file's block size is also max_blksz. */ blksz = max_blksz; if (size < blksz || zp->z_blksz != blksz) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Caller requests buffers for write before knowing where the * write offset might be (e.g. NFS TCP write). */ if (offset == -1) { preamble = 0; } else { preamble = P2PHASE(offset, blksz); if (preamble) { preamble = blksz - preamble; size -= preamble; } } postamble = P2PHASE(size, blksz); size -= postamble; fullblk = size / blksz; (void) dmu_xuio_init(xuio, (preamble != 0) + fullblk + (postamble != 0)); /* * Have to fix iov base/len for partial buffers. They * currently represent full arc_buf's. */ if (preamble) { /* data begins in the middle of the arc_buf */ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz); ASSERT(abuf); (void) dmu_xuio_add(xuio, abuf, blksz - preamble, preamble); } for (i = 0; i < fullblk; i++) { abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz); ASSERT(abuf); (void) dmu_xuio_add(xuio, abuf, 0, blksz); } if (postamble) { /* data ends in the middle of the arc_buf */ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz); ASSERT(abuf); (void) dmu_xuio_add(xuio, abuf, 0, postamble); } break; case UIO_READ: /* * Loan out an arc_buf for read if the read size is larger than * the current file block size. Block alignment is not * considered. Partial arc_buf will be loaned out for read. */ blksz = zp->z_blksz; if (blksz < zcr_blksz_min) blksz = zcr_blksz_min; if (blksz > zcr_blksz_max) blksz = zcr_blksz_max; /* avoid potential complexity of dealing with it */ if (blksz > max_blksz) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } maxsize = zp->z_size - uio->uio_loffset; if (size > maxsize) size = maxsize; if (size < blksz) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } break; default: ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } uio->uio_extflg = UIO_XUIO; XUIO_XUZC_RW(xuio) = ioflag; ZFS_EXIT(zfsvfs); return (0); } /*ARGSUSED*/ static int zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) { int i; arc_buf_t *abuf; int ioflag = XUIO_XUZC_RW(xuio); ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); i = dmu_xuio_cnt(xuio); while (i-- > 0) { abuf = dmu_xuio_arcbuf(xuio, i); /* * if abuf == NULL, it must be a write buffer * that has been returned in zfs_write(). */ if (abuf) dmu_return_arcbuf(abuf); ASSERT(abuf || ioflag == UIO_WRITE); } dmu_xuio_fini(xuio); return (0); } #endif /* HAVE_UIO_ZEROCOPY */ #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_getattr); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); EXPORT_SYMBOL(zfs_symlink); EXPORT_SYMBOL(zfs_readlink); EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); module_param(zfs_read_chunk_size, long, 0644); MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); #endif diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c index 731836c2c71e..acad4670d1c4 100644 --- a/module/zfs/zpl_file.c +++ b/module/zfs/zpl_file.c @@ -1,1075 +1,1075 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ #ifdef CONFIG_COMPAT #include #endif #include #include #include #include #include #include static int zpl_open(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; error = generic_file_open(ip, filp); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_release(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); if (ITOZ(ip)->z_atime_dirty) zfs_mark_inode_dirty(ip); crhold(cr); error = -zfs_close(ip, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_iterate(struct file *filp, zpl_dir_context_t *ctx) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_readdir(file_inode(filp), ctx, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ #if defined(HAVE_FSYNC_WITH_DENTRY) /* * Linux 2.6.x - 2.6.34 API, * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *' * to the fops->fsync() hook. For this reason, we must be careful not to * use filp unconditionally. */ static int zpl_fsync(struct file *filp, struct dentry *dentry, int datasync) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(dentry->d_inode, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { struct file *filp = kiocb->ki_filp; return (zpl_fsync(filp, file_dentry(filp), datasync)); } #endif #elif defined(HAVE_FSYNC_WITHOUT_DENTRY) /* * Linux 2.6.35 - 3.0 API, * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed * redundant. The dentry is still accessible via filp->f_path.dentry, * and we are guaranteed that filp will never be NULL. */ static int zpl_fsync(struct file *filp, int datasync) { struct inode *inode = filp->f_mapping->host; cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(inode, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, datasync)); } #endif #elif defined(HAVE_FSYNC_RANGE) /* * Linux 3.1 - 3.x API, * As of 3.1 the responsibility to call filemap_write_and_wait_range() has * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex * lock is no longer held by the caller, for zfs we don't require the lock * to be held so we don't acquire it. */ static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(inode, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync)); } #endif #else #error "Unsupported fops->fsync() implementation" #endif static inline int zfs_io_flags(struct kiocb *kiocb) { int flags = 0; #if defined(IOCB_DSYNC) if (kiocb->ki_flags & IOCB_DSYNC) flags |= FDSYNC; #endif #if defined(IOCB_SYNC) if (kiocb->ki_flags & IOCB_SYNC) flags |= FSYNC; #endif #if defined(IOCB_APPEND) if (kiocb->ki_flags & IOCB_APPEND) flags |= FAPPEND; #endif #if defined(IOCB_DIRECT) if (kiocb->ki_flags & IOCB_DIRECT) flags |= FDIRECT; #endif return (flags); } static ssize_t zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, cred_t *cr, size_t skip) { ssize_t read; - uio_t uio; + uio_t uio = { { 0 }, 0 }; int error; fstrans_cookie_t cookie; uio.uio_iov = iovp; - uio.uio_skip = skip; - uio.uio_resid = count; uio.uio_iovcnt = nr_segs; uio.uio_loffset = *ppos; - uio.uio_limit = MAXOFFSET_T; uio.uio_segflg = segment; + uio.uio_limit = MAXOFFSET_T; + uio.uio_resid = count; + uio.uio_skip = skip; cookie = spl_fstrans_mark(); error = -zfs_read(ip, &uio, flags, cr); spl_fstrans_unmark(cookie); if (error < 0) return (error); read = count - uio.uio_resid; *ppos += read; return (read); } inline ssize_t zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, uio_seg_t segment, int flags, cred_t *cr) { struct iovec iov; iov.iov_base = (void *)buf; iov.iov_len = len; return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment, flags, cr, 0)); } static ssize_t zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp, unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) { cred_t *cr = CRED(); struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip)); ssize_t read; unsigned int f_flags = filp->f_flags; f_flags |= zfs_io_flags(kiocb); crhold(cr); read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count, nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); crfree(cr); /* * If relatime is enabled, call file_accessed() only if * zfs_relatime_need_update() is true. This is needed since datasets * with inherited "relatime" property aren't necessarily mounted with * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what * relatime test in VFS by relatime_need_update() is based on. */ if (!IS_NOATIME(ip) && zfsvfs->z_relatime) { if (zfs_relatime_need_update(ip)) file_accessed(filp); } else { file_accessed(filp); } return (read); } #if defined(HAVE_VFS_RW_ITERATE) static ssize_t zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { ssize_t ret; uio_seg_t seg = UIO_USERSPACE; if (to->type & ITER_KVEC) seg = UIO_SYSSPACE; if (to->type & ITER_BVEC) seg = UIO_BVEC; ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs, iov_iter_count(to), seg, to->iov_offset); if (ret > 0) iov_iter_advance(to, ret); return (ret); } #else static ssize_t zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { ssize_t ret; size_t count; ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); return (zpl_iter_read_common(kiocb, iovp, nr_segs, count, UIO_USERSPACE, 0)); } #endif /* HAVE_VFS_RW_ITERATE */ static ssize_t zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, cred_t *cr, size_t skip) { ssize_t wrote; - uio_t uio; + uio_t uio = { { 0 }, 0 }; int error; fstrans_cookie_t cookie; if (flags & O_APPEND) *ppos = i_size_read(ip); uio.uio_iov = iovp; - uio.uio_skip = skip; - uio.uio_resid = count; uio.uio_iovcnt = nr_segs; uio.uio_loffset = *ppos; - uio.uio_limit = MAXOFFSET_T; uio.uio_segflg = segment; + uio.uio_limit = MAXOFFSET_T; + uio.uio_resid = count; + uio.uio_skip = skip; cookie = spl_fstrans_mark(); error = -zfs_write(ip, &uio, flags, cr); spl_fstrans_unmark(cookie); if (error < 0) return (error); wrote = count - uio.uio_resid; *ppos += wrote; return (wrote); } inline ssize_t zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, uio_seg_t segment, int flags, cred_t *cr) { struct iovec iov; iov.iov_base = (void *)buf; iov.iov_len = len; return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment, flags, cr, 0)); } static ssize_t zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp, unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) { cred_t *cr = CRED(); struct file *filp = kiocb->ki_filp; ssize_t wrote; unsigned int f_flags = filp->f_flags; f_flags |= zfs_io_flags(kiocb); crhold(cr); wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count, nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); crfree(cr); return (wrote); } #if defined(HAVE_VFS_RW_ITERATE) static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { size_t count; ssize_t ret; uio_seg_t seg = UIO_USERSPACE; #ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB struct file *file = kiocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *ip = mapping->host; int isblk = S_ISBLK(ip->i_mode); count = iov_iter_count(from); ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk); if (ret) return (ret); #else /* * XXX - ideally this check should be in the same lock region with * write operations, so that there's no TOCTTOU race when doing * append and someone else grow the file. */ ret = generic_write_checks(kiocb, from); if (ret <= 0) return (ret); count = ret; #endif if (from->type & ITER_KVEC) seg = UIO_SYSSPACE; if (from->type & ITER_BVEC) seg = UIO_BVEC; ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, count, seg, from->iov_offset); if (ret > 0) iov_iter_advance(from, ret); return (ret); } #else static ssize_t zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { struct file *file = kiocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *ip = mapping->host; int isblk = S_ISBLK(ip->i_mode); size_t count; ssize_t ret; ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ); if (ret) return (ret); ret = generic_write_checks(file, &pos, &count, isblk); if (ret) return (ret); return (zpl_iter_write_common(kiocb, iovp, nr_segs, count, UIO_USERSPACE, 0)); } #endif /* HAVE_VFS_RW_ITERATE */ #if defined(HAVE_VFS_RW_ITERATE) static ssize_t zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter) { if (rw == WRITE) return (zpl_iter_write(kiocb, iter)); else return (zpl_iter_read(kiocb, iter)); } #if defined(HAVE_VFS_DIRECT_IO_ITER) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { ASSERT3S(pos, ==, kiocb->ki_pos); return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { ASSERT3S(pos, ==, kiocb->ki_pos); return (zpl_direct_IO_impl(rw, kiocb, iter)); } #else #error "Unknown direct IO interface" #endif #else #if defined(HAVE_VFS_DIRECT_IO_IOVEC) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp, loff_t pos, unsigned long nr_segs) { if (rw == WRITE) return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); else return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); } #else #error "Unknown direct IO interface" #endif #endif /* HAVE_VFS_RW_ITERATE */ static loff_t zpl_llseek(struct file *filp, loff_t offset, int whence) { #if defined(SEEK_HOLE) && defined(SEEK_DATA) fstrans_cookie_t cookie; if (whence == SEEK_DATA || whence == SEEK_HOLE) { struct inode *ip = filp->f_mapping->host; loff_t maxbytes = ip->i_sb->s_maxbytes; loff_t error; spl_inode_lock_shared(ip); cookie = spl_fstrans_mark(); error = -zfs_holey(ip, whence, &offset); spl_fstrans_unmark(cookie); if (error == 0) error = lseek_execute(filp, ip, offset, maxbytes); spl_inode_unlock_shared(ip); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ return (generic_file_llseek(filp, offset, whence)); } /* * It's worth taking a moment to describe how mmap is implemented * for zfs because it differs considerably from other Linux filesystems. * However, this issue is handled the same way under OpenSolaris. * * The issue is that by design zfs bypasses the Linux page cache and * leaves all caching up to the ARC. This has been shown to work * well for the common read(2)/write(2) case. However, mmap(2) * is problem because it relies on being tightly integrated with the * page cache. To handle this we cache mmap'ed files twice, once in * the ARC and a second time in the page cache. The code is careful * to keep both copies synchronized. * * When a file with an mmap'ed region is written to using write(2) * both the data in the ARC and existing pages in the page cache * are updated. For a read(2) data will be read first from the page * cache then the ARC if needed. Neither a write(2) or read(2) will * will ever result in new pages being added to the page cache. * * New pages are added to the page cache only via .readpage() which * is called when the vfs needs to read a page off disk to back the * virtual memory region. These pages may be modified without * notifying the ARC and will be written out periodically via * .writepage(). This will occur due to either a sync or the usual * page aging behavior. Note because a read(2) of a mmap'ed file * will always check the page cache first even when the ARC is out * of date correct data will still be returned. * * While this implementation ensures correct behavior it does have * have some drawbacks. The most obvious of which is that it * increases the required memory footprint when access mmap'ed * files. It also adds additional complexity to the code keeping * both caches synchronized. * * Longer term it may be possible to cleanly resolve this wart by * mapping page cache pages directly on to the ARC buffers. The * Linux address space operations are flexible enough to allow * selection of which pages back a particular index. The trick * would be working out the details of which subsystem is in * charge, the ARC, the page cache, or both. It may also prove * helpful to move the ARC buffers to a scatter-gather lists * rather than a vmalloc'ed region. */ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; znode_t *zp = ITOZ(ip); int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); if (error) return (error); error = generic_file_mmap(filp, vma); if (error) return (error); mutex_enter(&zp->z_lock); zp->z_is_mapped = B_TRUE; mutex_exit(&zp->z_lock); return (error); } /* * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). * * Current this function relies on zpl_read_common() and the O_DIRECT * flag to read in a page. This works but the more correct way is to * update zfs_fillpage() to be Linux friendly and use that interface. */ static int zpl_readpage(struct file *filp, struct page *pp) { struct inode *ip; struct page *pl[1]; int error = 0; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); ip = pp->mapping->host; pl[0] = pp; cookie = spl_fstrans_mark(); error = -zfs_getpage(ip, pl, 1); spl_fstrans_unmark(cookie); if (error) { SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); flush_dcache_page(pp); } unlock_page(pp); return (error); } /* * Populate a set of pages with data for the Linux page cache. This * function will only be called for read ahead and never for demand * paging. For simplicity, the code relies on read_cache_pages() to * correctly lock each page for IO and call zpl_readpage(). */ static int zpl_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { return (read_cache_pages(mapping, pages, (filler_t *)zpl_readpage, filp)); } int zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); (void) zfs_putpage(mapping->host, pp, wbc); spl_fstrans_unmark(cookie); return (0); } static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) { znode_t *zp = ITOZ(mapping->host); zfsvfs_t *zfsvfs = ITOZSB(mapping->host); enum writeback_sync_modes sync_mode; int result; ZFS_ENTER(zfsvfs); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; ZFS_EXIT(zfsvfs); sync_mode = wbc->sync_mode; /* * We don't want to run write_cache_pages() in SYNC mode here, because * that would make putpage() wait for a single page to be committed to * disk every single time, resulting in atrocious performance. Instead * we run it once in non-SYNC mode so that the ZIL gets all the data, * and then we commit it all in one go. */ wbc->sync_mode = WB_SYNC_NONE; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); if (sync_mode != wbc->sync_mode) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); /* * We need to call write_cache_pages() again (we can't just * return after the commit) because the previous call in * non-SYNC mode does not guarantee that we got all the dirty * pages (see the implementation of write_cache_pages() for * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); } return (result); } /* * Write out dirty pages to the ARC, this function is only required to * support mmap(2). Mapped pages may be dirtied by memory operations * which never call .write(). These dirty pages are kept in sync with * the ARC buffers via this hook. */ static int zpl_writepage(struct page *pp, struct writeback_control *wbc) { if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; return (zpl_putpage(pp, wbc, pp->mapping)); } /* * The only flag combination which matches the behavior of zfs_space() * is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE * flag was introduced in the 2.6.38 kernel. */ #if defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) long zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) { int error = -EOPNOTSUPP; #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) cred_t *cr = CRED(); flock64_t bf; loff_t olen; fstrans_cookie_t cookie; if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return (error); if (offset < 0 || len <= 0) return (-EINVAL); spl_inode_lock(ip); olen = i_size_read(ip); if (offset > olen) { spl_inode_unlock(ip); return (0); } if (offset + len > olen) len = olen - offset; bf.l_type = F_WRLCK; bf.l_whence = SEEK_SET; bf.l_start = offset; bf.l_len = len; bf.l_pid = 0; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr); spl_fstrans_unmark(cookie); spl_inode_unlock(ip); crfree(cr); #endif /* defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) */ ASSERT3S(error, <=, 0); return (error); } #endif /* defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) */ #ifdef HAVE_FILE_FALLOCATE static long zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) { return zpl_fallocate_common(file_inode(filp), mode, offset, len); } #endif /* HAVE_FILE_FALLOCATE */ #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) static uint32_t __zpl_ioctl_getflags(struct inode *ip) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; uint32_t ioctl_flags = 0; if (zfs_flags & ZFS_IMMUTABLE) ioctl_flags |= FS_IMMUTABLE_FL; if (zfs_flags & ZFS_APPENDONLY) ioctl_flags |= FS_APPEND_FL; if (zfs_flags & ZFS_NODUMP) ioctl_flags |= FS_NODUMP_FL; if (zfs_flags & ZFS_PROJINHERIT) ioctl_flags |= ZFS_PROJINHERIT_FL; return (ioctl_flags & ZFS_FL_USER_VISIBLE); } /* * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file * attributes common to both Linux and Solaris are mapped. */ static int zpl_ioctl_getflags(struct file *filp, void __user *arg) { uint32_t flags; int err; flags = __zpl_ioctl_getflags(file_inode(filp)); err = copy_to_user(arg, &flags, sizeof (flags)); return (err); } /* * fchange() is a helper macro to detect if we have been asked to change a * flag. This is ugly, but the requirement that we do this is a consequence of * how the Linux file attribute interface was designed. Another consequence is * that concurrent modification of files suffers from a TOCTOU race. Neither * are things we can fix without modifying the kernel-userland interface, which * is outside of our jurisdiction. */ #define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) static int __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; xoptattr_t *xoap; if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | ZFS_PROJINHERIT_FL)) return (-EOPNOTSUPP); if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) return (-EACCES); if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) return (-EACCES); if (!zpl_inode_owner_or_capable(ip)) return (-EACCES); xva_init(xva); xoap = xva_getxoptattr(xva); XVA_SET_REQ(xva, XAT_IMMUTABLE); if (ioctl_flags & FS_IMMUTABLE_FL) xoap->xoa_immutable = B_TRUE; XVA_SET_REQ(xva, XAT_APPENDONLY); if (ioctl_flags & FS_APPEND_FL) xoap->xoa_appendonly = B_TRUE; XVA_SET_REQ(xva, XAT_NODUMP); if (ioctl_flags & FS_NODUMP_FL) xoap->xoa_nodump = B_TRUE; XVA_SET_REQ(xva, XAT_PROJINHERIT); if (ioctl_flags & ZFS_PROJINHERIT_FL) xoap->xoa_projinherit = B_TRUE; return (0); } static int zpl_ioctl_setflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint32_t flags; cred_t *cr = CRED(); xvattr_t xva; int err; fstrans_cookie_t cookie; if (copy_from_user(&flags, arg, sizeof (flags))) return (-EFAULT); err = __zpl_ioctl_setflags(ip, flags, &xva); if (err) return (err); crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static int zpl_ioctl_getxattr(struct file *filp, void __user *arg) { zfsxattr_t fsx = { 0 }; struct inode *ip = file_inode(filp); int err; fsx.fsx_xflags = __zpl_ioctl_getflags(ip); fsx.fsx_projid = ITOZ(ip)->z_projid; err = copy_to_user(arg, &fsx, sizeof (fsx)); return (err); } static int zpl_ioctl_setxattr(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); zfsxattr_t fsx; cred_t *cr = CRED(); xvattr_t xva; xoptattr_t *xoap; int err; fstrans_cookie_t cookie; if (copy_from_user(&fsx, arg, sizeof (fsx))) return (-EFAULT); if (!zpl_is_valid_projid(fsx.fsx_projid)) return (-EINVAL); err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); if (err) return (err); xoap = xva_getxoptattr(&xva); XVA_SET_REQ(&xva, XAT_PROJID); xoap->xoa_projid = fsx.fsx_projid; crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static long zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC_GETFLAGS: return (zpl_ioctl_getflags(filp, (void *)arg)); case FS_IOC_SETFLAGS: return (zpl_ioctl_setflags(filp, (void *)arg)); case ZFS_IOC_FSGETXATTR: return (zpl_ioctl_getxattr(filp, (void *)arg)); case ZFS_IOC_FSSETXATTR: return (zpl_ioctl_setxattr(filp, (void *)arg)); default: return (-ENOTTY); } } #ifdef CONFIG_COMPAT static long zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; default: return (-ENOTTY); } return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg))); } #endif /* CONFIG_COMPAT */ const struct address_space_operations zpl_address_space_operations = { .readpages = zpl_readpages, .readpage = zpl_readpage, .writepage = zpl_writepage, .writepages = zpl_writepages, .direct_IO = zpl_direct_IO, }; const struct file_operations zpl_file_operations = { .open = zpl_open, .release = zpl_release, .llseek = zpl_llseek, #ifdef HAVE_VFS_RW_ITERATE #ifdef HAVE_NEW_SYNC_READ .read = new_sync_read, .write = new_sync_write, #endif .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #else .read = do_sync_read, .write = do_sync_write, .aio_read = zpl_aio_read, .aio_write = zpl_aio_write, #endif .mmap = zpl_mmap, .fsync = zpl_fsync, #ifdef HAVE_FILE_AIO_FSYNC .aio_fsync = zpl_aio_fsync, #endif #ifdef HAVE_FILE_FALLOCATE .fallocate = zpl_fallocate, #endif /* HAVE_FILE_FALLOCATE */ .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; const struct file_operations zpl_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, #if defined(HAVE_VFS_ITERATE_SHARED) .iterate_shared = zpl_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_iterate, #else .readdir = zpl_readdir, #endif .fsync = zpl_fsync, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c index 720330a8b7d4..3f3b2e2dc53c 100644 --- a/module/zfs/zpl_inode.c +++ b/module/zfs/zpl_inode.c @@ -1,827 +1,826 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ #include #include #include #include #include #include #include #include static struct dentry * #ifdef HAVE_LOOKUP_NAMEIDATA zpl_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) #else zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) #endif { cred_t *cr = CRED(); struct inode *ip; int error; fstrans_cookie_t cookie; pathname_t *ppn = NULL; pathname_t pn; int zfs_flags = 0; zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; if (dlen(dentry) >= ZAP_MAXNAMELEN) return (ERR_PTR(-ENAMETOOLONG)); crhold(cr); cookie = spl_fstrans_mark(); /* If we are a case insensitive fs, we need the real name */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { zfs_flags = FIGNORECASE; pn_alloc(&pn); ppn = &pn; } error = -zfs_lookup(dir, dname(dentry), &ip, zfs_flags, cr, NULL, ppn); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); crfree(cr); spin_lock(&dentry->d_lock); dentry->d_time = jiffies; #ifndef HAVE_S_D_OP d_set_d_op(dentry, &zpl_dentry_operations); #endif /* HAVE_S_D_OP */ spin_unlock(&dentry->d_lock); if (error) { /* * If we have a case sensitive fs, we do not want to * insert negative entries, so return NULL for ENOENT. * Fall through if the error is not ENOENT. Also free memory. */ if (ppn) { pn_free(ppn); if (error == -ENOENT) return (NULL); } if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } /* * If we are case insensitive, call the correct function * to install the name. */ if (ppn) { struct dentry *new_dentry; struct qstr ci_name; if (strcmp(dname(dentry), pn.pn_buf) == 0) { new_dentry = d_splice_alias(ip, dentry); } else { ci_name.name = pn.pn_buf; ci_name.len = strlen(pn.pn_buf); new_dentry = d_add_ci(dentry, ip, &ci_name); } pn_free(ppn); return (new_dentry); } else { return (d_splice_alias(ip, dentry)); } } void zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr) { vap->va_mask = ATTR_MODE; vap->va_mode = mode; vap->va_uid = crgetfsuid(cr); if (dir && dir->i_mode & S_ISGID) { vap->va_gid = KGID_TO_SGID(dir->i_gid); if (S_ISDIR(mode)) vap->va_mode |= S_ISGID; } else { vap->va_gid = crgetfsgid(cr); } } static int #ifdef HAVE_CREATE_NAMEIDATA zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, struct nameidata *nd) #else zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, bool flag) #endif { cred_t *cr = CRED(); struct inode *ip; vattr_t *vap; int error; fstrans_cookie_t cookie; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode, cr); cookie = spl_fstrans_mark(); error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); if (error == 0) { d_instantiate(dentry, ip); error = zpl_xattr_security_init(ip, dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ip, dir); if (error) (void) zfs_remove(dir, dname(dentry), cr, 0); } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, dev_t rdev) { cred_t *cr = CRED(); struct inode *ip; vattr_t *vap; int error; fstrans_cookie_t cookie; /* * We currently expect Linux to supply rdev=0 for all sockets * and fifos, but we want to know if this behavior ever changes. */ if (S_ISSOCK(mode) || S_ISFIFO(mode)) ASSERT(rdev == 0); crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode, cr); vap->va_rdev = rdev; cookie = spl_fstrans_mark(); error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); if (error == 0) { d_instantiate(dentry, ip); error = zpl_xattr_security_init(ip, dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ip, dir); if (error) (void) zfs_remove(dir, dname(dentry), cr, 0); } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_TMPFILE static int zpl_tmpfile(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) { cred_t *cr = CRED(); struct inode *ip; vattr_t *vap; int error; fstrans_cookie_t cookie; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode, cr); cookie = spl_fstrans_mark(); error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL); if (error == 0) { /* d_tmpfile will do drop_nlink, so we should set it first */ set_nlink(ip, 1); d_tmpfile(dentry, ip); error = zpl_xattr_security_init(ip, dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ip, dir); /* * don't need to handle error here, file is already in * unlinked set. */ } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #endif static int zpl_unlink(struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_remove(dir, dname(dentry), cr, 0); /* * For a CI FS we must invalidate the dentry to prevent the * creation of negative entries. */ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) d_invalidate(dentry); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) { cred_t *cr = CRED(); vattr_t *vap; struct inode *ip; int error; fstrans_cookie_t cookie; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode | S_IFDIR, cr); cookie = spl_fstrans_mark(); error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL); if (error == 0) { d_instantiate(dentry, ip); error = zpl_xattr_security_init(ip, dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ip, dir); if (error) (void) zfs_rmdir(dir, dname(dentry), NULL, cr, 0); } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_rmdir(struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_rmdir(dir, dname(dentry), NULL, cr, 0); /* * For a CI FS we must invalidate the dentry to prevent the * creation of negative entries. */ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) d_invalidate(dentry); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); /* * XXX request_mask and query_flags currently ignored. */ error = -zfs_getattr_fast(path->dentry->d_inode, stat); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } ZPL_GETATTR_WRAPPER(zpl_getattr); static int zpl_setattr(struct dentry *dentry, struct iattr *ia) { struct inode *ip = dentry->d_inode; cred_t *cr = CRED(); vattr_t *vap; int error; fstrans_cookie_t cookie; error = setattr_prepare(dentry, ia); if (error) return (error); crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; vap->va_mode = ia->ia_mode; vap->va_uid = KUID_TO_SUID(ia->ia_uid); vap->va_gid = KGID_TO_SGID(ia->ia_gid); vap->va_size = ia->ia_size; vap->va_atime = ia->ia_atime; vap->va_mtime = ia->ia_mtime; vap->va_ctime = ia->ia_ctime; if (vap->va_mask & ATTR_ATIME) { ip->i_atime = zpl_inode_timespec_trunc(ia->ia_atime, ip->i_sb->s_time_gran); } cookie = spl_fstrans_mark(); error = -zfs_setattr(ip, vap, 0, cr); if (!error && (ia->ia_valid & ATTR_MODE)) error = zpl_chmod_acl(ip); spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; /* We don't have renameat2(2) support */ if (flags) return (-EINVAL); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifndef HAVE_RENAME_WANTS_FLAGS static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) { return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0)); } #endif static int zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) { cred_t *cr = CRED(); vattr_t *vap; struct inode *ip; int error; fstrans_cookie_t cookie; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); cookie = spl_fstrans_mark(); error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0); if (error == 0) { d_instantiate(dentry, ip); error = zpl_xattr_security_init(ip, dir, &dentry->d_name); if (error) (void) zfs_remove(dir, dname(dentry), cr, 0); } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #if defined(HAVE_PUT_LINK_COOKIE) static void zpl_put_link(struct inode *unused, void *cookie) { kmem_free(cookie, MAXPATHLEN); } #elif defined(HAVE_PUT_LINK_NAMEIDATA) static void zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) { const char *link = nd_get_link(nd); if (!IS_ERR(link)) kmem_free(link, MAXPATHLEN); } #elif defined(HAVE_PUT_LINK_DELAYED) static void zpl_put_link(void *ptr) { kmem_free(ptr, MAXPATHLEN); } #endif static int zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) { fstrans_cookie_t cookie; cred_t *cr = CRED(); struct iovec iov; - uio_t uio; + uio_t uio = { { 0 }, 0 }; int error; crhold(cr); *link = NULL; iov.iov_len = MAXPATHLEN; iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); uio.uio_iov = &iov; uio.uio_iovcnt = 1; - uio.uio_skip = 0; - uio.uio_resid = (MAXPATHLEN - 1); uio.uio_segflg = UIO_SYSSPACE; + uio.uio_resid = (MAXPATHLEN - 1); cookie = spl_fstrans_mark(); error = -zfs_readlink(ip, &uio, cr); spl_fstrans_unmark(cookie); crfree(cr); if (error) kmem_free(iov.iov_base, MAXPATHLEN); else *link = iov.iov_base; return (error); } #if defined(HAVE_GET_LINK_DELAYED) const char * zpl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { char *link = NULL; int error; if (!dentry) return (ERR_PTR(-ECHILD)); error = zpl_get_link_common(dentry, inode, &link); if (error) return (ERR_PTR(error)); set_delayed_call(done, zpl_put_link, link); return (link); } #elif defined(HAVE_GET_LINK_COOKIE) const char * zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie) { char *link = NULL; int error; if (!dentry) return (ERR_PTR(-ECHILD)); error = zpl_get_link_common(dentry, inode, &link); if (error) return (ERR_PTR(error)); return (*cookie = link); } #elif defined(HAVE_FOLLOW_LINK_COOKIE) const char * zpl_follow_link(struct dentry *dentry, void **cookie) { char *link = NULL; int error; error = zpl_get_link_common(dentry, dentry->d_inode, &link); if (error) return (ERR_PTR(error)); return (*cookie = link); } #elif defined(HAVE_FOLLOW_LINK_NAMEIDATA) static void * zpl_follow_link(struct dentry *dentry, struct nameidata *nd) { char *link = NULL; int error; error = zpl_get_link_common(dentry, dentry->d_inode, &link); if (error) nd_set_link(nd, ERR_PTR(error)); else nd_set_link(nd, link); return (NULL); } #endif static int zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); struct inode *ip = old_dentry->d_inode; int error; fstrans_cookie_t cookie; if (ip->i_nlink >= ZFS_LINK_MAX) return (-EMLINK); crhold(cr); ip->i_ctime = current_time(ip); igrab(ip); /* Use ihold() if available */ cookie = spl_fstrans_mark(); error = -zfs_link(dir, ip, dname(dentry), cr, 0); if (error) { iput(ip); goto out; } d_instantiate(dentry, ip); out: spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_INODE_TRUNCATE_RANGE static void zpl_truncate_range(struct inode *ip, loff_t start, loff_t end) { cred_t *cr = CRED(); flock64_t bf; fstrans_cookie_t cookie; ASSERT3S(start, <=, end); /* * zfs_freesp() will interpret (len == 0) as meaning "truncate until * the end of the file". We don't want that. */ if (start == end) return; crhold(cr); bf.l_type = F_WRLCK; bf.l_whence = SEEK_SET; bf.l_start = start; bf.l_len = end - start; bf.l_pid = 0; cookie = spl_fstrans_mark(); zfs_space(ip, F_FREESP, &bf, FWRITE, start, cr); spl_fstrans_unmark(cookie); crfree(cr); } #endif /* HAVE_INODE_TRUNCATE_RANGE */ #ifdef HAVE_INODE_FALLOCATE static long zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len) { return (zpl_fallocate_common(ip, mode, offset, len)); } #endif /* HAVE_INODE_FALLOCATE */ static int #ifdef HAVE_D_REVALIDATE_NAMEIDATA zpl_revalidate(struct dentry *dentry, struct nameidata *nd) { unsigned int flags = (nd ? nd->flags : 0); #else zpl_revalidate(struct dentry *dentry, unsigned int flags) { #endif /* HAVE_D_REVALIDATE_NAMEIDATA */ /* CSTYLED */ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; int error; if (flags & LOOKUP_RCU) return (-ECHILD); /* * Automounted snapshots rely on periodic dentry revalidation * to defer snapshots from being automatically unmounted. */ if (zfsvfs->z_issnap) { if (time_after(jiffies, zfsvfs->z_snap_defer_time + MAX(zfs_expire_snapshot * HZ / 2, HZ))) { zfsvfs->z_snap_defer_time = jiffies; zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot); } } /* * After a rollback negative dentries created before the rollback * time must be invalidated. Otherwise they can obscure files which * are only present in the rolled back dataset. */ if (dentry->d_inode == NULL) { spin_lock(&dentry->d_lock); error = time_before(dentry->d_time, zfsvfs->z_rollback_time); spin_unlock(&dentry->d_lock); if (error) return (0); } /* * The dentry may reference a stale inode if a mounted file system * was rolled back to a point in time where the object didn't exist. */ if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale) return (0); return (1); } const struct inode_operations zpl_inode_operations = { .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, #ifdef HAVE_INODE_TRUNCATE_RANGE .truncate_range = zpl_truncate_range, #endif /* HAVE_INODE_TRUNCATE_RANGE */ #ifdef HAVE_INODE_FALLOCATE .fallocate = zpl_fallocate, #endif /* HAVE_INODE_FALLOCATE */ #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif #if defined(HAVE_GET_ACL) .get_acl = zpl_get_acl, #elif defined(HAVE_CHECK_ACL) .check_acl = zpl_check_acl, #elif defined(HAVE_PERMISSION) .permission = zpl_permission, #endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ #endif /* CONFIG_FS_POSIX_ACL */ }; const struct inode_operations zpl_dir_inode_operations = { .create = zpl_create, .lookup = zpl_lookup, .link = zpl_link, .unlink = zpl_unlink, .symlink = zpl_symlink, .mkdir = zpl_mkdir, .rmdir = zpl_rmdir, .mknod = zpl_mknod, #ifdef HAVE_RENAME_WANTS_FLAGS .rename = zpl_rename2, #else .rename = zpl_rename, #endif #ifdef HAVE_TMPFILE .tmpfile = zpl_tmpfile, #endif .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif #if defined(HAVE_GET_ACL) .get_acl = zpl_get_acl, #elif defined(HAVE_CHECK_ACL) .check_acl = zpl_check_acl, #elif defined(HAVE_PERMISSION) .permission = zpl_permission, #endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ #endif /* CONFIG_FS_POSIX_ACL */ }; const struct inode_operations zpl_symlink_inode_operations = { #ifdef HAVE_GENERIC_READLINK .readlink = generic_readlink, #endif #if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE) .get_link = zpl_get_link, #elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA) .follow_link = zpl_follow_link, #endif #if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA) .put_link = zpl_put_link, #endif .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, }; const struct inode_operations zpl_special_inode_operations = { .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif #if defined(HAVE_GET_ACL) .get_acl = zpl_get_acl, #elif defined(HAVE_CHECK_ACL) .check_acl = zpl_check_acl, #elif defined(HAVE_PERMISSION) .permission = zpl_permission, #endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ #endif /* CONFIG_FS_POSIX_ACL */ }; dentry_operations_t zpl_dentry_operations = { .d_revalidate = zpl_revalidate, }; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index a77339d7f4c4..c29f65f676b9 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1,2767 +1,2767 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev// * * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ /* * Note on locking of zvol state structures. * * These structures are used to maintain internal state used to emulate block * devices on top of zvols. In particular, management of device minor number * operations - create, remove, rename, and set_snapdev - involves access to * these structures. The zvol_state_lock is primarily used to protect the * zvol_state_list. The zv->zv_state_lock is used to protect the contents * of the zvol_state_t structures, as well as to make sure that when the * time comes to remove the structure from the list, it is not in use, and * therefore, it can be taken off zvol_state_list and freed. * * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, * e.g. for the duration of receive and rollback operations. This lock can be * held for significant periods of time. Given that it is undesirable to hold * mutexes for long periods of time, the following lock ordering applies: * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t * * The minor operations are issued to spa->spa_zvol_taskq queues, that are * single-threaded (to preserve order of minor operations), and are executed * through the zvol_task_cb that dispatches the specific operations. Therefore, * these operations are serialized per pool. Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is * allocated and placed on zvol_state_list, and then other minor operations * for this zvol are going to proceed in the order of issue. * * It is also worth keeping in mind that once add_disk() is called, the zvol is * announced to the world, and zvol_open()/zvol_release() can be called at any * time. Incidentally, add_disk() itself calls zvol_open()->zvol_first_open() * and zvol_release()->zvol_last_close() directly as well. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_major = ZVOL_MAJOR; unsigned int zvol_threads = 32; unsigned int zvol_request_sync = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned long zvol_max_discard_blocks = 16384; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; static taskq_t *zvol_taskq; static krwlock_t zvol_state_lock; static list_t zvol_state_list; #define ZVOL_HT_SIZE 1024 static struct hlist_head *zvol_htable; #define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)]) static struct ida zvol_ida; /* * The in-core state of each volume. */ struct zvol_state { char zv_name[MAXNAMELEN]; /* name */ uint64_t zv_volsize; /* advertised space */ uint64_t zv_volblocksize; /* volume block size */ objset_t *zv_objset; /* objset handle */ uint32_t zv_flags; /* ZVOL_* flags */ uint32_t zv_open_count; /* open counts */ uint32_t zv_changed; /* disk changed */ zilog_t *zv_zilog; /* ZIL handle */ rangelock_t zv_rangelock; /* for range locking */ dnode_t *zv_dn; /* dnode hold */ dev_t zv_dev; /* device id */ struct gendisk *zv_disk; /* generic disk */ struct request_queue *zv_queue; /* request queue */ dataset_kstats_t zv_kstat; /* zvol kstats */ list_node_t zv_next; /* next zvol_state_t linkage */ uint64_t zv_hash; /* name hash */ struct hlist_node zv_hlink; /* hash link */ kmutex_t zv_state_lock; /* protects zvol_state_t */ atomic_t zv_suspend_ref; /* refcount for suspend */ krwlock_t zv_suspend_lock; /* suspend lock */ }; typedef enum { ZVOL_ASYNC_CREATE_MINORS, ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, ZVOL_ASYNC_SET_VOLMODE, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t op; char pool[MAXNAMELEN]; char name1[MAXNAMELEN]; char name2[MAXNAMELEN]; zprop_source_t source; uint64_t value; } zvol_task_t; #define ZVOL_RDONLY 0x1 /* * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which * specifies whether or not the zvol _can_ be written to) */ #define ZVOL_WRITTEN_TO 0x2 static uint64_t zvol_name_hash(const char *name) { int i; uint64_t crc = -1ULL; uint8_t *p = (uint8_t *)name; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; } return (crc); } /* * Find a zvol_state_t given the full major+minor dev_t. If found, * return with zv_state_lock taken, otherwise, return (NULL) without * taking zv_state_lock. */ static zvol_state_t * zvol_find_by_dev(dev_t dev) { zvol_state_t *zv; rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = list_next(&zvol_state_list, zv)) { mutex_enter(&zv->zv_state_lock); if (zv->zv_dev == dev) { rw_exit(&zvol_state_lock); return (zv); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ static zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; struct hlist_node *p = NULL; rw_enter(&zvol_state_lock, RW_READER); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); if (zv->zv_hash == hash && strncmp(zv->zv_name, name, MAXNAMELEN) == 0) { /* * this is the right zvol, take the locks in the * right order */ if (mode != RW_NONE && !rw_tryenter(&zv->zv_suspend_lock, mode)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, mode); mutex_enter(&zv->zv_state_lock); /* * zvol cannot be renamed as we continue * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && strncmp(zv->zv_name, name, MAXNAMELEN) == 0); } rw_exit(&zvol_state_lock); return (zv); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ static zvol_state_t * zvol_find_by_name(const char *name, int mode) { return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } /* * Given a path, return TRUE if path is a ZVOL. */ boolean_t zvol_is_zvol(const char *device) { struct block_device *bdev; unsigned int major; bdev = vdev_lookup_bdev(device); if (IS_ERR(bdev)) return (B_FALSE); major = MAJOR(bdev->bd_dev); bdput(bdev); if (major == zvol_major) return (B_TRUE); return (B_FALSE); } /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. */ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. */ VERIFY(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT(error == 0); } /* * ZFS_IOC_OBJSET_STATS entry point. */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (SET_ERROR(error)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi->doi_data_block_size); } kmem_free(doi, sizeof (dmu_object_info_t)); return (SET_ERROR(error)); } /* * Sanity check volume size. */ int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } /* * Ensure the zap is flushed then inform the VFS of the capacity change. */ static int zvol_update_volsize(uint64_t volsize, objset_t *os) { dmu_tx_t *tx; int error; uint64_t txg; tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (SET_ERROR(error)); } txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } /* * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume * size will result in a udev "change" event being generated. */ int zvol_set_volsize(const char *name, uint64_t volsize) { objset_t *os = NULL; struct gendisk *disk = NULL; uint64_t readonly; int error; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (SET_ERROR(error)); if (readonly) return (SET_ERROR(EROFS)); zvol_state_t *zv = zvol_find_by_name(name, RW_READER); ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && RW_READ_HELD(&zv->zv_suspend_lock))); if (zv == NULL || zv->zv_objset == NULL) { if (zv != NULL) rw_exit(&zv->zv_suspend_lock); if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, FTAG, &os)) != 0) { if (zv != NULL) mutex_exit(&zv->zv_state_lock); return (SET_ERROR(error)); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { os = zv->zv_objset; } dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP); if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) goto out; error = zvol_update_volsize(volsize, os); if (error == 0 && zv != NULL) { zv->zv_volsize = volsize; zv->zv_changed = 1; disk = zv->zv_disk; } out: kmem_free(doi, sizeof (dmu_object_info_t)); if (owned) { dmu_objset_disown(os, B_TRUE, FTAG); if (zv != NULL) zv->zv_objset = NULL; } else { rw_exit(&zv->zv_suspend_lock); } if (zv != NULL) mutex_exit(&zv->zv_state_lock); if (disk != NULL) revalidate_disk(disk); return (SET_ERROR(error)); } /* * Sanity check volume block size. */ int zvol_check_volblocksize(const char *name, uint64_t volblocksize) { /* Record sizes above 128k need the feature to be enabled */ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (volblocksize > zfs_max_recordsize) return (SET_ERROR(EDOM)); spa_close(spa, FTAG); } if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } /* * Set ZFS_PROP_VOLBLOCKSIZE set entry point. */ int zvol_set_volblocksize(const char *name, uint64_t volblocksize) { zvol_state_t *zv; dmu_tx_t *tx; int error; zv = zvol_find_by_name(name, RW_READER); if (zv == NULL) return (SET_ERROR(ENXIO)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_RDONLY) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(EROFS)); } tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, volblocksize, 0, tx); if (error == ENOTSUP) error = SET_ERROR(EBUSY); dmu_tx_commit(tx); if (error == 0) zv->zv_volblocksize = volblocksize; } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(error)); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. */ static int zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_truncate_t *lr = arg2; uint64_t offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_write_t *lr = arg2; objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t offset, length; dmu_tx_t *tx; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); dmu_tx_commit(tx); } return (error); } static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ zvol_replay_err, /* TX_MKXATTR */ zvol_replay_err, /* TX_SYMLINK */ zvol_replay_err, /* TX_REMOVE */ zvol_replay_err, /* TX_RMDIR */ zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ }; /* * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. * * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ ssize_t zvol_immediate_write_sz = 32768; static void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, int sync) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; if (zil_replaying(zilog, tx)) return; if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && size >= blocksize && blocksize > zvol_immediate_write_sz) write_state = WR_INDIRECT; else if (sync) write_state = WR_COPIED; else write_state = WR_NEED_COPY; while (size) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = size; if (wr_state == WR_COPIED && size > ZIL_MAX_COPIED_DATA) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(offset, blocksize), size); itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } itx->itx_wr_state = wr_state; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; itx->itx_sync = sync; (void) zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } } typedef struct zv_request { zvol_state_t *zv; struct bio *bio; locked_range_t *lr; } zv_request_t; static void uio_from_bio(uio_t *uio, struct bio *bio) { uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; - uio->uio_skip = BIO_BI_SKIP(bio); - uio->uio_resid = BIO_BI_SIZE(bio); uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; - uio->uio_limit = MAXOFFSET_T; uio->uio_segflg = UIO_BVEC; + uio->uio_limit = MAXOFFSET_T; + uio->uio_resid = BIO_BI_SIZE(bio); + uio->uio_skip = BIO_BI_SKIP(bio); } static void zvol_write(void *arg) { int error = 0; zv_request_t *zvr = arg; struct bio *bio = zvr->bio; - uio_t uio; + uio_t uio = { { 0 }, 0 }; uio_from_bio(&uio, bio); zvol_state_t *zv = zvr->zv; ASSERT(zv && zv->zv_open_count > 0); ASSERT(zv->zv_zilog != NULL); ssize_t start_resid = uio.uio_resid; unsigned long start_jif = jiffies; blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio), &zv->zv_disk->part0); boolean_t sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio.uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); if (error == 0) { zvol_log_write(zv, tx, off, bytes, sync); } dmu_tx_commit(tx); if (error) break; } rangelock_exit(zvr->lr); int64_t nwritten = start_resid - uio.uio_resid; dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); task_io_account_write(nwritten); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0, start_jif); BIO_END_IO(bio, -error); kmem_free(zvr, sizeof (zv_request_t)); } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, boolean_t sync) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; itx->itx_sync = sync; zil_itx_assign(zilog, itx, tx); } static void zvol_discard(void *arg) { zv_request_t *zvr = arg; struct bio *bio = zvr->bio; zvol_state_t *zv = zvr->zv; uint64_t start = BIO_BI_SECTOR(bio) << 9; uint64_t size = BIO_BI_SIZE(bio); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; unsigned long start_jif; ASSERT(zv && zv->zv_open_count > 0); ASSERT(zv->zv_zilog != NULL); start_jif = jiffies; blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio), &zv->zv_disk->part0); sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); goto unlock; } /* * Align the request to volume block boundaries when a secure erase is * not required. This will prevent dnode_free_range() from zeroing out * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. */ if (!bio_is_secure_erase(bio)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize); size = end - start; } if (start >= end) goto unlock; tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, size, B_TRUE); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } unlock: rangelock_exit(zvr->lr); if (error == 0 && sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0, start_jif); BIO_END_IO(bio, -error); kmem_free(zvr, sizeof (zv_request_t)); } static void zvol_read(void *arg) { int error = 0; zv_request_t *zvr = arg; struct bio *bio = zvr->bio; - uio_t uio; + uio_t uio = { { 0 }, 0 }; uio_from_bio(&uio, bio); zvol_state_t *zv = zvr->zv; ASSERT(zv && zv->zv_open_count > 0); ssize_t start_resid = uio.uio_resid; unsigned long start_jif = jiffies; blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio), &zv->zv_disk->part0); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } rangelock_exit(zvr->lr); int64_t nread = start_resid - uio.uio_resid; dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); blk_generic_end_io_acct(zv->zv_queue, READ, &zv->zv_disk->part0, start_jif); BIO_END_IO(bio, -error); kmem_free(zvr, sizeof (zv_request_t)); } /* ARGSUSED */ static void zvol_get_done(zgd_t *zgd, int error) { if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. */ static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change * the data. Contrarily to zfs_get_data we need not re-check * blocksize after we get the lock because it cannot be changed. */ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (SET_ERROR(error)); } static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) { zvol_state_t *zv = q->queuedata; fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = BIO_BI_SECTOR(bio) << 9; uint64_t size = BIO_BI_SIZE(bio); int rw = bio_data_dir(bio); zv_request_t *zvr; if (bio_has_data(bio) && offset + size > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_disk->disk_name, (long long unsigned)offset, (long unsigned)size); BIO_END_IO(bio, -SET_ERROR(EIO)); goto out; } if (rw == WRITE) { boolean_t need_sync = B_FALSE; if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { BIO_END_IO(bio, -SET_ERROR(EROFS)); goto out; } /* * To be released in the I/O function. See the comment on * rangelock_enter() below. */ rw_enter(&zv->zv_suspend_lock, RW_READER); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data); zv->zv_flags |= ZVOL_WRITTEN_TO; } rw_downgrade(&zv->zv_suspend_lock); } /* bio marked as FLUSH need to flush before write */ if (bio_is_flush(bio)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ if (size == 0) { rw_exit(&zv->zv_suspend_lock); BIO_END_IO(bio, 0); goto out; } zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); zvr->zv = zv; zvr->bio = bio; /* * To be released in the I/O function. Since the I/O functions * are asynchronous, we take it here synchronously to make * sure overlapped I/Os are properly ordered. */ zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_WRITER); /* * Sync writes and discards execute zil_commit() which may need * to take a RL_READER lock on the whole block being modified * via its zillog->zl_get_data(): to avoid circular dependency * issues with taskq threads execute these requests * synchronously here in zvol_request(). */ need_sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { if (zvol_request_sync || need_sync || taskq_dispatch(zvol_taskq, zvol_discard, zvr, TQ_SLEEP) == TASKQID_INVALID) zvol_discard(zvr); } else { if (zvol_request_sync || need_sync || taskq_dispatch(zvol_taskq, zvol_write, zvr, TQ_SLEEP) == TASKQID_INVALID) zvol_write(zvr); } } else { /* * The SCST driver, and possibly others, may issue READ I/Os * with a length of zero bytes. These empty I/Os contain no * data and require no additional handling. */ if (size == 0) { BIO_END_IO(bio, 0); goto out; } zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); zvr->zv = zv; zvr->bio = bio; rw_enter(&zv->zv_suspend_lock, RW_READER); zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); if (zvol_request_sync || taskq_dispatch(zvol_taskq, zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID) zvol_read(zvr); } out: spl_fstrans_unmark(cookie); #ifdef HAVE_MAKE_REQUEST_FN_RET_INT return (0); #elif defined(HAVE_MAKE_REQUEST_FN_RET_QC) return (BLK_QC_T_NONE); #endif } /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ static void zvol_insert(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. */ static void zvol_remove(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_remove(&zvol_state_list, zv); hlist_del(&zv->zv_hlink); } /* * Setup zv after we just own the zv->objset */ static int zvol_setup_zv(zvol_state_t *zv) { uint64_t volsize; int error; uint64_t ro; objset_t *os = zv->zv_objset; ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock)); zv->zv_zilog = NULL; zv->zv_flags &= ~ZVOL_WRITTEN_TO; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (SET_ERROR(error)); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) return (SET_ERROR(error)); error = dnode_hold(os, ZVOL_OBJ, FTAG, &zv->zv_dn); if (error) return (SET_ERROR(error)); set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { set_disk_ro(zv->zv_disk, 1); zv->zv_flags |= ZVOL_RDONLY; } else { set_disk_ro(zv->zv_disk, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); } /* * Shutdown every zv_objset related stuff except zv_objset itself. * The is the reverse of zvol_setup_zv. */ static void zvol_shutdown_zv(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock) && RW_LOCK_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_WRITTEN_TO) { ASSERT(zv->zv_zilog != NULL); zil_close(zv->zv_zilog); } zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, FTAG); zv->zv_dn = NULL; /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); } /* * return the proper tag for rollback and recv */ void * zvol_tag(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); return (zv->zv_open_count > 0 ? zv : NULL); } /* * Suspend the zvol for recv and rollback. */ zvol_state_t * zvol_suspend(const char *name) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) return (NULL); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); /* * do not hold zv_state_lock across suspend/resume to * avoid locking up zvol lookups */ mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ return (zv); } int zvol_resume(zvol_state_t *zv) { int error = 0; ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); mutex_enter(&zv->zv_state_lock); if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); dmu_objset_rele(zv->zv_objset, zv); error = zvol_setup_zv(zv); } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing * zv_suspend_lock. zvol_remove_minors_impl thus cannot check * zv_suspend_lock to determine it is safe to free because rwlock is * not inherent atomic. */ atomic_dec(&zv->zv_suspend_ref); return (SET_ERROR(error)); } static int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; int error, locked = 0; boolean_t ro; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* * In all other cases the spa_namespace_lock is taken before the * bdev->bd_mutex lock. But in this case the Linux __blkdev_get() * function calls fops->open() with the bdev->bd_mutex lock held. * This deadlock can be easily observed with zvols used as vdevs. * * To avoid a potential lock inversion deadlock we preemptively * try to take the spa_namespace_lock(). Normally it will not * be contended and this is safe because spa_open_common() handles * the case where the caller already holds the spa_namespace_lock. * * When it is contended we risk a lock inversion if we were to * block waiting for the lock. Luckily, the __blkdev_get() * function allows us to return -ERESTARTSYS which will result in * bdev->bd_mutex being dropped, reacquired, and fops->open() being * called again. This process can be repeated safely until both * locks are acquired. */ if (!mutex_owned(&spa_namespace_lock)) { locked = mutex_tryenter(&spa_namespace_lock); if (!locked) return (-SET_ERROR(ERESTARTSYS)); } ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) goto out_mutex; zv->zv_objset = os; error = zvol_setup_zv(zv); if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } out_mutex: if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(-error)); } static void zvol_last_close(zvol_state_t *zv) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); zv->zv_objset = NULL; } static int zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting * bdev->bd_disk->private_data to NULL is observed, or zvol_free() * is not called on this zv because of the positive zv_open_count. */ zv = bdev->bd_disk->private_data; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); } mutex_enter(&zv->zv_state_lock); /* * make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock)); if (zv->zv_open_count == 0) { error = zvol_first_open(zv, !(flag & FMODE_WRITE)); if (error) goto out_mutex; } if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { error = -EROFS; goto out_open_count; } zv->zv_open_count++; mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); check_disk_change(bdev); return (0); out_open_count: if (zv->zv_open_count == 0) zvol_last_close(zv); out_mutex: mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); if (error == -ERESTARTSYS) schedule(); return (SET_ERROR(error)); } #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID static void #else static int #endif zvol_release(struct gendisk *disk, fmode_t mode) { zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); zv = disk->private_data; mutex_enter(&zv->zv_state_lock); ASSERT(zv->zv_open_count > 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock)); zv->zv_open_count--; if (zv->zv_open_count == 0) zvol_last_close(zv); mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); #ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID return (0); #endif } static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { case BLKFLSBUF: fsync_bdev(bdev); invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); if (!(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); rw_exit(&zv->zv_suspend_lock); break; case BLKZNAME: mutex_enter(&zv->zv_state_lock); error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); mutex_exit(&zv->zv_state_lock); break; default: error = -ENOTTY; break; } return (SET_ERROR(error)); } #ifdef CONFIG_COMPAT static int zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { return (zvol_ioctl(bdev, mode, cmd, arg)); } #else #define zvol_compat_ioctl NULL #endif /* * Linux 2.6.38 preferred interface. */ #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS static unsigned int zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; zv->zv_changed = 0; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (mask); } #else static int zvol_media_changed(struct gendisk *disk) { int changed = 0; rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); changed = zv->zv_changed; zv->zv_changed = 0; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (changed); } #endif static int zvol_revalidate_disk(struct gendisk *disk) { rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_disk, zv->zv_volsize >> SECTOR_BITS); mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (0); } /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_disk); if (sectors > 2048) { geo->heads = 16; geo->sectors = 63; } else { geo->heads = 2; geo->sectors = 4; } geo->start = 0; geo->cylinders = sectors / (geo->heads * geo->sectors); return (0); } static struct kobject * zvol_probe(dev_t dev, int *part, void *arg) { zvol_state_t *zv; struct kobject *kobj; zv = zvol_find_by_dev(dev); kobj = zv ? get_disk_and_module(zv->zv_disk) : NULL; ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock)); if (zv) mutex_exit(&zv->zv_state_lock); return (kobj); } static struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS .check_events = zvol_check_events, #else .media_changed = zvol_media_changed, #endif .revalidate_disk = zvol_revalidate_disk, .getgeo = zvol_getgeo, .owner = THIS_MODULE, }; /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ static zvol_state_t * zvol_alloc(dev_t dev, const char *name) { zvol_state_t *zv; uint64_t volmode; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) return (NULL); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); zv->zv_queue = blk_alloc_queue(GFP_ATOMIC); if (zv->zv_queue == NULL) goto out_kmem; blk_queue_make_request(zv->zv_queue, zvol_request); blk_queue_set_write_cache(zv->zv_queue, B_TRUE, B_TRUE); /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zv->zv_queue, 1); /* Disable write merging in favor of the ZIO pipeline. */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zv->zv_queue); zv->zv_disk = alloc_disk(ZVOL_MINORS); if (zv->zv_disk == NULL) goto out_queue; zv->zv_queue->queuedata = zv; zv->zv_dev = dev; zv->zv_open_count = 0; strlcpy(zv->zv_name, name, MAXNAMELEN); rangelock_init(&zv->zv_rangelock, NULL, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zv->zv_disk->major = zvol_major; #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS zv->zv_disk->events = DISK_EVENT_MEDIA_CHANGE; #endif if (volmode == ZFS_VOLMODE_DEV) { /* * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set * gendisk->minors = 1 as noted in include/linux/genhd.h. * Also disable extended partition numbers (GENHD_FL_EXT_DEVT) * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN) * setting gendisk->flags accordingly. */ zv->zv_disk->minors = 1; #if defined(GENHD_FL_EXT_DEVT) zv->zv_disk->flags &= ~GENHD_FL_EXT_DEVT; #endif #if defined(GENHD_FL_NO_PART_SCAN) zv->zv_disk->flags |= GENHD_FL_NO_PART_SCAN; #endif } zv->zv_disk->first_minor = (dev & MINORMASK); zv->zv_disk->fops = &zvol_ops; zv->zv_disk->private_data = zv; zv->zv_disk->queue = zv->zv_queue; snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); return (zv); out_queue: blk_cleanup_queue(zv->zv_queue); out_kmem: kmem_free(zv, sizeof (zvol_state_t)); return (NULL); } /* * Cleanup then free a zvol_state_t which was created by zvol_alloc(). * At this time, the structure is not opened by anyone, is taken off * the zvol_state_list, and has its private data set to NULL. * The zvol_state_lock is dropped. */ static void zvol_free(void *arg) { zvol_state_t *zv = arg; ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT(zv->zv_open_count == 0); ASSERT(zv->zv_disk->private_data == NULL); rw_destroy(&zv->zv_suspend_lock); rangelock_fini(&zv->zv_rangelock); del_gendisk(zv->zv_disk); blk_cleanup_queue(zv->zv_queue); put_disk(zv->zv_disk); ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv, sizeof (zvol_state_t)); } /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. */ static int zvol_create_minor_impl(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; int idx; uint64_t hash = zvol_name_hash(name); if (zvol_inhibit_dev) return (0); idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; set_capacity(zv->zv_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9); blk_queue_max_segments(zv->zv_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_queue, UINT_MAX); blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize); blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_queue); #ifdef QUEUE_FLAG_NONROT blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue); #endif if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) zil_destroy(dmu_objset_zil(os), B_FALSE); else zil_replay(os, zv, zvol_replay_vector); } ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); if (error == 0) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); add_disk(zv->zv_disk); } else { ida_simple_remove(&zvol_ida, idx); } return (SET_ERROR(error)); } /* * Rename a block device minor mode for the specified volume. */ static void zvol_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_disk); ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing * a KOBJ_CHANGE uevent to be issued. This ensures udev detects * the name change and fixes the symlinks. This does not change * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never * changes. This would normally be done using kobject_uevent() but * that is a GPL-only symbol which is why we need this workaround. */ set_disk_ro(zv->zv_disk, !readonly); set_disk_ro(zv->zv_disk, readonly); } typedef struct minors_job { list_t *list; list_node_t link; /* input */ char *name; /* output */ int error; } minors_job_t; /* * Prefetch zvol dnodes for the minors_job */ static void zvol_prefetch_minors_impl(void *arg) { minors_job_t *job = arg; char *dsname = job->name; objset_t *os = NULL; job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { minors_job_t *j = arg; list_t *minors_list = j->list; const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) return (0); /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " "%s is not a shapshot name\n", dsname); } else { minors_job_t *job; char *n = strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); } return (0); } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) return (0); /* * Given the name and the 'snapdev' property, create device minor nodes * with the linkages to zvols/snapshots as needed. * If the name represents a zvol, create a minor node for the zvol, then * check if its snapshots are 'visible', and if so, iterate over the * snapshots and create device minor nodes for those. */ if (strchr(dsname, '@') == 0) { minors_job_t *job; char *n = strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ error = dmu_objset_find((char *)dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", dsname); } return (0); } /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots * only if they are 'visible'. This approach allows one to assure that the * snapshot metadata is read from disk only if it is needed. * * The name can represent a dataset to be recursively scanned for zvols and * their snapshots, or a single zvol snapshot. If the name represents a * dataset, the scan is performed in two nested stages: * - scan the dataset for zvols, and * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ static int zvol_create_minors_impl(const char *name) { int error = 0; fstrans_cookie_t cookie; char *atp, *parent; list_t minors_list; minors_job_t *job; if (zvol_inhibit_dev) return (0); /* * This is the list for prefetch jobs. Whenever we found a match * during dmu_objset_find, we insert a minors_job to the list and do * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * * We will use this list to do zvol_create_minor_impl after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) strlcpy(parent, name, MAXPATHLEN); if ((atp = strrchr(parent, '@')) != NULL) { uint64_t snapdev; *atp = '\0'; error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) error = zvol_create_minor_impl(name); } else { cookie = spl_fstrans_mark(); error = dmu_objset_find(parent, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } kmem_free(parent, MAXPATHLEN); taskq_wait_outstanding(system_taskq, 0); /* * Prefetch is completed, we can do zvol_create_minor_impl * sequentially. */ while ((job = list_head(&minors_list)) != NULL) { list_remove(&minors_list, job); if (!job->error) zvol_create_minor_impl(job->name); strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); return (SET_ERROR(error)); } /* * Remove minors for specified dataset including children and snapshots. */ static void zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); taskqid_t t, tid = TASKQID_INVALID; list_t free_list; if (zvol_inhibit_dev) return; list_create(&free_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (name == NULL || strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zv->zv_disk->private_data = NULL; /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ t = taskq_dispatch(system_taskq, zvol_free, zv, TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); else tid = t; } else { mutex_exit(&zv->zv_state_lock); } } rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ while ((zv = list_head(&free_list)) != NULL) { list_remove(&free_list, zv); zvol_free(zv); } if (tid != TASKQID_INVALID) taskq_wait_outstanding(system_taskq, tid); } /* Remove minor for this specific volume only */ static void zvol_remove_minor_impl(const char *name) { zvol_state_t *zv = NULL, *zv_next; if (zvol_inhibit_dev) return; rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, name) == 0) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zv->zv_disk->private_data = NULL; mutex_exit(&zv->zv_state_lock); break; } else { mutex_exit(&zv->zv_state_lock); } } /* Drop zvol_state_lock before calling zvol_free() */ rw_exit(&zvol_state_lock); if (zv != NULL) zvol_free(zv); } /* * Rename minors for specified dataset including children and snapshots. */ static void zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; int oldnamelen, newnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); newnamelen = strlen(newname); rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { zvol_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); zvol_rename_minor(zv, name); strfree(name); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); } typedef struct zvol_snapdev_cb_arg { uint64_t snapdev; } zvol_snapdev_cb_arg_t; static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: (void) zvol_create_minor_impl(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); break; } return (0); } static void zvol_set_snapdev_impl(char *name, uint64_t snapdev) { zvol_snapdev_cb_arg_t arg = {snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately * in the dataset hierarchy. Here, we only scan snapshots. */ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); spl_fstrans_unmark(cookie); } typedef struct zvol_volmode_cb_arg { uint64_t volmode; } zvol_volmode_cb_arg_t; static void zvol_set_volmode_impl(char *name, uint64_t volmode) { fstrans_cookie_t cookie = spl_fstrans_mark(); if (strchr(name, '@') != NULL) return; /* * It's unfortunate we need to remove minors before we create new ones: * this is necessary because our backing gendisk (zvol_state->zv_disk) * coule be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). * A possible optimization is to modify our consumers so we don't get * called when "volmode" does not change. */ switch (volmode) { case ZFS_VOLMODE_NONE: (void) zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: (void) zvol_remove_minor_impl(name); (void) zvol_create_minor_impl(name); break; case ZFS_VOLMODE_DEFAULT: (void) zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ (void) zvol_create_minor_impl(name); break; } spl_fstrans_unmark(cookie); } static zvol_task_t * zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, uint64_t value) { zvol_task_t *task; char *delim; /* Never allow tasks on hidden names. */ if (name1[0] == '$') return (NULL); task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; task->value = value; delim = strchr(name1, '/'); strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); strlcpy(task->name1, name1, MAXNAMELEN); if (name2 != NULL) strlcpy(task->name2, name2, MAXNAMELEN); return (task); } static void zvol_task_free(zvol_task_t *task) { kmem_free(task, sizeof (zvol_task_t)); } /* * The worker thread function performed asynchronously. */ static void zvol_task_cb(void *param) { zvol_task_t *task = (zvol_task_t *)param; switch (task->op) { case ZVOL_ASYNC_CREATE_MINORS: (void) zvol_create_minors_impl(task->name1); break; case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task->name1); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task->name1, task->name2); break; case ZVOL_ASYNC_SET_SNAPDEV: zvol_set_snapdev_impl(task->name1, task->value); break; case ZVOL_ASYNC_SET_VOLMODE: zvol_set_volmode_impl(task->name1, task->value); break; default: VERIFY(0); break; } zvol_task_free(task); } typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; dmu_tx_t *zsda_tx; } zvol_set_prop_int_arg_t; /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } /* ARGSUSED */ static int zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { char dsname[MAXNAMELEN]; zvol_task_t *task; uint64_t snapdev; dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0) return (0); task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev); if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply snapdev appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "snapdev" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, zsda->zsda_tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = snapdev; return (dsl_sync_task(ddname, zvol_set_snapdev_check, zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_volmode_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } /* ARGSUSED */ static int zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { char dsname[MAXNAMELEN]; zvol_task_t *task; uint64_t volmode; dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0) return (0); task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode); if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply volmode appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "volmode" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void zvol_set_volmode_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, zsda->zsda_tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = volmode; return (dsl_sync_task(ddname, zvol_set_volmode_check, zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void zvol_create_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } int zvol_init(void) { int threads = MIN(MAX(zvol_threads, 1), 1024); int i, error; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); ida_init(&zvol_ida); zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (zvol_taskq == NULL) { printk(KERN_INFO "ZFS: taskq_create() failed\n"); error = -ENOMEM; goto out; } zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); if (!zvol_htable) { error = -ENOMEM; goto out_taskq; } for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); goto out_free; } blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, THIS_MODULE, zvol_probe, NULL, NULL); return (0); out_free: kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); out_taskq: taskq_destroy(zvol_taskq); out: ida_destroy(&zvol_ida); rw_destroy(&zvol_state_lock); list_destroy(&zvol_state_list); return (SET_ERROR(error)); } void zvol_fini(void) { zvol_remove_minors_impl(NULL); blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); taskq_destroy(zvol_taskq); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); ida_destroy(&zvol_ida); } /* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); /* END CSTYLED */