Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -1394,6 +1394,7 @@
 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
 	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
 	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
+	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
Index: sys/fs/nfs/nfsport.h
===================================================================
--- sys/fs/nfs/nfsport.h
+++ sys/fs/nfs/nfsport.h
@@ -878,6 +878,7 @@
 int nfscl_loadattrcache(struct vnode **, struct nfsvattr *, void *, void *,
     int, int);
 int newnfs_realign(struct mbuf **, int);
+bool ncl_pager_setsize(struct vnode *vp, u_quad_t *nsizep);
 
 /*
  * If the port runs on an SMP box that can enforce Atomic ops with low
Index: sys/fs/nfsclient/nfs_clnode.c
===================================================================
--- sys/fs/nfsclient/nfs_clnode.c
+++ sys/fs/nfsclient/nfs_clnode.c
@@ -162,6 +162,8 @@
 		vp->v_type = VDIR;
 		vp->v_vflag |= VV_ROOT;
 	}
+
+	vp->v_vflag |= VV_VMSIZEVNLOCK;
 	np->n_fhp = malloc(sizeof (struct nfsfh) + fhsize,
 	    M_NFSFH, M_WAITOK);
Index: sys/fs/nfsclient/nfs_clport.c
===================================================================
--- sys/fs/nfsclient/nfs_clport.c
+++ sys/fs/nfsclient/nfs_clport.c
@@ -246,6 +246,8 @@
 		vp->v_type = VDIR;
 		vp->v_vflag |= VV_ROOT;
 	}
+
+	vp->v_vflag |= VV_VMSIZEVNLOCK;
 	np->n_fhp = nfhp;
 
 	/*
@@ -414,10 +416,7 @@
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	struct timespec mtime_save;
-	vm_object_t object;
-	u_quad_t nsize;
 	int error, force_fid_err;
-	bool setnsize;
 
 	error = 0;
@@ -565,27 +564,53 @@
 	if (np->n_attrstamp != 0)
 		KDTRACE_NFS_ATTRCACHE_LOAD_DONE(vp, vap, error);
 #endif
+	(void)ncl_pager_setsize(vp, NULL);
+	return (error);
+}
+
+/*
+ * Call vnode_pager_setsize() if the size of the node changed, as
+ * recorded in nfsnode vs. v_object, or delay the call if notifying
+ * the pager is not possible at the moment.
+ *
+ * If nsizep is non-NULL, the call is delayed and the new node size is
+ * provided.  Caller should itself call vnode_pager_setsize() if
+ * function returned true.  If nsizep is NULL, function tries to call
+ * vnode_pager_setsize() itself if needed and possible, and the nfs
+ * node is unlocked unconditionally, the return value is not useful.
+ */
+bool
+ncl_pager_setsize(struct vnode *vp, u_quad_t *nsizep)
+{
+	struct nfsnode *np;
+	vm_object_t object;
+	struct vattr *vap;
+	u_quad_t nsize;
+	bool setnsize;
+
+	np = VTONFS(vp);
+	NFSASSERTNODE(np);
+
+	vap = &np->n_vattr.na_vattr;
 	nsize = vap->va_size;
 	object = vp->v_object;
 	setnsize = false;
-	if (object != NULL) {
-		if (OFF_TO_IDX(nsize + PAGE_MASK) < object->size) {
-			/*
-			 * When shrinking the size, the call to
-			 * vnode_pager_setsize() cannot be done with
-			 * the mutex held, because we might need to
-			 * wait for a busy page.  Delay it until after
-			 * the node is unlocked.
-			 */
+
+	if (object != NULL && nsize != object->un_pager.vnp.vnp_size) {
+		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
 			setnsize = true;
-		} else {
+		else
+			np->n_flag |= NVNSETSZSKIP;
+	}
+	if (nsizep == NULL) {
+		NFSUNLOCKNODE(np);
+		if (setnsize)
 			vnode_pager_setsize(vp, nsize);
-		}
+		setnsize = false;
+	} else {
+		*nsizep = nsize;
 	}
-	NFSUNLOCKNODE(np);
-	if (setnsize)
-		vnode_pager_setsize(vp, nsize);
-	return (error);
+	return (setnsize);
 }
 
 /*
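For review purposes, the two calling conventions of ncl_pager_setsize() are illustrated below. This is a sketch only, not part of the patch; the example_* callers are hypothetical, and both assume the nfsnode mutex is held on entry, as NFSASSERTNODE() documents.

/* Sketch 1: nsizep == NULL, the helper consumes the node mutex. */
static void
example_load_path(struct vnode *vp)
{
	struct nfsnode *np;

	np = VTONFS(vp);
	NFSLOCKNODE(np);
	/* ... refresh np->n_vattr from the server reply ... */
	(void)ncl_pager_setsize(vp, NULL);	/* unlocks the nfsnode */
}

/* Sketch 2: nsizep != NULL, the pager call is deferred to the caller. */
static void
example_cache_path(struct vnode *vp)
{
	struct nfsnode *np;
	u_quad_t nsize;
	bool setnsize;

	np = VTONFS(vp);
	NFSLOCKNODE(np);
	setnsize = ncl_pager_setsize(vp, &nsize);
	/* ... more work that needs the node mutex ... */
	NFSUNLOCKNODE(np);
	if (setnsize)
		vnode_pager_setsize(vp, nsize);
}

The split exists because vnode_pager_setsize() may sleep on a busy page, which is not permitted under the node mutex. The helper therefore never calls the pager with the mutex held: it either issues the call after dropping the mutex itself, hands the call back to the caller, or, when the vnode is not exclusively locked, defers it entirely by setting NVNSETSZSKIP for nfs_lock() to resolve later.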
Index: sys/fs/nfsclient/nfs_clsubs.c
===================================================================
--- sys/fs/nfsclient/nfs_clsubs.c
+++ sys/fs/nfsclient/nfs_clsubs.c
@@ -185,6 +185,8 @@
 	struct vattr *vap;
 	struct nfsmount *nmp;
 	int timeo, mustflush;
+	u_quad_t nsize;
+	bool setnsize;
 
 	np = VTONFS(vp);
 	vap = &np->n_vattr.na_vattr;
@@ -230,6 +232,7 @@
 		return( ENOENT);
 	}
 	nfsstatsv1.attrcache_hits++;
+	setnsize = false;
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (np->n_flag & NMODIFIED) {
@@ -240,7 +243,7 @@
 			} else {
 				np->n_size = vap->va_size;
 			}
-			vnode_pager_setsize(vp, np->n_size);
+			setnsize = ncl_pager_setsize(vp, &nsize);
 		} else {
 			np->n_size = vap->va_size;
 		}
@@ -253,6 +256,8 @@
 		vaper->va_mtime = np->n_mtim;
 	}
 	NFSUNLOCKNODE(np);
+	if (setnsize)
+		vnode_pager_setsize(vp, nsize);
 	KDTRACE_NFS_ATTRCACHE_GET_HIT(vp, vap);
 	return (0);
 }
Index: sys/fs/nfsclient/nfs_clvnops.c
===================================================================
--- sys/fs/nfsclient/nfs_clvnops.c
+++ sys/fs/nfsclient/nfs_clvnops.c
@@ -142,6 +142,7 @@
 static vop_advlockasync_t nfs_advlockasync;
 static vop_getacl_t nfs_getacl;
 static vop_setacl_t nfs_setacl;
+static vop_lock1_t	nfs_lock;
 
 /*
  * Global vfs data structures for nfs
@@ -160,6 +161,7 @@
 	.vop_putpages =		ncl_putpages,
 	.vop_inactive =		ncl_inactive,
 	.vop_link =		nfs_link,
+	.vop_lock1 =		nfs_lock,
 	.vop_lookup =		nfs_lookup,
 	.vop_mkdir =		nfs_mkdir,
 	.vop_mknod =		nfs_mknod,
@@ -295,6 +297,73 @@
  * rep->r_mtx : Protects the fields in an nfsreq.
  */
 
+static int
+nfs_lock(struct vop_lock1_args *ap)
+{
+	struct vnode *vp;
+	struct nfsnode *np;
+	u_quad_t nsize;
+	int error, lktype;
+	bool onfault;
+
+	vp = ap->a_vp;
+	lktype = ap->a_flags & LK_TYPE_MASK;
+	error = VOP_LOCK1_APV(&default_vnodeops, ap);
+	if (error != 0 || vp->v_op != &newnfs_vnodeops)
+		return (error);
+	np = VTONFS(vp);
+	NFSLOCKNODE(np);
+	if ((np->n_flag & NVNSETSZSKIP) == 0 || (lktype != LK_SHARED &&
+	    lktype != LK_EXCLUSIVE && lktype != LK_UPGRADE &&
+	    lktype != LK_TRYUPGRADE)) {
+		NFSUNLOCKNODE(np);
+		return (0);
+	}
+	onfault = (ap->a_flags & LK_EATTR_MASK) == LK_NOWAIT &&
+	    (ap->a_flags & LK_INIT_MASK) == LK_CANRECURSE &&
+	    (lktype == LK_SHARED || lktype == LK_EXCLUSIVE);
+	if (onfault && vp->v_vnlock->lk_recurse == 0) {
+		/*
+		 * Force retry in vm_fault(), to make the lock request
+		 * sleepable, which allows us to piggy-back the
+		 * sleepable call to vnode_pager_setsize().
+		 */
+		NFSUNLOCKNODE(np);
+		VOP_UNLOCK(vp, 0);
+		return (EBUSY);
+	}
+	if ((ap->a_flags & LK_NOWAIT) != 0 ||
+	    (lktype == LK_SHARED && vp->v_vnlock->lk_recurse > 0)) {
+		NFSUNLOCKNODE(np);
+		return (0);
+	}
+	if (lktype == LK_SHARED) {
+		NFSUNLOCKNODE(np);
+		VOP_UNLOCK(vp, 0);
+		ap->a_flags &= ~(LK_TYPE_MASK | LK_INTERLOCK);
+		ap->a_flags |= LK_EXCLUSIVE;
+		error = VOP_LOCK1_APV(&default_vnodeops, ap);
+		if (error != 0 || vp->v_op != &newnfs_vnodeops)
+			return (error);
+		NFSLOCKNODE(np);
+		if ((np->n_flag & NVNSETSZSKIP) == 0) {
+			NFSUNLOCKNODE(np);
+			goto downgrade;
+		}
+	}
+	np->n_flag &= ~NVNSETSZSKIP;
+	nsize = np->n_size;
+	NFSUNLOCKNODE(np);
+	vnode_pager_setsize(vp, nsize);
+downgrade:
+	if (lktype == LK_SHARED) {
+		ap->a_flags &= ~(LK_TYPE_MASK | LK_INTERLOCK);
+		ap->a_flags |= LK_DOWNGRADE;
+		(void)VOP_LOCK1_APV(&default_vnodeops, ap);
+	}
+	return (0);
+}
+
 static int
 nfs34_access_otw(struct vnode *vp, int wmode, struct thread *td,
     struct ucred *cred, u_int32_t *retmode)
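The core of the shared-to-exclusive dance in nfs_lock() above, shown in isolation. This is a hedged sketch, not the patch's code path: example_resolve_setsize() is hypothetical, and the reclaim (v_op) and NVNSETSZSKIP re-checks are omitted.

static void
example_resolve_setsize(struct vnode *vp)
{
	struct nfsnode *np;
	u_quad_t nsize;

	/* A shared vnode lock cannot be upgraded here: drop and relock. */
	VOP_UNLOCK(vp, 0);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	np = VTONFS(vp);
	NFSLOCKNODE(np);
	np->n_flag &= ~NVNSETSZSKIP;
	nsize = np->n_size;
	NFSUNLOCKNODE(np);

	/* Exclusive vnode lock held, node mutex dropped: safe to call. */
	vnode_pager_setsize(vp, nsize);

	/* Give the caller back the shared lock it asked for. */
	VOP_LOCK(vp, LK_DOWNGRADE);
}

The real implementation instead calls VOP_LOCK1_APV(&default_vnodeops, ap) directly, so that re-locking does not recurse into nfs_lock() itself, and it must re-test v_op and NVNSETSZSKIP after each re-lock because the vnode may be reclaimed, or the flag consumed by another thread, while the lock is dropped.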
Index: sys/fs/nfsclient/nfsnode.h
===================================================================
--- sys/fs/nfsclient/nfsnode.h
+++ sys/fs/nfsclient/nfsnode.h
@@ -163,6 +163,7 @@
 #define	NWRITEOPENED	0x00040000  /* Has been opened for writing */
 #define	NHASBEENLOCKED	0x00080000  /* Has been file locked. */
 #define	NDSCOMMIT	0x00100000  /* Commit is done via the DS. */
+#define	NVNSETSZSKIP	0x00200000  /* Skipped vnode_pager_setsize() */
 
 /*
  * Convert between nfsnode pointers and vnode pointers
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h
+++ sys/sys/mount.h
@@ -411,6 +411,7 @@
 #define	MNTK_UNMAPPED_BUFS	0x00002000
 #define	MNTK_USES_BCACHE	0x00004000 /* FS uses the buffer cache. */
 #define	MNTK_TEXT_REFS		0x00008000 /* Keep use ref for text */
+#define	MNTK_VMSETSIZE_BUG	0x00010000
 #define	MNTK_NOASYNC	0x00800000	/* disable async */
 #define	MNTK_UNMOUNT	0x01000000	/* unmount in progress */
 #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -247,6 +247,7 @@
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_ETERNALDEV	0x0008	/* device that is never destroyed */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
+#define	VV_VMSIZEVNLOCK	0x0020	/* object size check requires vnode lock */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
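The three new flags cooperate across layers: VV_VMSIZEVNLOCK is a filesystem's per-vnode opt-in, OBJ_SIZEVNLOCK (added in vm_object.h below) mirrors it on the backing VM object, and MNTK_VMSETSIZE_BUG is a per-mount opt-out from the new vnode_pager_setsize() locking assertion. A minimal opt-in sketch for a hypothetical filesystem (examplefs is not part of the patch):

/*
 * The vflag must be set before vnode_pager_alloc() creates the VM
 * object, e.g. while the new vnode is still private to its creator;
 * vnode_pager_alloc() then translates it into OBJ_SIZEVNLOCK, which
 * vm_fault() consults before trusting object->size.
 */
static void
examplefs_init_vnode(struct vnode *vp)
{

	vp->v_vflag |= VV_VMSIZEVNLOCK;
}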
+ */ + error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); + if (error == 0) { + fs->vp = vp; + return (KERN_SUCCESS); + } + + vhold(vp); + release_page(fs); + unlock_and_deallocate(fs); + error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); + vdrop(vp); + fs->vp = vp; + KASSERT(error == 0, ("vm_fault: vget failed %d", error)); + return (KERN_RESOURCE_SHORTAGE); +} + int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { struct faultstate fs; - struct vnode *vp; struct domainset *dset; vm_object_t next_object, retry_object; vm_offset_t e_end, e_start; vm_pindex_t retry_pindex; vm_prot_t prot, retry_prot; - int ahead, alloc_req, behind, cluster_offset, error, era, faultcount; - int locked, nera, oom, result, rv; + int ahead, alloc_req, behind, cluster_offset, era, faultcount; + int nera, oom, result, rv; u_char behavior; boolean_t wired; /* Passed by reference. */ bool dead, hardfault, is_first_object_locked; @@ -844,6 +887,13 @@ */ if (fs.object->type != OBJT_DEFAULT || fs.object == fs.first_object) { + if ((fs.object->flags & OBJ_SIZEVNLOCK) != 0) { + rv = vm_fault_lock_vnode(&fs); + MPASS(rv == KERN_SUCCESS || + rv == KERN_RESOURCE_SHORTAGE); + if (rv == KERN_RESOURCE_SHORTAGE) + goto RetryFault; + } if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_OUT_OF_BOUNDS); @@ -1001,41 +1051,11 @@ */ unlock_map(&fs); - if (fs.object->type == OBJT_VNODE && - (vp = fs.object->handle) != fs.vp) { - /* - * Perform an unlock in case the desired vnode - * changed while the map was unlocked during a - * retry. - */ - unlock_vp(&fs); - - locked = VOP_ISLOCKED(vp); - if (locked != LK_EXCLUSIVE) - locked = LK_SHARED; - - /* - * We must not sleep acquiring the vnode lock - * while we have the page exclusive busied or - * the object's paging-in-progress count - * incremented. Otherwise, we could deadlock. 
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -186,6 +186,7 @@
 #define	OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* dont split this object */
 #define	OBJ_UMTXDEAD	0x0020		/* umtx pshared was terminated */
+#define	OBJ_SIZEVNLOCK	0x0040		/* lock vnode to check obj size */
 #define	OBJ_PG_DTOR	0x0080		/* dont reset object, leave that for dtor */
 #define	OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
 #define	OBJ_TMPFS_NODE	0x0200		/* object belongs to tmpfs VREG node */
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -269,8 +269,12 @@
 		object->un_pager.vnp.vnp_size = size;
 		object->un_pager.vnp.writemappings = 0;
 		object->domain.dr_policy = vnode_domainset;
-
 		object->handle = handle;
+		if ((vp->v_vflag & VV_VMSIZEVNLOCK) != 0) {
+			VM_OBJECT_WLOCK(object);
+			vm_object_set_flag(object, OBJ_SIZEVNLOCK);
+			VM_OBJECT_WUNLOCK(object);
+		}
 		VI_LOCK(vp);
 		if (vp->v_object != NULL) {
 			/*
@@ -440,7 +444,16 @@
 
 	if ((object = vp->v_object) == NULL)
 		return;
-/*	ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */
+#ifdef DEBUG_VFS_LOCKS
+	{
+		struct mount *mp;
+
+		mp = vp->v_mount;
+		if (mp != NULL && (mp->mnt_kern_flag & MNTK_VMSETSIZE_BUG) == 0)
+			assert_vop_elocked(vp,
+			    "vnode_pager_setsize and not locked vnode");
+	}
+#endif
 	VM_OBJECT_WLOCK(object);
 	if (object->type == OBJT_DEAD) {
 		VM_OBJECT_WUNLOCK(object);
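With the assertion above enabled under DEBUG_VFS_LOCKS, the calling discipline the pager now expects can be summarized by this sketch (examplefs is hypothetical, not part of the patch):

static void
examplefs_update_size(struct vnode *vp, u_quad_t nsize)
{

	/* This is what vnode_pager_setsize() now asserts itself. */
	ASSERT_VOP_ELOCKED(vp, "examplefs_update_size");
	vnode_pager_setsize(vp, nsize);
}

Filesystems that cannot yet guarantee the exclusive lock at every call site, such as ZFS in the first hunk of this patch, set MNTK_VMSETSIZE_BUG on the mount to suppress the assertion for that mount only, rather than losing DEBUG_VFS_LOCKS coverage globally.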