Index: sys/compat/linux/linux_misc.c =================================================================== --- sys/compat/linux/linux_misc.c +++ sys/compat/linux/linux_misc.c @@ -263,7 +263,7 @@ unsigned long bss_size; char *library; ssize_t aresid; - int error, locked, writecount; + int error, locked; LCONVPATHEXIST(td, args->library, &library); @@ -292,15 +292,6 @@ */ locked = 1; - /* Writable? */ - error = VOP_GET_WRITECOUNT(vp, &writecount); - if (error != 0) - goto cleanup; - if (writecount != 0) { - error = ETXTBSY; - goto cleanup; - } - /* Executable? */ error = VOP_GETATTR(vp, &attr, td->td_ucred); if (error) @@ -400,10 +391,10 @@ /* * Prevent more writers. - * XXX: Note that if any of the VM operations fail below we don't - * clear this flag. */ - VOP_SET_TEXT(vp); + error = VOP_SET_TEXT(vp); + if (error != 0) + goto cleanup; /* * Lock no longer needed @@ -437,6 +428,7 @@ td->td_ucred, NOCRED, &aresid, td); if (error != 0) goto cleanup; + VOP_SET_TEXT_SUCCEED(vp); if (aresid != 0) { error = ENOEXEC; goto cleanup; @@ -455,11 +447,13 @@ * Map it all into the process's space as a single * copy-on-write "data" segment. */ + /* XXXKIB */ error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset); if (error) goto cleanup; + VOP_SET_TEXT_SUCCEED(vp); } #ifdef DEBUG printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0], @@ -482,6 +476,7 @@ /* Unlock vnode if needed */ if (locked) VOP_UNLOCK(vp, 0); + VOP_UNSET_TEXT_SUCCEED(vp); /* Release the temporary mapping. */ if (a_out) Index: sys/fs/nfsclient/nfs_clbio.c =================================================================== --- sys/fs/nfsclient/nfs_clbio.c +++ sys/fs/nfsclient/nfs_clbio.c @@ -1639,7 +1639,7 @@ } } /* ASSERT_VOP_LOCKED(vp, "ncl_doio"); */ - if (p && (vp->v_vflag & VV_TEXT)) { + if (p && vp->v_writecount == -1) { mtx_lock(&np->n_mtx); if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.na_mtime)) { mtx_unlock(&np->n_mtx); Index: sys/fs/nfsclient/nfs_clvnops.c =================================================================== --- sys/fs/nfsclient/nfs_clvnops.c +++ sys/fs/nfsclient/nfs_clvnops.c @@ -3442,8 +3442,7 @@ np->n_mtime = np->n_vattr.na_mtime; mtx_unlock(&np->n_mtx); - vp->v_vflag |= VV_TEXT; - return (0); + return (vop_stdset_text(ap)); } /* Index: sys/fs/nullfs/null_vnops.c =================================================================== --- sys/fs/nullfs/null_vnops.c +++ sys/fs/nullfs/null_vnops.c @@ -331,26 +331,6 @@ return (error); } -static int -null_add_writecount(struct vop_add_writecount_args *ap) -{ - struct vnode *lvp, *vp; - int error; - - vp = ap->a_vp; - lvp = NULLVPTOLOWERVP(vp); - KASSERT(vp->v_writecount + ap->a_inc >= 0, ("wrong writecount inc")); - if (vp->v_writecount > 0 && vp->v_writecount + ap->a_inc == 0) - error = VOP_ADD_WRITECOUNT(lvp, -1); - else if (vp->v_writecount == 0 && vp->v_writecount + ap->a_inc > 0) - error = VOP_ADD_WRITECOUNT(lvp, 1); - else - error = 0; - if (error == 0) - vp->v_writecount += ap->a_inc; - return (error); -} - /* * We have to carry on the locking protocol on the null layer vnodes * as we progress through the tree. We also have to enforce read-only @@ -804,13 +784,6 @@ vp->v_vnlock = &vp->v_lock; VI_UNLOCK(vp); - /* - * If we were opened for write, we leased one write reference - * to the lower vnode. If this is a reclamation due to the - * forced unmount, undo the reference now. - */ - if (vp->v_writecount > 0) - VOP_ADD_WRITECOUNT(lowervp, -1); if ((xp->null_flags & NULLV_NOUNLOCK) != 0) vunref(lowervp); else @@ -938,5 +911,4 @@ .vop_unlock = null_unlock, .vop_vptocnp = null_vptocnp, .vop_vptofh = null_vptofh, - .vop_add_writecount = null_add_writecount, }; Index: sys/fs/unionfs/union_subr.c =================================================================== --- sys/fs/unionfs/union_subr.c +++ sys/fs/unionfs/union_subr.c @@ -941,10 +941,14 @@ vput(vp); goto unionfs_vn_create_on_upper_free_out1; } - VOP_ADD_WRITECOUNT(vp, 1); + error = VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); - *vpp = vp; + if (error == 0) { + *vpp = vp; + } else { + VOP_CLOSE(vp, fmode, cred, td); + } unionfs_vn_create_on_upper_free_out1: VOP_UNLOCK(udvp, LK_RELEASE); @@ -1078,7 +1082,7 @@ } } VOP_CLOSE(uvp, FWRITE, cred, td); - VOP_ADD_WRITECOUNT(uvp, -1); + VOP_ADD_WRITECOUNT_SUCCEED(uvp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, uvp, uvp->v_writecount); Index: sys/kern/imgact_aout.c =================================================================== --- sys/kern/imgact_aout.c +++ sys/kern/imgact_aout.c @@ -247,8 +247,8 @@ /* data + bss can't exceed rlimit */ a_out->a_data + bss_size > lim_cur_proc(imgp->proc, RLIMIT_DATA) || racct_set(imgp->proc, RACCT_DATA, a_out->a_data + bss_size) != 0) { - PROC_UNLOCK(imgp->proc); - return (ENOMEM); + PROC_UNLOCK(imgp->proc); + return (ENOMEM); } PROC_UNLOCK(imgp->proc); @@ -267,7 +267,7 @@ */ error = exec_new_vmspace(imgp, &aout_sysvec); - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error) return (error); @@ -286,12 +286,13 @@ file_offset, virtual_offset, text_end, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, - MAP_COPY_ON_WRITE | MAP_PREFAULT); + MAP_COPY_ON_WRITE | MAP_PREFAULT | MAP_VN_EXEC); if (error) { vm_map_unlock(map); vm_object_deallocate(object); return (error); } + VOP_SET_TEXT_SUCCEED(imgp->vp); data_end = text_end + a_out->a_data; if (a_out->a_data) { vm_object_reference(object); @@ -299,12 +300,13 @@ file_offset + a_out->a_text, text_end, data_end, VM_PROT_ALL, VM_PROT_ALL, - MAP_COPY_ON_WRITE | MAP_PREFAULT); + MAP_COPY_ON_WRITE | MAP_PREFAULT | MAP_VN_EXEC); if (error) { vm_map_unlock(map); vm_object_deallocate(object); return (error); } + VOP_SET_TEXT_SUCCEED(imgp->vp); } if (bss_size) { Index: sys/kern/imgact_elf.c =================================================================== --- sys/kern/imgact_elf.c +++ sys/kern/imgact_elf.c @@ -449,7 +449,7 @@ /* * Create the page if it doesn't exist yet. Ignore errors. */ - vm_map_fixed(map, NULL, 0, trunc_page(start), round_page(end) - + error = vm_map_fixed(map, NULL, 0, trunc_page(start), round_page(end) - trunc_page(start), VM_PROT_ALL, VM_PROT_ALL, MAP_CHECK_EXCL); /* @@ -526,13 +526,16 @@ } else { vm_object_reference(object); rv = vm_map_fixed(map, object, offset, start, end - start, - prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL); + prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL | + (object != NULL ? MAP_VN_EXEC : 0)); if (rv != KERN_SUCCESS) { locked = VOP_ISLOCKED(imgp->vp); VOP_UNLOCK(imgp->vp, 0); vm_object_deallocate(object); vn_lock(imgp->vp, locked | LK_RETRY); return (rv); + } else if (object != NULL) { + VOP_SET_TEXT_SUCCEED(imgp->vp); } } return (KERN_SUCCESS); @@ -589,13 +592,8 @@ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); - rv = __elfN(map_insert)(imgp, map, - object, - file_addr, /* file offset */ - map_addr, /* virtual start */ - map_addr + map_len,/* virtual end */ - prot, - cow); + rv = __elfN(map_insert)(imgp, map, object, file_addr, + map_addr, map_addr + map_len, prot, cow); if (rv != KERN_SUCCESS) return (EINVAL); @@ -716,7 +714,7 @@ struct nameidata *nd; struct vattr *attr; struct image_params *imgp; - u_long flags, rbase; + u_long rbase; u_long base_addr = 0; int error; @@ -744,10 +742,8 @@ imgp->object = NULL; imgp->execlabel = NULL; - flags = FOLLOW | LOCKSHARED | LOCKLEAF; - -again: - NDINIT(nd, LOOKUP, flags, UIO_SYSSPACE, file, curthread); + NDINIT(nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, file, + curthread); if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; @@ -762,27 +758,6 @@ if (error) goto fail; - /* - * Also make certain that the interpreter stays the same, - * so set its VV_TEXT flag, too. Since this function is only - * used to load the interpreter, the VV_TEXT is almost always - * already set. - */ - if (VOP_IS_TEXT(nd->ni_vp) == 0) { - if (VOP_ISLOCKED(nd->ni_vp) != LK_EXCLUSIVE) { - /* - * LK_UPGRADE could have resulted in dropping - * the lock. Just try again from the start, - * this time with exclusive vnode lock. - */ - vput(nd->ni_vp); - flags &= ~LOCKSHARED; - goto again; - } - - VOP_SET_TEXT(nd->ni_vp); - } - error = exec_map_first_page(imgp); if (error) goto fail; @@ -825,9 +800,10 @@ if (imgp->firstpage) exec_unmap_first_page(imgp); - if (nd->ni_vp) + if (nd->ni_vp) { + VOP_UNSET_TEXT_SUCCEED(nd->ni_vp); vput(nd->ni_vp); - + } free(tempdata, M_TEMP); return (error); @@ -957,9 +933,12 @@ interp_name_len = phdr->p_filesz; if (phdr->p_offset > PAGE_SIZE || interp_name_len > PAGE_SIZE - phdr->p_offset) { - VOP_UNLOCK(imgp->vp, 0); - interp = malloc(interp_name_len + 1, M_TEMP, M_WAITOK); - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); + interp = malloc(interp_name_len + 1, M_TEMP, M_NOWAIT); + if (interp == NULL) { + VOP_UNLOCK(imgp->vp, 0); + interp = malloc(interp_name_len + 1, M_TEMP, M_WAITOK); + vn_lock(imgp->vp, LK_SHARED | LK_RETRY); + } error = vn_rdwr(UIO_READ, imgp->vp, interp, interp_name_len, phdr->p_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, @@ -1225,7 +1204,7 @@ maxv / 2, 1UL << flsl(maxalign)); } - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto ret; @@ -1269,7 +1248,7 @@ } error = __elfN(load_interp)(imgp, brand_info, interp, &addr, &imgp->entry_addr); - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto ret; } else @@ -1278,7 +1257,12 @@ /* * Construct auxargs table (used by the fixup routine) */ - elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); + elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_NOWAIT); + if (elf_auxargs == NULL) { + VOP_UNLOCK(imgp->vp, 0); + elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); + vn_lock(imgp->vp, LK_SHARED | LK_RETRY); + } elf_auxargs->execfd = -1; elf_auxargs->phdr = proghdr + et_dyn_addr; elf_auxargs->phent = hdr->e_phentsize; @@ -2558,9 +2542,12 @@ ASSERT_VOP_LOCKED(imgp->vp, "parse_notes"); if (pnote->p_offset > PAGE_SIZE || pnote->p_filesz > PAGE_SIZE - pnote->p_offset) { - VOP_UNLOCK(imgp->vp, 0); - buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK); - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); + buf = malloc(pnote->p_filesz, M_TEMP, M_NOWAIT); + if (buf == NULL) { + VOP_UNLOCK(imgp->vp, 0); + buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK); + vn_lock(imgp->vp, LK_SHARED | LK_RETRY); + } error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz, pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, NOCRED, NULL, curthread); Index: sys/kern/kern_exec.c =================================================================== --- sys/kern/kern_exec.c +++ sys/kern/kern_exec.c @@ -375,7 +375,6 @@ #endif struct vnode *oldtextvp = NULL, *newtextvp; int credential_changing; - int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; @@ -423,8 +422,8 @@ * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { - NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME - | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); + NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW | + SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); @@ -457,13 +456,14 @@ error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp); if (error) goto exec_fail; - vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(newtextvp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* - * Check file permissions (also 'opens' file) + * Check file permissions. Also 'opens' file and sets its to + * text mode. */ error = exec_check_permissions(imgp); if (error) @@ -473,16 +473,6 @@ if (imgp->object != NULL) vm_object_reference(imgp->object); - /* - * Set VV_TEXT now so no one can write to the executable while we're - * activating it. - * - * Remember if this was set before and unset it in case this is not - * actually an executable image. - */ - textset = VOP_IS_TEXT(imgp->vp); - VOP_SET_TEXT(imgp->vp); - error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; @@ -610,11 +600,8 @@ } if (error) { - if (error == -1) { - if (textset == 0) - VOP_UNSET_TEXT(imgp->vp); + if (error == -1) error = ENOEXEC; - } goto exec_fail_dealloc; } @@ -630,7 +617,7 @@ * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ - VOP_UNSET_TEXT(imgp->vp); + VOP_UNSET_TEXT_SUCCEED(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); @@ -886,6 +873,8 @@ NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); + if (imgp->textset) + VOP_UNSET_TEXT_SUCCEED(imgp->vp); if (error != 0) vput(imgp->vp); else @@ -1706,7 +1695,7 @@ struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; - int error, writecount; + int error; td = curthread; @@ -1750,12 +1739,17 @@ /* * Check number of open-for-writes on the file and deny execution * if there are any. + * + * Set VV_TEXT now so no one can write to the executable while we're + * activating it. + * + * Remember if this was set before and unset it in case this is not + * actually an executable image. */ - error = VOP_GET_WRITECOUNT(vp, &writecount); + error = VOP_SET_TEXT(vp); if (error != 0) return (error); - if (writecount != 0) - return (ETXTBSY); + imgp->textset = true; /* * Call filesystem specific open routine (which does nothing in the Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -81,9 +81,7 @@ #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) static int vop_stdis_text(struct vop_is_text_args *ap); -static int vop_stdset_text(struct vop_set_text_args *ap); static int vop_stdunset_text(struct vop_unset_text_args *ap); -static int vop_stdget_writecount(struct vop_get_writecount_args *ap); static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); static int vop_stdfdatasync(struct vop_fdatasync_args *ap); static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); @@ -141,7 +139,6 @@ .vop_is_text = vop_stdis_text, .vop_set_text = vop_stdset_text, .vop_unset_text = vop_stdunset_text, - .vop_get_writecount = vop_stdget_writecount, .vop_add_writecount = vop_stdadd_writecount, }; @@ -1070,39 +1067,63 @@ vop_stdis_text(struct vop_is_text_args *ap) { - return ((ap->a_vp->v_vflag & VV_TEXT) != 0); + return (ap->a_vp->v_writecount < 0); } -static int +int vop_stdset_text(struct vop_set_text_args *ap) { + struct vnode *vp; + int error; - ap->a_vp->v_vflag |= VV_TEXT; - return (0); + vp = ap->a_vp; + VI_LOCK(vp); + if (vp->v_writecount > 0) { + error = ETXTBSY; + } else { + vp->v_writecount--; + error = 0; + } + VI_UNLOCK(vp); + return (error); } static int vop_stdunset_text(struct vop_unset_text_args *ap) { + struct vnode *vp; + int error; - ap->a_vp->v_vflag &= ~VV_TEXT; - return (0); -} - -static int -vop_stdget_writecount(struct vop_get_writecount_args *ap) -{ - - *ap->a_writecount = ap->a_vp->v_writecount; - return (0); + vp = ap->a_vp; + VI_LOCK(vp); + if (vp->v_writecount < 0) { + vp->v_writecount++; + error = 0; + } else { + error = EINVAL; + } + VI_UNLOCK(vp); + return (error); } static int vop_stdadd_writecount(struct vop_add_writecount_args *ap) { + struct vnode *vp; + int error; - ap->a_vp->v_writecount += ap->a_inc; - return (0); + vp = ap->a_vp; + VI_LOCK(vp); + if (vp->v_writecount < 0) { + error = ETXTBSY; + } else { + VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp, + ("neg writecount increment %d", ap->a_inc)); + vp->v_writecount += ap->a_inc; + error = 0; + } + VI_UNLOCK(vp); + return (error); } /* Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -3491,8 +3491,6 @@ strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); - if (vp->v_vflag & VV_TEXT) - strlcat(buf, "|VV_TEXT", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) @@ -3508,7 +3506,7 @@ if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | - VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | + VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); Index: sys/kern/vfs_vnops.c =================================================================== --- sys/kern/vfs_vnops.c +++ sys/kern/vfs_vnops.c @@ -335,24 +335,26 @@ accmode &= ~(VCREAT | VVERIFY); #endif - if ((fmode & O_CREAT) == 0) { - if (accmode & VWRITE) { - error = vn_writechk(vp); - if (error) - return (error); - } - if (accmode) { - error = VOP_ACCESS(vp, accmode, cred, td); - if (error) - return (error); - } + if ((fmode & O_CREAT) == 0 && accmode != 0) { + error = VOP_ACCESS(vp, accmode, cred, td); + if (error != 0) + return (error); } if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); - if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0) - return (error); - - while ((fmode & (O_EXLOCK | O_SHLOCK)) != 0) { + if ((fmode & FWRITE) != 0) { + error = VOP_ADD_WRITECOUNT(vp, 1); + if (error != 0) + return (error); + CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", + __func__, vp, vp->v_writecount); + } + for (;;) { + error = VOP_OPEN(vp, fmode, cred, td, fp); + if (error != 0) + break; + if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0) + break; KASSERT(fp != NULL, ("open with flock requires fp")); if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) { error = EOPNOTSUPP; @@ -374,36 +376,17 @@ if (error == 0) fp->f_flag |= FHASLOCK; vn_lock(vp, lock_flags | LK_RETRY); - if (error != 0) - break; - if ((vp->v_iflag & VI_DOOMED) != 0) { + if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) error = ENOENT; - break; - } - - /* - * Another thread might have used this vnode as an - * executable while the vnode lock was dropped. - * Ensure the vnode is still able to be opened for - * writing after the lock has been obtained. - */ - if ((accmode & VWRITE) != 0) - error = vn_writechk(vp); break; } - if (error != 0) { - fp->f_flag |= FOPENFAILED; fp->f_vnode = vp; if (fp->f_ops == &badfileops) { fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; } vref(vp); - } else if ((fmode & FWRITE) != 0) { - VOP_ADD_WRITECOUNT(vp, 1); - CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", - __func__, vp, vp->v_writecount); } ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); return (error); @@ -412,6 +395,7 @@ /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. + * It is racy. */ int vn_writechk(struct vnode *vp) @@ -444,14 +428,13 @@ lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; + error = 0; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); - if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { - VNASSERT(vp->v_writecount > 0, vp, - ("vn_close: negative writecount")); - VOP_ADD_WRITECOUNT(vp, -1); + if ((flags & FWRITE) != 0) { + VOP_ADD_WRITECOUNT_SUCCEED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } @@ -1319,13 +1302,14 @@ if (error) goto out; #endif - error = vn_writechk(vp); + error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { VATTR_NULL(&vattr); vattr.va_size = length; if ((fp->f_flag & O_FSYNC) != 0) vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, fp->f_cred); + VOP_ADD_WRITECOUNT_SUCCEED(vp, -1); } out: VOP_UNLOCK(vp, 0); Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -688,29 +688,21 @@ }; -%% set_text vp E E E +%% set_text vp = = = vop_set_text { IN struct vnode *vp; }; -%% vop_unset_text vp E E E +%% vop_unset_text vp = = = vop_unset_text { IN struct vnode *vp; }; -%% get_writecount vp L L L - -vop_get_writecount { - IN struct vnode *vp; - OUT int *writecount; -}; - - -%% add_writecount vp E E E +%% add_writecount vp L L L vop_add_writecount { IN struct vnode *vp; Index: sys/sys/fcntl.h =================================================================== --- sys/sys/fcntl.h +++ sys/sys/fcntl.h @@ -145,8 +145,6 @@ /* Only for devfs d_close() flags. */ #define FLASTCLOSE O_DIRECTORY #define FREVOKE O_VERIFY -/* Only for fo_close() from half-succeeded open */ -#define FOPENFAILED O_TTY_INIT /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ #define FFLAGS(oflags) ((oflags) & O_EXEC ? (oflags) : (oflags) + 1) Index: sys/sys/imgact.h =================================================================== --- sys/sys/imgact.h +++ sys/sys/imgact.h @@ -89,6 +89,7 @@ u_long stack_sz; struct ucred *newcred; /* new credentials if changing */ bool credential_setid; /* true if becoming setid */ + bool textset; u_int map_flags; }; Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -244,7 +244,6 @@ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ -#define VV_TEXT 0x0020 /* vnode is a pure text prototype */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ @@ -749,6 +748,7 @@ int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); +int vop_stdset_text(struct vop_set_text_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); @@ -828,6 +828,33 @@ #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) +#ifdef INVARIANTS +#define VOP_ADD_WRITECOUNT_SUCCEED(vp, cnt) \ +do { \ + int error_; \ + \ + error_ = VOP_ADD_WRITECOUNT((vp), (cnt)); \ + MPASS(error_ == 0); \ +} while (0) +#define VOP_SET_TEXT_SUCCEED(vp) \ +do { \ + int error_; \ + \ + error_ = VOP_SET_TEXT((vp)); \ + MPASS(error_ == 0); \ +} while (0) +#define VOP_UNSET_TEXT_SUCCEED(vp) \ +do { \ + int error_; \ + \ + error_ = VOP_UNSET_TEXT((vp)); \ + MPASS(error_ == 0); \ +} while (0) +#else +#define VOP_ADD_WRITECOUNT_SUCCEED(vp, cnt) VOP_ADD_WRITECOUNT((vp), (cnt)) +#define VOP_SET_TEXT_SUCCEED(vp) VOP_SET_TEXT((vp)) +#define VOP_UNSET_TEXT_SUCCEED(vp) VOP_UNSET_TEXT((vp)) +#endif void vput(struct vnode *vp); void vrele(struct vnode *vp); Index: sys/ufs/ufs/ufs_extattr.c =================================================================== --- sys/ufs/ufs/ufs_extattr.c +++ sys/ufs/ufs/ufs_extattr.c @@ -338,7 +338,12 @@ return (error); } - VOP_ADD_WRITECOUNT(vp, 1); + error = VOP_ADD_WRITECOUNT(vp, 1); + if (error != 0) { + VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td); + VOP_UNLOCK(vp, 0); + return (error); + } CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); Index: sys/vm/vm_map.h =================================================================== --- sys/vm/vm_map.h +++ sys/vm/vm_map.h @@ -136,7 +136,7 @@ #define MAP_ENTRY_IN_TRANSITION 0x0100 /* entry being changed */ #define MAP_ENTRY_NEEDS_WAKEUP 0x0200 /* waiters in transition */ #define MAP_ENTRY_NOCOREDUMP 0x0400 /* don't include in a core */ - +#define MAP_ENTRY_VN_EXEC 0x0800 /* text vnode mapping */ #define MAP_ENTRY_GROWS_DOWN 0x1000 /* Top-down stacks */ #define MAP_ENTRY_GROWS_UP 0x2000 /* Bottom-up stacks */ @@ -352,6 +352,7 @@ #define MAP_ACC_NO_CHARGE 0x8000 #define MAP_CREATE_STACK_GAP_UP 0x10000 #define MAP_CREATE_STACK_GAP_DN 0x20000 +#define MAP_VN_EXEC 0x40000 /* * vm_fault option flags Index: sys/vm/vm_map.c =================================================================== --- sys/vm/vm_map.c +++ sys/vm/vm_map.c @@ -506,6 +506,37 @@ map->timestamp++; } +static void +vm_map_entry_inc_vnode_text(vm_map_entry_t entry, bool add) +{ + vm_object_t object, object1; + struct vnode *vp; + + if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0) + return; + KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, + ("Submap with execs")); + object = entry->object.vm_object; + KASSERT(object != NULL, ("No object for text, entry %p", entry)); + VM_OBJECT_RLOCK(object); + while ((object1 = object->backing_object) != NULL) { + VM_OBJECT_RLOCK(object1); + VM_OBJECT_RUNLOCK(object); + object = object1; + } + MPASS(((object->flags & OBJ_TMPFS) == 0 && + object->type == OBJT_VNODE) || + ((object->flags & OBJ_TMPFS) != 0 && + object->type == OBJT_SWAP)); + vp = (object->flags & OBJ_TMPFS) == 0 ? object->handle : + object->un_pager.swp.swp_tmpfs; + if (add) + VOP_SET_TEXT_SUCCEED(vp); + else + VOP_UNSET_TEXT_SUCCEED(vp); + VM_OBJECT_RUNLOCK(object); +} + static void vm_map_process_deferred(void) { @@ -518,6 +549,9 @@ td->td_map_def_user = NULL; while (entry != NULL) { next = entry->next; + MPASS((entry->eflags & (MAP_ENTRY_VN_WRITECNT | + MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_VN_WRITECNT | + MAP_ENTRY_VN_EXEC)); if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) { /* * Decrement the object's writemappings and @@ -530,6 +564,7 @@ vnode_pager_release_writecount(object, entry->start, entry->end); } + vm_map_entry_inc_vnode_text(entry, false); vm_map_entry_deallocate(entry, FALSE); entry = next; } @@ -1372,6 +1407,8 @@ protoeflags |= MAP_ENTRY_GROWS_UP; if (cow & MAP_VN_WRITECOUNT) protoeflags |= MAP_ENTRY_VN_WRITECNT; + if (cow & MAP_VN_EXEC) + protoeflags |= MAP_ENTRY_VN_EXEC; if ((cow & MAP_CREATE_GUARD) != 0) protoeflags |= MAP_ENTRY_GUARD; if ((cow & MAP_CREATE_STACK_GAP_DN) != 0) @@ -1415,7 +1452,8 @@ VM_OBJECT_WUNLOCK(object); } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) == protoeflags && - (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 && + (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP | + MAP_VN_EXEC)) == 0 && prev_entry->end == start && (prev_entry->cred == cred || (prev_entry->object.vm_object != NULL && prev_entry->object.vm_object->cred == cred)) && @@ -1937,7 +1975,7 @@ * another entry. */ #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \ - MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP) + MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC) static bool vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry) @@ -2088,6 +2126,7 @@ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); + vm_map_entry_inc_vnode_text(new_entry, true); /* * The object->un_pager.vnp.writemappings for the * object of MAP_ENTRY_VN_WRITECNT type entry shall be @@ -2170,6 +2209,7 @@ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); + vm_map_entry_inc_vnode_text(new_entry, true); } } @@ -3662,6 +3702,7 @@ */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, fork_charge); + dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC; } } @@ -3840,6 +3881,7 @@ vnode_pager_update_writecount(object, new_entry->start, new_entry->end); } + vm_map_entry_inc_vnode_text(new_entry, true); /* * Insert the entry into the new map -- we know we're @@ -3876,6 +3918,7 @@ vmspace_map_entry_forked(vm1, vm2, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry, fork_charge); + vm_map_entry_inc_vnode_text(new_entry, true); break; case VM_INHERIT_ZERO: @@ -3890,7 +3933,7 @@ new_entry->end = old_entry->end; new_entry->eflags = old_entry->eflags & ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | - MAP_ENTRY_VN_WRITECNT); + MAP_ENTRY_VN_WRITECNT | MAP_ENTRY_VN_EXEC); new_entry->protection = old_entry->protection; new_entry->max_protection = old_entry->max_protection; new_entry->inheritance = VM_INHERIT_ZERO; Index: sys/vm/vm_mmap.c =================================================================== --- sys/vm/vm_mmap.c +++ sys/vm/vm_mmap.c @@ -1202,14 +1202,13 @@ vm_object_t obj; vm_ooffset_t foff; struct ucred *cred; - int error, flags, locktype; + int error, flags; + bool writex; cred = td->td_ucred; - if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED)) - locktype = LK_EXCLUSIVE; - else - locktype = LK_SHARED; - if ((error = vget(vp, locktype, td)) != 0) + writex = (*maxprotp & VM_PROT_WRITE) != 0 && + (*flagsp & MAP_SHARED) != 0; + if ((error = vget(vp, LK_SHARED, td)) != 0) return (error); AUDIT_ARG_VNODE1(vp); foff = *foffp; @@ -1230,11 +1229,11 @@ * Bypass filesystems obey the mpsafety of the * underlying fs. Tmpfs never bypasses. */ - error = vget(vp, locktype, td); + error = vget(vp, LK_SHARED, td); if (error != 0) return (error); } - if (locktype == LK_EXCLUSIVE) { + if (writex) { *writecounted = TRUE; vnode_pager_update_writecount(obj, 0, objsize); } Index: sys/vm/vm_object.c =================================================================== --- sys/vm/vm_object.c +++ sys/vm/vm_object.c @@ -494,11 +494,11 @@ umtx_shm_object_terminated(object); /* - * The test for text of vp vnode does not need a bypass to - * reach right VV_TEXT there, since it is obtained from - * object->handle. + * If vp->v_writecount == -1 (VV_TEXT surrogate) and + * object->ref_count > 0, then v_writecount cannot change + * while the object is locked. */ - if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) { + if (object->ref_count > 1 || vp->v_writecount != -1) { object->ref_count--; VM_OBJECT_WUNLOCK(object); /* vrele may need the vnode lock. */ @@ -514,8 +514,6 @@ VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); } else { - if (object->ref_count == 0) - VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); vput(vp); } Index: sys/vm/vnode_pager.c =================================================================== --- sys/vm/vnode_pager.c +++ sys/vm/vnode_pager.c @@ -325,7 +325,7 @@ ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc"); if (object->un_pager.vnp.writemappings > 0) { object->un_pager.vnp.writemappings = 0; - VOP_ADD_WRITECOUNT(vp, -1); + VOP_ADD_WRITECOUNT_SUCCEED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } @@ -1513,13 +1513,13 @@ object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start; vp = object->handle; if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) { - ASSERT_VOP_ELOCKED(vp, "v_writecount inc"); - VOP_ADD_WRITECOUNT(vp, 1); + ASSERT_VOP_LOCKED(vp, "v_writecount inc"); + VOP_ADD_WRITECOUNT_SUCCEED(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) { - ASSERT_VOP_ELOCKED(vp, "v_writecount dec"); - VOP_ADD_WRITECOUNT(vp, -1); + ASSERT_VOP_LOCKED(vp, "v_writecount dec"); + VOP_ADD_WRITECOUNT_SUCCEED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } @@ -1561,7 +1561,7 @@ VM_OBJECT_WUNLOCK(object); mp = NULL; vn_start_write(vp, &mp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(vp, LK_SHARED | LK_RETRY); /* * Decrement the object's writemappings, by swapping the start