Index: head/sys/coda/coda_vnops.c =================================================================== --- head/sys/coda/coda_vnops.c (revision 169670) +++ head/sys/coda/coda_vnops.c (revision 169671) @@ -1,1837 +1,1837 @@ /*- * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ */ /* * Mach Operating System * Copyright (c) 1990 Carnegie-Mellon University * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda filesystem at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * These flags select various performance enhancements. */ int coda_attr_cache = 1; /* Set to cache attributes in the kernel */ int coda_symlink_cache = 1; /* Set to cache symbolic link information */ int coda_access_cache = 1; /* Set to handle some access checks directly */ /* structure to keep track of vfs calls */ struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE]; #define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++) #define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++) /* What we are delaying for in printf */ int coda_printf_delay = 0; /* in microseconds */ int coda_vnop_print_entry = 0; static int coda_lockdebug = 0; /* * Some NetBSD details: * * coda_start is called at the end of the mount syscall. * coda_init is called at boot time. 
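 *
 * The MARK_* counters above are kept per operation in coda_vnodeopstats[]
 * and bracket almost every vnode op in this file in the same way.  A rough
 * sketch of that pattern (coda_fooop, CODA_FOO_STATS and venus_foo are
 * illustrative names only, not entry points that exist in this file):
 *
 *	static int
 *	coda_fooop(struct vop_foo_args *ap)
 *	{
 *		int error;
 *
 *		MARK_ENTRY(CODA_FOO_STATS);
 *		if (IS_CTL_VP(ap->a_vp)) {
 *			MARK_INT_SAT(CODA_FOO_STATS);
 *			return (0);
 *		}
 *		error = venus_foo(vtomi(ap->a_vp), ...);
 *		if (error)
 *			MARK_INT_FAIL(CODA_FOO_STATS);
 *		else
 *			MARK_INT_SAT(CODA_FOO_STATS);
 *		return (error);
 *	}
 *
 * MARK_INT_GEN() is reserved for work the module generates on its own
 * behalf, such as the internal opens and closes done by coda_rdwr() and
 * coda_readdir().
 *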
*/ #define ENTRY if(coda_vnop_print_entry) myprintf(("Entered %s\n",__func__)) /* Definition of the vnode operation vector */ struct vop_vector coda_vnodeops = { .vop_default = VOP_PANIC, .vop_lookup = coda_lookup, /* lookup */ .vop_create = coda_create, /* create */ .vop_mknod = VOP_PANIC, /* mknod */ .vop_open = coda_open, /* open */ .vop_close = coda_close, /* close */ .vop_access = coda_access, /* access */ .vop_getattr = coda_getattr, /* getattr */ .vop_setattr = coda_setattr, /* setattr */ .vop_read = coda_read, /* read */ .vop_write = coda_write, /* write */ .vop_ioctl = coda_ioctl, /* ioctl */ .vop_fsync = coda_fsync, /* fsync */ .vop_remove = coda_remove, /* remove */ .vop_link = coda_link, /* link */ .vop_rename = coda_rename, /* rename */ .vop_mkdir = coda_mkdir, /* mkdir */ .vop_rmdir = coda_rmdir, /* rmdir */ .vop_symlink = coda_symlink, /* symlink */ .vop_readdir = coda_readdir, /* readdir */ .vop_readlink = coda_readlink, /* readlink */ .vop_inactive = coda_inactive, /* inactive */ .vop_reclaim = coda_reclaim, /* reclaim */ - ._vop_lock = coda_lock, /* lock */ + .vop_lock1 = coda_lock, /* lock */ .vop_unlock = coda_unlock, /* unlock */ .vop_bmap = coda_bmap, /* bmap */ .vop_print = VOP_PANIC, /* print */ .vop_islocked = coda_islocked, /* islocked */ .vop_pathconf = coda_pathconf, /* pathconf */ .vop_advlock = VOP_NULL, /* advlock */ .vop_lease = VOP_NULL, /* lease */ .vop_poll = vop_stdpoll, .vop_getpages = vop_stdgetpages, /* pager intf.*/ .vop_putpages = vop_stdputpages, /* pager intf.*/ .vop_getwritemount = vop_stdgetwritemount, #if 0 missing .vop_cachedlookup = ufs_lookup, .vop_whiteout = ufs_whiteout, #endif }; /* A generic do-nothing. For lease_check, advlock */ int coda_vop_nop(void *anon) { struct vnodeop_desc **desc = (struct vnodeop_desc **)anon; if (codadebug) { myprintf(("Vnode operation %s called, but unsupported\n", (*desc)->vdesc_name)); } return (0); } int coda_vnodeopstats_init(void) { register int i; for(i=0;ia_vp); struct cnode *cp = VTOC(*vpp); int flag = ap->a_mode & (~O_EXCL); struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; struct vnode *vp; struct cdev *dev; ino_t inode; MARK_ENTRY(CODA_OPEN_STATS); /* Check for open of control file. */ if (IS_CTL_VP(*vpp)) { /* XXX */ /* if (WRITEABLE(flag)) */ if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) { MARK_INT_FAIL(CODA_OPEN_STATS); return(EACCES); } MARK_INT_SAT(CODA_OPEN_STATS); return(0); } error = venus_open(vtomi((*vpp)), &cp->c_fid, flag, cred, td->td_proc, &dev, &inode); if (error) return (error); if (!error) { CODADEBUG( CODA_OPEN,myprintf(("open: dev %#lx inode %lu result %d\n", (u_long)dev2udev(dev), (u_long)inode, error)); ) } /* Translate the pair for the cache file into an inode pointer. */ error = coda_grab_vnode(dev, inode, &vp); if (error) return (error); /* We get the vnode back locked. Needs unlocked */ VOP_UNLOCK(vp, 0, td); /* Keep a reference until the close comes in. */ vref(*vpp); /* Save the vnode pointer for the cache file. */ if (cp->c_ovp == NULL) { cp->c_ovp = vp; } else { if (cp->c_ovp != vp) panic("coda_open: cp->c_ovp != ITOV(ip)"); } cp->c_ocount++; /* Flush the attribute cached if writing the file. */ if (flag & FWRITE) { cp->c_owrite++; cp->c_flags &= ~C_VATTR; } /* Save the pair for the cache file to speed up subsequent page_read's. */ cp->c_device = dev; cp->c_inode = inode; /* Open the cache file. 
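 *
 * For orientation, the open path above boils down to the following steps;
 * coda_open_container() is not a real helper in this file, it merely
 * condenses what coda_open() does around this comment:
 *
 *	static int
 *	coda_open_container(struct vnode **vpp, struct cnode *cp, int flag,
 *	    struct ucred *cred, struct thread *td)
 *	{
 *		struct cdev *dev;
 *		struct vnode *vp;
 *		ino_t inode;
 *		int error;
 *
 *		error = venus_open(vtomi(*vpp), &cp->c_fid, flag, cred,
 *		    td->td_proc, &dev, &inode);
 *		if (error)
 *			return (error);
 *		error = coda_grab_vnode(dev, inode, &vp);
 *		if (error)
 *			return (error);
 *		VOP_UNLOCK(vp, 0, td);
 *		vref(*vpp);
 *		cp->c_ovp = vp;
 *		cp->c_ocount++;
 *		cp->c_device = dev;
 *		cp->c_inode = inode;
 *		return (VOP_OPEN(vp, flag, cred, td, -1));
 *	}
 *
 * Venus names the local container file by (device, inode); coda_grab_vnode()
 * turns that pair back into a vnode, which is kept in cp->c_ovp and used to
 * satisfy later reads, writes and readdirs.
 *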
*/ error = VOP_OPEN(vp, flag, cred, td, -1); if (error) { printf("coda_open: VOP_OPEN on container failed %d\n", error); return (error); } /* grab (above) does this when it calls newvnode unless it's in the cache*/ return(error); } /* * Close the cache file used for I/O and notify Venus. */ int coda_close(struct vop_close_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_CLOSE_STATS); /* Check for close of control file. */ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_CLOSE_STATS); return(0); } if (IS_UNMOUNTING(cp)) { if (cp->c_ovp) { #ifdef CODA_VERBOSE printf("coda_close: destroying container ref %d, ufs vp %p of vp %p/cp %p\n", vrefcnt(vp), cp->c_ovp, vp, cp); #endif #ifdef hmm vgone(cp->c_ovp); #else VOP_CLOSE(cp->c_ovp, flag, cred, td); /* Do errors matter here? */ vrele(cp->c_ovp); #endif } else { #ifdef CODA_VERBOSE printf("coda_close: NO container vp %p/cp %p\n", vp, cp); #endif } return ENODEV; } else { VOP_CLOSE(cp->c_ovp, flag, cred, td); /* Do errors matter here? */ vrele(cp->c_ovp); } if (--cp->c_ocount == 0) cp->c_ovp = NULL; if (flag & FWRITE) /* file was opened for write */ --cp->c_owrite; error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, td->td_proc); vrele(CTOV(cp)); CODADEBUG(CODA_CLOSE, myprintf(("close: result %d\n",error)); ) return(error); } int coda_read(struct vop_read_args *ap) { ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_td)); } int coda_write(struct vop_write_args *ap) { ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_td)); } int coda_rdwr(struct vnode *vp, struct uio *uiop, enum uio_rw rw, int ioflag, struct ucred *cred, struct thread *td) { /* upcall decl */ /* NOTE: container file operation!!! */ /* locals */ struct cnode *cp = VTOC(vp); struct vnode *cfvp = cp->c_ovp; struct proc *p = td->td_proc; struct thread *ltd = td; int igot_internally = 0; int opened_internally = 0; int error = 0; int iscore = 0; MARK_ENTRY(CODA_RDWR_STATS); CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %d, %lld, %d)\n", rw, (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for rdwr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_RDWR_STATS); return(EINVAL); } /* * If file is not already open this must be a page * {read,write} request. Iget the cache file's inode * pointer if we still have its pair. * Otherwise, we must do an internal open to derive the * pair. */ if (cfvp == NULL) { /* * If we're dumping core, do the internal open. Otherwise * venus won't have the correct size of the core when * it's completely written. */ if (p) { PROC_LOCK(p); iscore = (p->p_acflag & ACORE); PROC_UNLOCK(p); } else ltd = curthread; if (cp->c_inode != 0 && !iscore) { igot_internally = 1; error = coda_grab_vnode(cp->c_device, cp->c_inode, &cfvp); if (error) { MARK_INT_FAIL(CODA_RDWR_STATS); return(error); } /* * We get the vnode back locked by curthread in both Mach and * NetBSD. Needs unlocked */ VOP_UNLOCK(cfvp, 0, ltd); } else { opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, td, -1); printf("coda_rdwr: Internally Opening %p\n", vp); if (error) { printf("coda_rdwr: VOP_OPEN on container failed %d\n", error); return (error); } cfvp = cp->c_ovp; } } /* Have UFS handle the call. 
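 *
 * Both coda_rdwr() and coda_readdir() fall back on the same "internal open"
 * bracket when no container vnode is attached yet: open the Coda vnode just
 * for this call, let the container handle the I/O, then close it again.  A
 * condensed sketch of that bracket for a read (not a separate function in
 * this file):
 *
 *	opened_internally = 0;
 *	if (cp->c_ovp == NULL) {
 *		opened_internally = 1;
 *		error = VOP_OPEN(vp, FREAD, cred, td, -1);
 *		if (error)
 *			return (error);
 *	}
 *	error = VOP_READ(cp->c_ovp, uiop, ioflag, cred);
 *	if (opened_internally)
 *		(void)VOP_CLOSE(vp, FREAD, cred, td);
 *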
*/ CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = %s, refcnt = %d\n", coda_f2s(&cp->c_fid), CTOV(cp)->v_usecount)); ) if (rw == UIO_READ) { error = VOP_READ(cfvp, uiop, ioflag, cred); } else { error = VOP_WRITE(cfvp, uiop, ioflag, cred); /* ufs_write updates the vnode_pager_setsize for the vnode/object */ { struct vattr attr; if (VOP_GETATTR(cfvp, &attr, cred, td) == 0) { vnode_pager_setsize(vp, attr.va_size); } } } if (error) MARK_INT_FAIL(CODA_RDWR_STATS); else MARK_INT_SAT(CODA_RDWR_STATS); /* Do an internal close if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, td); } /* Invalidate cached attributes if writing. */ if (rw == UIO_WRITE) cp->c_flags &= ~C_VATTR; return(error); } int coda_ioctl(struct vop_ioctl_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; int com = ap->a_command; caddr_t data = ap->a_data; int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; struct vnode *tvp; struct nameidata ndp; struct PioctlData *iap = (struct PioctlData *)data; MARK_ENTRY(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));) /* Don't check for operation on a dying object, for ctlvp it shouldn't matter */ /* Must be control object to succeed. */ if (!IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: vp != ctlvp"));) return (EOPNOTSUPP); } /* Look up the pathname. */ /* Should we use the name cache here? It would get it from lookupname sooner or later anyway, right? */ NDINIT(&ndp, LOOKUP, (iap->follow ? FOLLOW : NOFOLLOW), UIO_USERSPACE, iap->path, td); error = namei(&ndp); tvp = ndp.ni_vp; if (error) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: lookup returns %d\n", error));) return(error); } /* * Make sure this is a coda style cnode, but it may be a * different vfsp */ if (tvp->v_op != &coda_vnodeops) { vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: %s not a coda object\n", iap->path));) return(EINVAL); } if (iap->vi.in_size > VC_MAXDATASIZE) { NDFREE(&ndp, 0); return(EINVAL); } error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data, cred, td->td_proc); if (error) MARK_INT_FAIL(CODA_IOCTL_STATS); else CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); ) vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); return(error); } /* * To reduce the cost of a user-level venus;we cache attributes in * the kernel. Each cnode has storage allocated for an attribute. If * c_vattr is valid, return a reference to it. Otherwise, get the * attributes from venus and store them in the cnode. There is some * question if this method is a security leak. But I think that in * order to make this call, the user must have done a lookup and * opened the file, and therefore should already have access. */ int coda_getattr(struct vop_getattr_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_GETATTR_STATS); if (IS_UNMOUNTING(cp)) return ENODEV; /* Check for getattr of control object. 
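 *
 * The attribute cache works as follows: a successful venus_getattr() result
 * is copied into cp->c_vattr and C_VATTR is set, but only while the file is
 * not open for writing and coda_attr_cache is enabled.  Writes, setattr and
 * namespace changes on the parent directory clear C_VATTR again.  Condensed
 * from the code below (not a separate function):
 *
 *	if (VALID_VATTR(cp)) {
 *		*vap = cp->c_vattr;
 *		return (0);
 *	}
 *	error = venus_getattr(vtomi(vp), &cp->c_fid, cred, td->td_proc, vap);
 *	if (error == 0 && cp->c_owrite == 0 && coda_attr_cache) {
 *		cp->c_vattr = *vap;
 *		cp->c_flags |= C_VATTR;
 *	}
 *
 * and invalidation is simply:
 *
 *	cp->c_flags &= ~C_VATTR;
 *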
*/ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_GETATTR_STATS); return(ENOENT); } /* Check to see if the attributes have already been cached */ if (VALID_VATTR(cp)) { CODADEBUG(CODA_GETATTR, { myprintf(("attr cache hit: %s\n", coda_f2s(&cp->c_fid)));}); CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(&cp->c_vattr); ); *vap = cp->c_vattr; MARK_INT_SAT(CODA_GETATTR_STATS); return(0); } error = venus_getattr(vtomi(vp), &cp->c_fid, cred, td->td_proc, vap); if (!error) { CODADEBUG(CODA_GETATTR, myprintf(("getattr miss %s: result %d\n", coda_f2s(&cp->c_fid), error)); ) CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(vap); ); { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } /* If not open for write, store attributes in cnode */ if ((cp->c_owrite == 0) && (coda_attr_cache)) { cp->c_vattr = *vap; cp->c_flags |= C_VATTR; } } return(error); } int coda_setattr(struct vop_setattr_args *ap) { /* true args */ register struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_SETATTR_STATS); /* Check for setattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_SETATTR_STATS); return(ENOENT); } if (codadebug & CODADBGMSK(CODA_SETATTR)) { print_vattr(vap); } error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, td->td_proc); if (!error) cp->c_flags &= ~C_VATTR; { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (size != VNOVAL && convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } CODADEBUG(CODA_SETATTR, myprintf(("setattr %d\n", error)); ) return(error); } int coda_access(struct vop_access_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int mode = ap->a_mode; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_ACCESS_STATS); /* Check for access of control object. Only read access is allowed on it. */ if (IS_CTL_VP(vp)) { /* bogus hack - all will be marked as successes */ MARK_INT_SAT(CODA_ACCESS_STATS); return(((mode & VREAD) && !(mode & (VWRITE | VEXEC))) ? 0 : EACCES); } /* * if the file is a directory, and we are checking exec (eg lookup) * access, and the file is in the namecache, then the user must have * lookup access to it. */ if (coda_access_cache) { if ((vp->v_type == VDIR) && (mode & VEXEC)) { if (coda_nc_lookup(cp, ".", 1, cred)) { MARK_INT_SAT(CODA_ACCESS_STATS); return(0); /* it was in the cache */ } } } error = venus_access(vtomi(vp), &cp->c_fid, mode, cred, td->td_proc); return(error); } int coda_readlink(struct vop_readlink_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_uio->uio_td; /* locals */ int error; char *str; int len; MARK_ENTRY(CODA_READLINK_STATS); /* Check for readlink of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READLINK_STATS); return(ENOENT); } if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */ uiop->uio_rw = UIO_READ; error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop); if (error) MARK_INT_FAIL(CODA_READLINK_STATS); else MARK_INT_SAT(CODA_READLINK_STATS); return(error); } error = venus_readlink(vtomi(vp), &cp->c_fid, cred, td != NULL ? 
td->td_proc : NULL, &str, &len); if (!error) { uiop->uio_rw = UIO_READ; error = uiomove(str, len, uiop); if (coda_symlink_cache) { cp->c_symlink = str; cp->c_symlen = len; cp->c_flags |= C_SYMLINK; } else CODA_FREE(str, len); } CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));) return(error); } int coda_fsync(struct vop_fsync_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct thread *td = ap->a_td; /* locals */ struct vnode *convp = cp->c_ovp; int error; MARK_ENTRY(CODA_FSYNC_STATS); /* Check for fsync on an unmounting object */ /* The NetBSD kernel, in it's infinite wisdom, can try to fsync * after an unmount has been initiated. This is a Bad Thing, * which we have to avoid. Not a legitimate failure for stats. */ if (IS_UNMOUNTING(cp)) { return(ENODEV); } /* Check for fsync of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_FSYNC_STATS); return(0); } if (convp) VOP_FSYNC(convp, MNT_WAIT, td); /* * We see fsyncs with usecount == 1 then usecount == 0. * For now we ignore them. */ /* VI_LOCK(vp); if (!vp->v_usecount) { printf("coda_fsync on vnode %p with %d usecount. c_flags = %x (%x)\n", vp, vp->v_usecount, cp->c_flags, cp->c_flags&C_PURGING); } VI_UNLOCK(vp); */ /* * We can expect fsync on any vnode at all if venus is pruging it. * Venus can't very well answer the fsync request, now can it? * Hopefully, it won't have to, because hopefully, venus preserves * the (possibly untrue) invariant that it never purges an open * vnode. Hopefully. */ if (cp->c_flags & C_PURGING) { return(0); } /* needs research */ return 0; error = venus_fsync(vtomi(vp), &cp->c_fid, td->td_proc); CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); ); return(error); } int coda_inactive(struct vop_inactive_args *ap) { /* XXX - at the moment, inactive doesn't look at cred, and doesn't have a proc pointer. Oops. */ /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct ucred *cred __attribute__((unused)) = NULL; struct thread *td __attribute__((unused)) = curthread; /* upcall decl */ /* locals */ /* We don't need to send inactive to venus - DCS */ MARK_ENTRY(CODA_INACTIVE_STATS); if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_INACTIVE_STATS); return 0; } CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %s, vfsp %p\n", coda_f2s(&cp->c_fid), vp->v_mount));) /* If an array has been allocated to hold the symlink, deallocate it */ if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { if (cp->c_symlink == NULL) panic("coda_inactive: null symlink pointer in cnode"); CODA_FREE(cp->c_symlink, cp->c_symlen); cp->c_flags &= ~C_SYMLINK; cp->c_symlen = 0; } /* Remove it from the table so it can't be found. */ coda_unsave(cp); if ((struct coda_mntinfo *)(vp->v_mount->mnt_data) == NULL) { myprintf(("Help! vfsp->vfs_data was NULL, but vnode %p wasn't dying\n", vp)); panic("badness in coda_inactive\n"); } if (IS_UNMOUNTING(cp)) { #ifdef DEBUG printf("coda_inactive: IS_UNMOUNTING use %d: vp %p, cp %p\n", vrefcnt(vp), vp, cp); if (cp->c_ovp != NULL) printf("coda_inactive: cp->ovp != NULL use %d: vp %p, cp %p\n", vrefcnt(vp), vp, cp); #endif } else { #ifdef OLD_DIAGNOSTIC if (vrefcnt(CTOV(cp))) { panic("coda_inactive: nonzero reference count"); } if (cp->c_ovp != NULL) { panic("coda_inactive: cp->ovp != NULL"); } #endif vgone(vp); } MARK_INT_SAT(CODA_INACTIVE_STATS); return(0); } /* * Remote filesystem operations having to do with directory manipulation. 
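 *
 * The directory operations below share one protocol with the Coda name
 * cache: coda_nc_lookup() is consulted before any upcall, coda_nc_enter()
 * records successful lookups, creates and mkdirs, and coda_nc_zapfile() or
 * coda_nc_zapParentfid() evict entries on remove, rename and rmdir.  The
 * lookup fast path, condensed from coda_lookup() below:
 *
 *	cp = coda_nc_lookup(dcp, nm, len, cred);
 *	if (cp != NULL) {
 *		*vpp = CTOV(cp);
 *		vref(*vpp);
 *	} else {
 *		error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred,
 *		    td->td_proc, &VFid, &vtype);
 *		if (error == 0) {
 *			cp = make_coda_node(&VFid, dvp->v_mount, vtype);
 *			*vpp = CTOV(cp);
 *			if ((vtype & CODA_NOCACHE) == 0)
 *				coda_nc_enter(VTOC(dvp), nm, len, cred,
 *				    VTOC(*vpp));
 *		}
 *	}
 *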
*/ /* * It appears that in NetBSD, lookup is supposed to return the vnode locked */ int coda_lookup(struct vop_lookup_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct vnode **vpp = ap->a_vpp; /* * It looks as though ap->a_cnp->ni_cnd->cn_nameptr holds the rest * of the string to xlate, and that we must try to get at least * ap->a_cnp->ni_cnd->cn_namelen of those characters to macth. I * could be wrong. */ struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; CodaFid VFid; int vtype; int error = 0; MARK_ENTRY(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup: %s in %s\n", nm, coda_f2s(&dcp->c_fid)));); /* Check for lookup of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = coda_ctlvp; vref(*vpp); MARK_INT_SAT(CODA_LOOKUP_STATS); goto exit; } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("name too long: lookup, %s (%s)\n", coda_f2s(&dcp->c_fid), nm));); *vpp = (struct vnode *)0; error = EINVAL; goto exit; } /* First try to look the file up in the cfs name cache */ /* lock the parent vnode? */ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) { *vpp = CTOV(cp); vref(*vpp); CODADEBUG(CODA_LOOKUP, myprintf(("lookup result %d vpp %p\n",error,*vpp));) } else { /* The name wasn't cached, so we need to contact Venus */ error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, td->td_proc, &VFid, &vtype); if (error) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup error on %s (%s)%d\n", coda_f2s(&dcp->c_fid), nm, error));) *vpp = (struct vnode *)0; } else { MARK_INT_SAT(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup: %s type %o result %d\n", coda_f2s(&VFid), vtype, error)); ) cp = make_coda_node(&VFid, dvp->v_mount, vtype); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache only if the top bit isn't set */ /* And don't enter a new vnode for an invalid one! */ if (!(vtype & CODA_NOCACHE)) coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); } } exit: /* * If we are creating, and this was the last name to be looked up, * and the error was ENOENT, then there really shouldn't be an * error and we can make the leaf NULL and return success. Since * this is supposed to work under Mach as well as NetBSD, we're * leaving this fn wrapped. We also must tell lookup/namei that * we need to save the last component of the name. (Create will * have to free the name buffer later...lucky us...) */ if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && (cnp->cn_flags & ISLASTCN) && (error == ENOENT)) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; *ap->a_vpp = NULL; } /* * If we are removing, and we are at the last element, and we * found it, then we need to keep the name around so that the * removal will go ahead as planned. Unfortunately, this will * probably also lock the to-be-removed vnode, which may or may * not be a good idea. I'll have to look at the bits of * coda_remove to make sure. We'll only save the name if we did in * fact find the name, otherwise coda_remove won't have a chance * to free the pathname. */ if ((cnp->cn_nameiop == DELETE) && (cnp->cn_flags & ISLASTCN) && !error) { cnp->cn_flags |= SAVENAME; } /* * If the lookup went well, we need to (potentially?) unlock the * parent, and lock the child. 
We are only responsible for * checking to see if the parent is supposed to be unlocked before * we return. We must always lock the child (provided there is * one, and (the parent isn't locked or it isn't the same as the * parent.) Simple, huh? We can never leave the parent locked unless * we are ISLASTCN */ if (!error || (error == EJUSTRETURN)) { if (cnp->cn_flags & ISDOTDOT) { if ((error = VOP_UNLOCK(dvp, 0, td))) { return error; } /* * The parent is unlocked. As long as there is a child, * lock it without bothering to check anything else. */ if (*ap->a_vpp) { if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) { vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE, td); return (error); } } vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE, td); } else { /* The parent is locked, and may be the same as the child */ if (*ap->a_vpp && (*ap->a_vpp != dvp)) { /* Different, go ahead and lock it. */ if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) { return (error); } } } } else { /* If the lookup failed, we need to ensure that the leaf is NULL */ /* Don't change any locking? */ *ap->a_vpp = NULL; } return(error); } /*ARGSUSED*/ int coda_create(struct vop_create_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct vattr *va = ap->a_vap; int exclusive = 1; int mode = ap->a_vap->va_mode; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; CodaFid VFid; struct vattr attr; MARK_ENTRY(CODA_CREATE_STATS); /* All creates are exclusive XXX */ /* I'm assuming the 'mode' argument is the file mode bits XXX */ /* Check for create of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_CREATE_STATS); return(EACCES); } error = venus_create(vtomi(dvp), &dcp->c_fid, nm, len, exclusive, mode, va, cred, td->td_proc, &VFid, &attr); if (!error) { /* If this is an exclusive create, panic if the file already exists. */ /* Venus should have detected the file and reported EEXIST. */ if ((exclusive == 1) && (coda_find(&VFid) != NULL)) panic("cnode existed for newly created file!"); cp = make_coda_node(&VFid, dvp->v_mount, attr.va_type); *vpp = CTOV(cp); /* Update va to reflect the new attributes. 
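 *
 * (Locking in coda_lookup() above, summarized.)  When a child vnode is
 * returned it is returned locked; the parent dvp stays locked except in the
 * ISDOTDOT case, where it is unlocked around the child lock and then
 * re-locked.  Condensed from the tail of coda_lookup():
 *
 *	if (cnp->cn_flags & ISDOTDOT) {
 *		VOP_UNLOCK(dvp, 0, td);
 *		if (*ap->a_vpp)
 *			error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td);
 *		vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE, td);
 *	} else if (*ap->a_vpp && *ap->a_vpp != dvp) {
 *		error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td);
 *	}
 *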
*/ (*va) = attr; /* Update the attribute cache and mark it as valid */ if (coda_attr_cache) { VTOC(*vpp)->c_vattr = attr; VTOC(*vpp)->c_flags |= C_VATTR; } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); CODADEBUG(CODA_CREATE, myprintf(("create: %s, result %d\n", coda_f2s(&VFid), error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_CREATE, myprintf(("create error %d\n", error));) } if (!error) { if (cnp->cn_flags & LOCKLEAF) { if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) { printf("coda_create: "); panic("unlocked parent but couldn't lock child"); } } #ifdef OLD_DIAGNOSTIC else { printf("coda_create: LOCKLEAF not set!\n"); } #endif } return(error); } int coda_remove(struct vop_remove_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *cp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *tp; MARK_ENTRY(CODA_REMOVE_STATS); CODADEBUG(CODA_REMOVE, myprintf(("remove: %s in %s\n", nm, coda_f2s(&cp->c_fid)));); /* Remove the file's entry from the CODA Name Cache */ /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* I'm gonna go out on a limb here. If a file and a hardlink to it * exist, and one is removed, the link count on the other will be * off by 1. We could either invalidate the attrs if cached, or * fix them. I'll try to fix them. DCS 11/8/94 */ tp = coda_nc_lookup(VTOC(dvp), nm, len, cred); if (tp) { if (VALID_VATTR(tp)) { /* If attrs are cached */ if (tp->c_vattr.va_nlink > 1) { /* If it's a hard link */ tp->c_vattr.va_nlink--; } } coda_nc_zapfile(VTOC(dvp), nm, len); /* No need to flush it if it doesn't exist! */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* Check for remove of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_REMOVE_STATS); return(ENOENT); } error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, td->td_proc); CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); ) return(error); } int coda_link(struct vop_link_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vnode *tdvp = ap->a_tdvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; MARK_ENTRY(CODA_LINK_STATS); if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("nb_link: vp fid: %s\n", coda_f2s(&cp->c_fid))); myprintf(("nb_link: tdvp fid: %s)\n", coda_f2s(&tdcp->c_fid))); } if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("link: vp fid: %s\n", coda_f2s(&cp->c_fid))); myprintf(("link: tdvp fid: %s\n", coda_f2s(&tdcp->c_fid))); } /* Check for link to/from control object. 
*/ if (IS_CTL_NAME(tdvp, nm, len) || IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_LINK_STATS); return(EACCES); } error = venus_link(vtomi(vp), &cp->c_fid, &tdcp->c_fid, nm, len, cred, td->td_proc); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(tdvp)->c_flags &= ~C_VATTR; VTOC(vp)->c_flags &= ~C_VATTR; CODADEBUG(CODA_LINK, myprintf(("in link result %d\n",error)); ) return(error); } int coda_rename(struct vop_rename_args *ap) { /* true args */ struct vnode *odvp = ap->a_fdvp; struct cnode *odcp = VTOC(odvp); struct componentname *fcnp = ap->a_fcnp; struct vnode *ndvp = ap->a_tdvp; struct cnode *ndcp = VTOC(ndvp); struct componentname *tcnp = ap->a_tcnp; struct ucred *cred = fcnp->cn_cred; struct thread *td = fcnp->cn_thread; /* true args */ int error; const char *fnm = fcnp->cn_nameptr; int flen = fcnp->cn_namelen; const char *tnm = tcnp->cn_nameptr; int tlen = tcnp->cn_namelen; MARK_ENTRY(CODA_RENAME_STATS); /* Hmmm. The vnodes are already looked up. Perhaps they are locked? This could be Bad. XXX */ #ifdef OLD_DIAGNOSTIC if ((fcnp->cn_cred != tcnp->cn_cred) || (fcnp->cn_thread != tcnp->cn_thread)) { panic("coda_rename: component names don't agree"); } #endif /* Check for rename involving control object. */ if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) { MARK_INT_FAIL(CODA_RENAME_STATS); return(EACCES); } /* Problem with moving directories -- need to flush entry for .. */ if (odvp != ndvp) { struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred); if (ovcp) { struct vnode *ovp = CTOV(ovcp); if ((ovp) && (ovp->v_type == VDIR)) /* If it's a directory */ coda_nc_zapfile(VTOC(ovp),"..", 2); } } /* Remove the entries for both source and target files */ coda_nc_zapfile(VTOC(odvp), fnm, flen); coda_nc_zapfile(VTOC(ndvp), tnm, tlen); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(odvp)->c_flags &= ~C_VATTR; VTOC(ndvp)->c_flags &= ~C_VATTR; if (flen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } if (tlen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, td->td_proc); exit: CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));) /* XXX - do we need to call cache pureg on the moved vnode? */ cache_purge(ap->a_fvp); /* Release parents first, then children. */ vrele(odvp); if (ap->a_tvp) { if (ap->a_tvp == ndvp) vrele(ndvp); else vput(ndvp); vput(ap->a_tvp); } else vput(ndvp); vrele(ap->a_fvp); return(error); } int coda_mkdir(struct vop_mkdir_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; register struct vattr *va = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; CodaFid VFid; struct vattr ova; MARK_ENTRY(CODA_MKDIR_STATS); /* Check for mkdir of target object. 
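 *
 * (Reference handling in coda_rename() above, summarized.)  As the
 * vput()/vrele() split in its epilogue suggests, VOP_RENAME is handed the
 * source side merely referenced and the target side locked, and the
 * implementation must drop all of them before returning:
 *
 *	vrele(ap->a_fdvp);
 *	if (ap->a_tvp != NULL) {
 *		if (ap->a_tvp == ap->a_tdvp)
 *			vrele(ap->a_tdvp);
 *		else
 *			vput(ap->a_tdvp);
 *		vput(ap->a_tvp);
 *	} else
 *		vput(ap->a_tdvp);
 *	vrele(ap->a_fvp);
 *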
*/ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } if (len+1 > CODA_MAXNAMLEN) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, td->td_proc, &VFid, &ova); if (!error) { if (coda_find(&VFid) != NULL) panic("cnode existed for newly created directory!"); cp = make_coda_node(&VFid, dvp->v_mount, va->va_type); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); /* as a side effect, enter "." and ".." for the directory */ coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp)); coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp)); if (coda_attr_cache) { VTOC(*vpp)->c_vattr = ova; /* update the attr cache */ VTOC(*vpp)->c_flags |= C_VATTR; /* Valid attributes in cnode */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; CODADEBUG( CODA_MKDIR, myprintf(("mkdir: %s result %d\n", coda_f2s(&VFid), error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_MKDIR, myprintf(("mkdir error %d\n",error));) } return(error); } int coda_rmdir(struct vop_rmdir_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* true args */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; MARK_ENTRY(CODA_RMDIR_STATS); /* Check for rmdir of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_RMDIR_STATS); return(ENOENT); } /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* * As a side effect of the rmdir, remove any entries for children of * the directory, especially "." and "..". */ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL); /* Remove the file's entry from the CODA Name Cache */ coda_nc_zapfile(dcp, nm, len); /* Invalidate the parent's attr cache, the modification time has changed */ dcp->c_flags &= ~C_VATTR; error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, td->td_proc); CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); ) return(error); } int coda_symlink(struct vop_symlink_args *ap) { /* true args */ struct vnode *tdvp = ap->a_dvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct vattr *tva = ap->a_vap; char *path = ap->a_target; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; struct vnode **vpp = ap->a_vpp; /* locals */ int error; /* * XXX I'm assuming the following things about coda_symlink's * arguments: * t(foo) is the new name/parent/etc being created. * lname is the contents of the new symlink. */ char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; int plen = strlen(path); /* * Here's the strategy for the moment: perform the symlink, then * do a lookup to grab the resulting vnode. I know this requires * two communications with Venus for a new sybolic link, but * that's the way the ball bounces. I don't yet want to change * the way the Mach symlink works. When Mach support is * deprecated, we should change symlink so that the common case * returns the resultant vnode in a vpp argument. */ MARK_ENTRY(CODA_SYMLINK_STATS); /* Check for symlink of control object. 
*/ if (IS_CTL_NAME(tdvp, nm, len)) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EACCES); } if (plen+1 > CODA_MAXPATHLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EINVAL); } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); error = EINVAL; goto exit; } error = venus_symlink(vtomi(tdvp), &tdcp->c_fid, path, plen, nm, len, tva, cred, td->td_proc); /* Invalidate the parent's attr cache, the modification time has changed */ tdcp->c_flags &= ~C_VATTR; if (error == 0) error = VOP_LOOKUP(tdvp, vpp, cnp); exit: CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); ) return(error); } /* * Read directory entries. */ int coda_readdir(struct vop_readdir_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; int *eofflag = ap->a_eofflag; u_long **cookies = ap->a_cookies; int *ncookies = ap->a_ncookies; struct thread *td = ap->a_uio->uio_td; /* upcall decl */ /* locals */ int error = 0; MARK_ENTRY(CODA_READDIR_STATS); CODADEBUG(CODA_READDIR, myprintf(("coda_readdir(%p, %d, %lld, %d)\n", (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for readdir of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READDIR_STATS); return(ENOENT); } { /* If directory is not already open do an "internal open" on it. */ int opened_internally = 0; if (cp->c_ovp == NULL) { opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, FREAD, cred, td, -1); printf("coda_readdir: Internally Opening %p\n", vp); if (error) { printf("coda_readdir: VOP_OPEN on container failed %d\n", error); return (error); } } /* Have UFS handle the call. */ CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = %s, refcnt = %d\n", coda_f2s(&cp->c_fid), vp->v_usecount)); ) error = VOP_READDIR(cp->c_ovp, uiop, cred, eofflag, ncookies, cookies); if (error) MARK_INT_FAIL(CODA_READDIR_STATS); else MARK_INT_SAT(CODA_READDIR_STATS); /* Do an "internal close" if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, FREAD, cred, td); } } return(error); } /* * Convert from filesystem blocks to device blocks */ int coda_bmap(struct vop_bmap_args *ap) { /* XXX on the global proc */ /* true args */ struct vnode *vp __attribute__((unused)) = ap->a_vp; /* file's vnode */ daddr_t bn __attribute__((unused)) = ap->a_bn; /* fs block number */ struct bufobj **bop = ap->a_bop; /* RETURN bufobj of device */ daddr_t *bnp __attribute__((unused)) = ap->a_bnp; /* RETURN device block number */ struct thread *td __attribute__((unused)) = curthread; /* upcall decl */ /* locals */ int ret = 0; struct cnode *cp; cp = VTOC(vp); if (cp->c_ovp) { return EINVAL; ret = VOP_BMAP(cp->c_ovp, bn, bop, bnp, ap->a_runp, ap->a_runb); #if 0 printf("VOP_BMAP(cp->c_ovp %p, bn %p, bop %p, bnp %lld, ap->a_runp %p, ap->a_runb %p) = %d\n", cp->c_ovp, bn, bop, bnp, ap->a_runp, ap->a_runb, ret); #endif return ret; } else { #if 0 printf("coda_bmap: no container\n"); #endif return(EOPNOTSUPP); } } int coda_reclaim(struct vop_reclaim_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ /* * Forced unmount/flush will let vnodes with non zero use be destroyed! 
*/ ENTRY; if (IS_UNMOUNTING(cp)) { #ifdef DEBUG if (VTOC(vp)->c_ovp) { if (IS_UNMOUNTING(cp)) printf("coda_reclaim: c_ovp not void: vp %p, cp %p\n", vp, cp); } #endif } else { #ifdef OLD_DIAGNOSTIC if (vrefcnt(vp) != 0) print("coda_reclaim: pushing active %p\n", vp); if (VTOC(vp)->c_ovp) { panic("coda_reclaim: c_ovp not void"); } #endif } cache_purge(vp); coda_free(VTOC(vp)); vp->v_data = NULL; vnode_destroy_vobject(vp); return (0); } int -coda_lock(struct _vop_lock_args *ap) +coda_lock(struct vop_lock1_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ ENTRY; if ((ap->a_flags & LK_INTERLOCK) == 0) { VI_LOCK(vp); ap->a_flags |= LK_INTERLOCK; } if (coda_lockdebug) { myprintf(("Attempting lock on %s\n", coda_f2s(&cp->c_fid))); } return (vop_stdlock(ap)); } int coda_unlock(struct vop_unlock_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ ENTRY; if (coda_lockdebug) { myprintf(("Attempting unlock on %s\n", coda_f2s(&cp->c_fid))); } return (vop_stdunlock(ap)); } int coda_islocked(struct vop_islocked_args *ap) { /* true args */ ENTRY; return (vop_stdislocked(ap)); } /* How one looks up a vnode given a device/inode pair: */ int coda_grab_vnode(struct cdev *dev, ino_t ino, struct vnode **vpp) { /* This is like VFS_VGET() or igetinode()! */ int error; struct mount *mp; if (!(mp = devtomp(dev))) { myprintf(("coda_grab_vnode: devtomp(%#lx) returns NULL\n", (u_long)dev2udev(dev))); return(ENXIO); } /* XXX - ensure that nonzero-return means failure */ error = VFS_VGET(mp,ino,LK_EXCLUSIVE,vpp); if (error) { myprintf(("coda_grab_vnode: iget/vget(%lx, %lu) returns %p, err %d\n", (u_long)dev2udev(dev), (u_long)ino, (void *)*vpp, error)); return(ENOENT); } return(0); } void print_vattr(struct vattr *attr) { char *typestr; switch (attr->va_type) { case VNON: typestr = "VNON"; break; case VREG: typestr = "VREG"; break; case VDIR: typestr = "VDIR"; break; case VBLK: typestr = "VBLK"; break; case VCHR: typestr = "VCHR"; break; case VLNK: typestr = "VLNK"; break; case VSOCK: typestr = "VSCK"; break; case VFIFO: typestr = "VFFO"; break; case VBAD: typestr = "VBAD"; break; default: typestr = "????"; break; } myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n", typestr, (int)attr->va_mode, (int)attr->va_uid, (int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev)); myprintf((" fileid %d nlink %d size %d blocksize %d bytes %d\n", (int)attr->va_fileid, (int)attr->va_nlink, (int)attr->va_size, (int)attr->va_blocksize,(int)attr->va_bytes)); myprintf((" gen %ld flags %ld vaflags %d\n", attr->va_gen, attr->va_flags, attr->va_vaflags)); myprintf((" atime sec %d nsec %d\n", (int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec)); myprintf((" mtime sec %d nsec %d\n", (int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec)); myprintf((" ctime sec %d nsec %d\n", (int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec)); } /* How to print a ucred */ void print_cred(struct ucred *cred) { int i; myprintf(("ref %d\tuid %d\n",cred->cr_ref,cred->cr_uid)); for (i=0; i < cred->cr_ngroups; i++) myprintf(("\tgroup %d: (%d)\n",i,cred->cr_groups[i])); myprintf(("\n")); } /* * Return a vnode for the given fid. * If no cnode exists for this fid create one and put it * in a table hashed by coda_f2i(). If the cnode for * this fid is already in the table return it (ref count is * incremented by coda_find. The cnode will be flushed from the * table when coda_inactive calls coda_unsave. 
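 *
 * (On the lock methods above.)  This revision tracks the rename of the lock
 * slot in struct vop_vector from ._vop_lock to .vop_lock1, with the matching
 * vop_lock1_args structure and vop_lock1_t typedef; coda_lock() itself only
 * takes the vnode interlock when the caller did not pass LK_INTERLOCK and
 * then hands off to vop_stdlock().  A minimal filesystem using the renamed
 * slot would look roughly like this (examplefs_* names are illustrative
 * only):
 *
 *	static int
 *	examplefs_lock(struct vop_lock1_args *ap)
 *	{
 *
 *		return (vop_stdlock(ap));
 *	}
 *
 *	struct vop_vector examplefs_vnodeops = {
 *		.vop_default	= VOP_PANIC,
 *		.vop_lock1	= examplefs_lock,
 *		.vop_unlock	= vop_stdunlock,
 *		.vop_islocked	= vop_stdislocked,
 *	};
 *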
*/ struct cnode * make_coda_node(CodaFid *fid, struct mount *vfsp, short type) { struct cnode *cp; int err; if ((cp = coda_find(fid)) == NULL) { struct vnode *vp; cp = coda_alloc(); cp->c_fid = *fid; err = getnewvnode("coda", vfsp, &coda_vnodeops, &vp); if (err) { panic("coda: getnewvnode returned error %d\n", err); } err = insmntque1(vp, vfsp, NULL, NULL); /* XXX: Too early for mpsafe fs */ if (err != 0) panic("coda: insmntque failed: error %d", err); vp->v_data = cp; vp->v_type = type; cp->c_vnode = vp; coda_save(cp); } else { vref(CTOV(cp)); } return cp; } int coda_pathconf(struct vop_pathconf_args *ap) { int error; register_t *retval; retval = ap->a_retval; error = 0; switch (ap->a_name) { case _PC_NAME_MAX: *retval = CODA_MAXNAMLEN; break; case _PC_PATH_MAX: *retval = CODA_MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } Index: head/sys/coda/coda_vnops.h =================================================================== --- head/sys/coda/coda_vnops.h (revision 169670) +++ head/sys/coda/coda_vnops.h (revision 169671) @@ -1,86 +1,86 @@ /*- * * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/coda/coda_vnops.h,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ * $FreeBSD$ * */ /* * Mach Operating System * Copyright (c) 1990 Carnegie-Mellon University * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda filesystem at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. 
*/ /* NetBSD interfaces to the vnodeops */ vop_open_t coda_open; vop_close_t coda_close; vop_read_t coda_read; vop_write_t coda_write; vop_ioctl_t coda_ioctl; /* 1.3 int cfs_select(void *);*/ vop_getattr_t coda_getattr; vop_setattr_t coda_setattr; vop_access_t coda_access; int coda_abortop(void *); vop_readlink_t coda_readlink; vop_fsync_t coda_fsync; vop_inactive_t coda_inactive; vop_lookup_t coda_lookup; vop_create_t coda_create; vop_remove_t coda_remove; vop_link_t coda_link; vop_rename_t coda_rename; vop_mkdir_t coda_mkdir; vop_rmdir_t coda_rmdir; vop_symlink_t coda_symlink; vop_readdir_t coda_readdir; vop_bmap_t coda_bmap; vop_strategy_t coda_strategy; vop_reclaim_t coda_reclaim; -_vop_lock_t coda_lock; +vop_lock1_t coda_lock; vop_unlock_t coda_unlock; vop_islocked_t coda_islocked; int coda_vop_error(void *); int coda_vop_nop(void *); vop_pathconf_t coda_pathconf; int coda_rdwr(struct vnode *vp, struct uio *uiop, enum uio_rw rw, int ioflag, struct ucred *cred, struct thread *td); int coda_grab_vnode(struct cdev *dev, ino_t ino, struct vnode **vpp); void print_vattr(struct vattr *attr); void print_cred(struct ucred *cred); Index: head/sys/fs/coda/coda_vnops.c =================================================================== --- head/sys/fs/coda/coda_vnops.c (revision 169670) +++ head/sys/fs/coda/coda_vnops.c (revision 169671) @@ -1,1837 +1,1837 @@ /*- * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ */ /* * Mach Operating System * Copyright (c) 1990 Carnegie-Mellon University * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda filesystem at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * These flags select various performance enhancements. 
*/ int coda_attr_cache = 1; /* Set to cache attributes in the kernel */ int coda_symlink_cache = 1; /* Set to cache symbolic link information */ int coda_access_cache = 1; /* Set to handle some access checks directly */ /* structure to keep track of vfs calls */ struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE]; #define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++) #define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++) /* What we are delaying for in printf */ int coda_printf_delay = 0; /* in microseconds */ int coda_vnop_print_entry = 0; static int coda_lockdebug = 0; /* * Some NetBSD details: * * coda_start is called at the end of the mount syscall. * coda_init is called at boot time. */ #define ENTRY if(coda_vnop_print_entry) myprintf(("Entered %s\n",__func__)) /* Definition of the vnode operation vector */ struct vop_vector coda_vnodeops = { .vop_default = VOP_PANIC, .vop_lookup = coda_lookup, /* lookup */ .vop_create = coda_create, /* create */ .vop_mknod = VOP_PANIC, /* mknod */ .vop_open = coda_open, /* open */ .vop_close = coda_close, /* close */ .vop_access = coda_access, /* access */ .vop_getattr = coda_getattr, /* getattr */ .vop_setattr = coda_setattr, /* setattr */ .vop_read = coda_read, /* read */ .vop_write = coda_write, /* write */ .vop_ioctl = coda_ioctl, /* ioctl */ .vop_fsync = coda_fsync, /* fsync */ .vop_remove = coda_remove, /* remove */ .vop_link = coda_link, /* link */ .vop_rename = coda_rename, /* rename */ .vop_mkdir = coda_mkdir, /* mkdir */ .vop_rmdir = coda_rmdir, /* rmdir */ .vop_symlink = coda_symlink, /* symlink */ .vop_readdir = coda_readdir, /* readdir */ .vop_readlink = coda_readlink, /* readlink */ .vop_inactive = coda_inactive, /* inactive */ .vop_reclaim = coda_reclaim, /* reclaim */ - ._vop_lock = coda_lock, /* lock */ + .vop_lock1 = coda_lock, /* lock */ .vop_unlock = coda_unlock, /* unlock */ .vop_bmap = coda_bmap, /* bmap */ .vop_print = VOP_PANIC, /* print */ .vop_islocked = coda_islocked, /* islocked */ .vop_pathconf = coda_pathconf, /* pathconf */ .vop_advlock = VOP_NULL, /* advlock */ .vop_lease = VOP_NULL, /* lease */ .vop_poll = vop_stdpoll, .vop_getpages = vop_stdgetpages, /* pager intf.*/ .vop_putpages = vop_stdputpages, /* pager intf.*/ .vop_getwritemount = vop_stdgetwritemount, #if 0 missing .vop_cachedlookup = ufs_lookup, .vop_whiteout = ufs_whiteout, #endif }; /* A generic do-nothing. For lease_check, advlock */ int coda_vop_nop(void *anon) { struct vnodeop_desc **desc = (struct vnodeop_desc **)anon; if (codadebug) { myprintf(("Vnode operation %s called, but unsupported\n", (*desc)->vdesc_name)); } return (0); } int coda_vnodeopstats_init(void) { register int i; for(i=0;ia_vp); struct cnode *cp = VTOC(*vpp); int flag = ap->a_mode & (~O_EXCL); struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; struct vnode *vp; struct cdev *dev; ino_t inode; MARK_ENTRY(CODA_OPEN_STATS); /* Check for open of control file. 
*/ if (IS_CTL_VP(*vpp)) { /* XXX */ /* if (WRITEABLE(flag)) */ if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) { MARK_INT_FAIL(CODA_OPEN_STATS); return(EACCES); } MARK_INT_SAT(CODA_OPEN_STATS); return(0); } error = venus_open(vtomi((*vpp)), &cp->c_fid, flag, cred, td->td_proc, &dev, &inode); if (error) return (error); if (!error) { CODADEBUG( CODA_OPEN,myprintf(("open: dev %#lx inode %lu result %d\n", (u_long)dev2udev(dev), (u_long)inode, error)); ) } /* Translate the pair for the cache file into an inode pointer. */ error = coda_grab_vnode(dev, inode, &vp); if (error) return (error); /* We get the vnode back locked. Needs unlocked */ VOP_UNLOCK(vp, 0, td); /* Keep a reference until the close comes in. */ vref(*vpp); /* Save the vnode pointer for the cache file. */ if (cp->c_ovp == NULL) { cp->c_ovp = vp; } else { if (cp->c_ovp != vp) panic("coda_open: cp->c_ovp != ITOV(ip)"); } cp->c_ocount++; /* Flush the attribute cached if writing the file. */ if (flag & FWRITE) { cp->c_owrite++; cp->c_flags &= ~C_VATTR; } /* Save the pair for the cache file to speed up subsequent page_read's. */ cp->c_device = dev; cp->c_inode = inode; /* Open the cache file. */ error = VOP_OPEN(vp, flag, cred, td, -1); if (error) { printf("coda_open: VOP_OPEN on container failed %d\n", error); return (error); } /* grab (above) does this when it calls newvnode unless it's in the cache*/ return(error); } /* * Close the cache file used for I/O and notify Venus. */ int coda_close(struct vop_close_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_CLOSE_STATS); /* Check for close of control file. */ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_CLOSE_STATS); return(0); } if (IS_UNMOUNTING(cp)) { if (cp->c_ovp) { #ifdef CODA_VERBOSE printf("coda_close: destroying container ref %d, ufs vp %p of vp %p/cp %p\n", vrefcnt(vp), cp->c_ovp, vp, cp); #endif #ifdef hmm vgone(cp->c_ovp); #else VOP_CLOSE(cp->c_ovp, flag, cred, td); /* Do errors matter here? */ vrele(cp->c_ovp); #endif } else { #ifdef CODA_VERBOSE printf("coda_close: NO container vp %p/cp %p\n", vp, cp); #endif } return ENODEV; } else { VOP_CLOSE(cp->c_ovp, flag, cred, td); /* Do errors matter here? */ vrele(cp->c_ovp); } if (--cp->c_ocount == 0) cp->c_ovp = NULL; if (flag & FWRITE) /* file was opened for write */ --cp->c_owrite; error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, td->td_proc); vrele(CTOV(cp)); CODADEBUG(CODA_CLOSE, myprintf(("close: result %d\n",error)); ) return(error); } int coda_read(struct vop_read_args *ap) { ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_td)); } int coda_write(struct vop_write_args *ap) { ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_td)); } int coda_rdwr(struct vnode *vp, struct uio *uiop, enum uio_rw rw, int ioflag, struct ucred *cred, struct thread *td) { /* upcall decl */ /* NOTE: container file operation!!! */ /* locals */ struct cnode *cp = VTOC(vp); struct vnode *cfvp = cp->c_ovp; struct proc *p = td->td_proc; struct thread *ltd = td; int igot_internally = 0; int opened_internally = 0; int error = 0; int iscore = 0; MARK_ENTRY(CODA_RDWR_STATS); CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %d, %lld, %d)\n", rw, (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for rdwr of control object. 
*/ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_RDWR_STATS); return(EINVAL); } /* * If file is not already open this must be a page * {read,write} request. Iget the cache file's inode * pointer if we still have its pair. * Otherwise, we must do an internal open to derive the * pair. */ if (cfvp == NULL) { /* * If we're dumping core, do the internal open. Otherwise * venus won't have the correct size of the core when * it's completely written. */ if (p) { PROC_LOCK(p); iscore = (p->p_acflag & ACORE); PROC_UNLOCK(p); } else ltd = curthread; if (cp->c_inode != 0 && !iscore) { igot_internally = 1; error = coda_grab_vnode(cp->c_device, cp->c_inode, &cfvp); if (error) { MARK_INT_FAIL(CODA_RDWR_STATS); return(error); } /* * We get the vnode back locked by curthread in both Mach and * NetBSD. Needs unlocked */ VOP_UNLOCK(cfvp, 0, ltd); } else { opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, td, -1); printf("coda_rdwr: Internally Opening %p\n", vp); if (error) { printf("coda_rdwr: VOP_OPEN on container failed %d\n", error); return (error); } cfvp = cp->c_ovp; } } /* Have UFS handle the call. */ CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = %s, refcnt = %d\n", coda_f2s(&cp->c_fid), CTOV(cp)->v_usecount)); ) if (rw == UIO_READ) { error = VOP_READ(cfvp, uiop, ioflag, cred); } else { error = VOP_WRITE(cfvp, uiop, ioflag, cred); /* ufs_write updates the vnode_pager_setsize for the vnode/object */ { struct vattr attr; if (VOP_GETATTR(cfvp, &attr, cred, td) == 0) { vnode_pager_setsize(vp, attr.va_size); } } } if (error) MARK_INT_FAIL(CODA_RDWR_STATS); else MARK_INT_SAT(CODA_RDWR_STATS); /* Do an internal close if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, td); } /* Invalidate cached attributes if writing. */ if (rw == UIO_WRITE) cp->c_flags &= ~C_VATTR; return(error); } int coda_ioctl(struct vop_ioctl_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; int com = ap->a_command; caddr_t data = ap->a_data; int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; struct vnode *tvp; struct nameidata ndp; struct PioctlData *iap = (struct PioctlData *)data; MARK_ENTRY(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));) /* Don't check for operation on a dying object, for ctlvp it shouldn't matter */ /* Must be control object to succeed. */ if (!IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: vp != ctlvp"));) return (EOPNOTSUPP); } /* Look up the pathname. */ /* Should we use the name cache here? It would get it from lookupname sooner or later anyway, right? */ NDINIT(&ndp, LOOKUP, (iap->follow ? 
FOLLOW : NOFOLLOW), UIO_USERSPACE, iap->path, td); error = namei(&ndp); tvp = ndp.ni_vp; if (error) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: lookup returns %d\n", error));) return(error); } /* * Make sure this is a coda style cnode, but it may be a * different vfsp */ if (tvp->v_op != &coda_vnodeops) { vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: %s not a coda object\n", iap->path));) return(EINVAL); } if (iap->vi.in_size > VC_MAXDATASIZE) { NDFREE(&ndp, 0); return(EINVAL); } error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data, cred, td->td_proc); if (error) MARK_INT_FAIL(CODA_IOCTL_STATS); else CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); ) vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); return(error); } /* * To reduce the cost of a user-level venus;we cache attributes in * the kernel. Each cnode has storage allocated for an attribute. If * c_vattr is valid, return a reference to it. Otherwise, get the * attributes from venus and store them in the cnode. There is some * question if this method is a security leak. But I think that in * order to make this call, the user must have done a lookup and * opened the file, and therefore should already have access. */ int coda_getattr(struct vop_getattr_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_GETATTR_STATS); if (IS_UNMOUNTING(cp)) return ENODEV; /* Check for getattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_GETATTR_STATS); return(ENOENT); } /* Check to see if the attributes have already been cached */ if (VALID_VATTR(cp)) { CODADEBUG(CODA_GETATTR, { myprintf(("attr cache hit: %s\n", coda_f2s(&cp->c_fid)));}); CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(&cp->c_vattr); ); *vap = cp->c_vattr; MARK_INT_SAT(CODA_GETATTR_STATS); return(0); } error = venus_getattr(vtomi(vp), &cp->c_fid, cred, td->td_proc, vap); if (!error) { CODADEBUG(CODA_GETATTR, myprintf(("getattr miss %s: result %d\n", coda_f2s(&cp->c_fid), error)); ) CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(vap); ); { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } /* If not open for write, store attributes in cnode */ if ((cp->c_owrite == 0) && (coda_attr_cache)) { cp->c_vattr = *vap; cp->c_flags |= C_VATTR; } } return(error); } int coda_setattr(struct vop_setattr_args *ap) { /* true args */ register struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_SETATTR_STATS); /* Check for setattr of control object. 
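 * Setattr on the control vnode fails with ENOENT.  For regular objects
 * the request is passed to venus_setattr(); on success the cached
 * attributes are invalidated (C_VATTR is cleared below) so that the
 * next getattr refetches them from Venus.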
*/ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_SETATTR_STATS); return(ENOENT); } if (codadebug & CODADBGMSK(CODA_SETATTR)) { print_vattr(vap); } error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, td->td_proc); if (!error) cp->c_flags &= ~C_VATTR; { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (size != VNOVAL && convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } CODADEBUG(CODA_SETATTR, myprintf(("setattr %d\n", error)); ) return(error); } int coda_access(struct vop_access_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int mode = ap->a_mode; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_td; /* locals */ int error; MARK_ENTRY(CODA_ACCESS_STATS); /* Check for access of control object. Only read access is allowed on it. */ if (IS_CTL_VP(vp)) { /* bogus hack - all will be marked as successes */ MARK_INT_SAT(CODA_ACCESS_STATS); return(((mode & VREAD) && !(mode & (VWRITE | VEXEC))) ? 0 : EACCES); } /* * if the file is a directory, and we are checking exec (eg lookup) * access, and the file is in the namecache, then the user must have * lookup access to it. */ if (coda_access_cache) { if ((vp->v_type == VDIR) && (mode & VEXEC)) { if (coda_nc_lookup(cp, ".", 1, cred)) { MARK_INT_SAT(CODA_ACCESS_STATS); return(0); /* it was in the cache */ } } } error = venus_access(vtomi(vp), &cp->c_fid, mode, cred, td->td_proc); return(error); } int coda_readlink(struct vop_readlink_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; struct thread *td = ap->a_uio->uio_td; /* locals */ int error; char *str; int len; MARK_ENTRY(CODA_READLINK_STATS); /* Check for readlink of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READLINK_STATS); return(ENOENT); } if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */ uiop->uio_rw = UIO_READ; error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop); if (error) MARK_INT_FAIL(CODA_READLINK_STATS); else MARK_INT_SAT(CODA_READLINK_STATS); return(error); } error = venus_readlink(vtomi(vp), &cp->c_fid, cred, td != NULL ? td->td_proc : NULL, &str, &len); if (!error) { uiop->uio_rw = UIO_READ; error = uiomove(str, len, uiop); if (coda_symlink_cache) { cp->c_symlink = str; cp->c_symlen = len; cp->c_flags |= C_SYMLINK; } else CODA_FREE(str, len); } CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));) return(error); } int coda_fsync(struct vop_fsync_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct thread *td = ap->a_td; /* locals */ struct vnode *convp = cp->c_ovp; int error; MARK_ENTRY(CODA_FSYNC_STATS); /* Check for fsync on an unmounting object */ /* The NetBSD kernel, in it's infinite wisdom, can try to fsync * after an unmount has been initiated. This is a Bad Thing, * which we have to avoid. Not a legitimate failure for stats. */ if (IS_UNMOUNTING(cp)) { return(ENODEV); } /* Check for fsync of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_FSYNC_STATS); return(0); } if (convp) VOP_FSYNC(convp, MNT_WAIT, td); /* * We see fsyncs with usecount == 1 then usecount == 0. * For now we ignore them. */ /* VI_LOCK(vp); if (!vp->v_usecount) { printf("coda_fsync on vnode %p with %d usecount. c_flags = %x (%x)\n", vp, vp->v_usecount, cp->c_flags, cp->c_flags&C_PURGING); } VI_UNLOCK(vp); */ /* * We can expect fsync on any vnode at all if venus is pruging it. 
* Venus can't very well answer the fsync request, now can it? * Hopefully, it won't have to, because hopefully, venus preserves * the (possibly untrue) invariant that it never purges an open * vnode. Hopefully. */ if (cp->c_flags & C_PURGING) { return(0); } /* needs research */ return 0; error = venus_fsync(vtomi(vp), &cp->c_fid, td->td_proc); CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); ); return(error); } int coda_inactive(struct vop_inactive_args *ap) { /* XXX - at the moment, inactive doesn't look at cred, and doesn't have a proc pointer. Oops. */ /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct ucred *cred __attribute__((unused)) = NULL; struct thread *td __attribute__((unused)) = curthread; /* upcall decl */ /* locals */ /* We don't need to send inactive to venus - DCS */ MARK_ENTRY(CODA_INACTIVE_STATS); if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_INACTIVE_STATS); return 0; } CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %s, vfsp %p\n", coda_f2s(&cp->c_fid), vp->v_mount));) /* If an array has been allocated to hold the symlink, deallocate it */ if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { if (cp->c_symlink == NULL) panic("coda_inactive: null symlink pointer in cnode"); CODA_FREE(cp->c_symlink, cp->c_symlen); cp->c_flags &= ~C_SYMLINK; cp->c_symlen = 0; } /* Remove it from the table so it can't be found. */ coda_unsave(cp); if ((struct coda_mntinfo *)(vp->v_mount->mnt_data) == NULL) { myprintf(("Help! vfsp->vfs_data was NULL, but vnode %p wasn't dying\n", vp)); panic("badness in coda_inactive\n"); } if (IS_UNMOUNTING(cp)) { #ifdef DEBUG printf("coda_inactive: IS_UNMOUNTING use %d: vp %p, cp %p\n", vrefcnt(vp), vp, cp); if (cp->c_ovp != NULL) printf("coda_inactive: cp->ovp != NULL use %d: vp %p, cp %p\n", vrefcnt(vp), vp, cp); #endif } else { #ifdef OLD_DIAGNOSTIC if (vrefcnt(CTOV(cp))) { panic("coda_inactive: nonzero reference count"); } if (cp->c_ovp != NULL) { panic("coda_inactive: cp->ovp != NULL"); } #endif vgone(vp); } MARK_INT_SAT(CODA_INACTIVE_STATS); return(0); } /* * Remote filesystem operations having to do with directory manipulation. */ /* * It appears that in NetBSD, lookup is supposed to return the vnode locked */ int coda_lookup(struct vop_lookup_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct vnode **vpp = ap->a_vpp; /* * It looks as though ap->a_cnp->ni_cnd->cn_nameptr holds the rest * of the string to xlate, and that we must try to get at least * ap->a_cnp->ni_cnd->cn_namelen of those characters to macth. I * could be wrong. */ struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; CodaFid VFid; int vtype; int error = 0; MARK_ENTRY(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup: %s in %s\n", nm, coda_f2s(&dcp->c_fid)));); /* Check for lookup of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = coda_ctlvp; vref(*vpp); MARK_INT_SAT(CODA_LOOKUP_STATS); goto exit; } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("name too long: lookup, %s (%s)\n", coda_f2s(&dcp->c_fid), nm));); *vpp = (struct vnode *)0; error = EINVAL; goto exit; } /* First try to look the file up in the cfs name cache */ /* lock the parent vnode? 
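 * On a name-cache hit the existing cnode's vnode is returned with an
 * added reference (vref).  On a miss venus_lookup() is consulted and,
 * unless the CODA_NOCACHE bit is set in the returned type, the new
 * vnode is entered into the name cache for later lookups.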
*/ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) { *vpp = CTOV(cp); vref(*vpp); CODADEBUG(CODA_LOOKUP, myprintf(("lookup result %d vpp %p\n",error,*vpp));) } else { /* The name wasn't cached, so we need to contact Venus */ error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, td->td_proc, &VFid, &vtype); if (error) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup error on %s (%s)%d\n", coda_f2s(&dcp->c_fid), nm, error));) *vpp = (struct vnode *)0; } else { MARK_INT_SAT(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup: %s type %o result %d\n", coda_f2s(&VFid), vtype, error)); ) cp = make_coda_node(&VFid, dvp->v_mount, vtype); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache only if the top bit isn't set */ /* And don't enter a new vnode for an invalid one! */ if (!(vtype & CODA_NOCACHE)) coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); } } exit: /* * If we are creating, and this was the last name to be looked up, * and the error was ENOENT, then there really shouldn't be an * error and we can make the leaf NULL and return success. Since * this is supposed to work under Mach as well as NetBSD, we're * leaving this fn wrapped. We also must tell lookup/namei that * we need to save the last component of the name. (Create will * have to free the name buffer later...lucky us...) */ if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && (cnp->cn_flags & ISLASTCN) && (error == ENOENT)) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; *ap->a_vpp = NULL; } /* * If we are removing, and we are at the last element, and we * found it, then we need to keep the name around so that the * removal will go ahead as planned. Unfortunately, this will * probably also lock the to-be-removed vnode, which may or may * not be a good idea. I'll have to look at the bits of * coda_remove to make sure. We'll only save the name if we did in * fact find the name, otherwise coda_remove won't have a chance * to free the pathname. */ if ((cnp->cn_nameiop == DELETE) && (cnp->cn_flags & ISLASTCN) && !error) { cnp->cn_flags |= SAVENAME; } /* * If the lookup went well, we need to (potentially?) unlock the * parent, and lock the child. We are only responsible for * checking to see if the parent is supposed to be unlocked before * we return. We must always lock the child (provided there is * one, and (the parent isn't locked or it isn't the same as the * parent.) Simple, huh? We can never leave the parent locked unless * we are ISLASTCN */ if (!error || (error == EJUSTRETURN)) { if (cnp->cn_flags & ISDOTDOT) { if ((error = VOP_UNLOCK(dvp, 0, td))) { return error; } /* * The parent is unlocked. As long as there is a child, * lock it without bothering to check anything else. */ if (*ap->a_vpp) { if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) { vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE, td); return (error); } } vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE, td); } else { /* The parent is locked, and may be the same as the child */ if (*ap->a_vpp && (*ap->a_vpp != dvp)) { /* Different, go ahead and lock it. */ if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) { return (error); } } } } else { /* If the lookup failed, we need to ensure that the leaf is NULL */ /* Don't change any locking? 
*/ *ap->a_vpp = NULL; } return(error); } /*ARGSUSED*/ int coda_create(struct vop_create_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct vattr *va = ap->a_vap; int exclusive = 1; int mode = ap->a_vap->va_mode; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; CodaFid VFid; struct vattr attr; MARK_ENTRY(CODA_CREATE_STATS); /* All creates are exclusive XXX */ /* I'm assuming the 'mode' argument is the file mode bits XXX */ /* Check for create of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_CREATE_STATS); return(EACCES); } error = venus_create(vtomi(dvp), &dcp->c_fid, nm, len, exclusive, mode, va, cred, td->td_proc, &VFid, &attr); if (!error) { /* If this is an exclusive create, panic if the file already exists. */ /* Venus should have detected the file and reported EEXIST. */ if ((exclusive == 1) && (coda_find(&VFid) != NULL)) panic("cnode existed for newly created file!"); cp = make_coda_node(&VFid, dvp->v_mount, attr.va_type); *vpp = CTOV(cp); /* Update va to reflect the new attributes. */ (*va) = attr; /* Update the attribute cache and mark it as valid */ if (coda_attr_cache) { VTOC(*vpp)->c_vattr = attr; VTOC(*vpp)->c_flags |= C_VATTR; } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); CODADEBUG(CODA_CREATE, myprintf(("create: %s, result %d\n", coda_f2s(&VFid), error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_CREATE, myprintf(("create error %d\n", error));) } if (!error) { if (cnp->cn_flags & LOCKLEAF) { if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) { printf("coda_create: "); panic("unlocked parent but couldn't lock child"); } } #ifdef OLD_DIAGNOSTIC else { printf("coda_create: LOCKLEAF not set!\n"); } #endif } return(error); } int coda_remove(struct vop_remove_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *cp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *tp; MARK_ENTRY(CODA_REMOVE_STATS); CODADEBUG(CODA_REMOVE, myprintf(("remove: %s in %s\n", nm, coda_f2s(&cp->c_fid)));); /* Remove the file's entry from the CODA Name Cache */ /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* I'm gonna go out on a limb here. If a file and a hardlink to it * exist, and one is removed, the link count on the other will be * off by 1. We could either invalidate the attrs if cached, or * fix them. I'll try to fix them. DCS 11/8/94 */ tp = coda_nc_lookup(VTOC(dvp), nm, len, cred); if (tp) { if (VALID_VATTR(tp)) { /* If attrs are cached */ if (tp->c_vattr.va_nlink > 1) { /* If it's a hard link */ tp->c_vattr.va_nlink--; } } coda_nc_zapfile(VTOC(dvp), nm, len); /* No need to flush it if it doesn't exist! */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* Check for remove of control object. 
*/ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_REMOVE_STATS); return(ENOENT); } error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, td->td_proc); CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); ) return(error); } int coda_link(struct vop_link_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vnode *tdvp = ap->a_tdvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; MARK_ENTRY(CODA_LINK_STATS); if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("nb_link: vp fid: %s\n", coda_f2s(&cp->c_fid))); myprintf(("nb_link: tdvp fid: %s)\n", coda_f2s(&tdcp->c_fid))); } if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("link: vp fid: %s\n", coda_f2s(&cp->c_fid))); myprintf(("link: tdvp fid: %s\n", coda_f2s(&tdcp->c_fid))); } /* Check for link to/from control object. */ if (IS_CTL_NAME(tdvp, nm, len) || IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_LINK_STATS); return(EACCES); } error = venus_link(vtomi(vp), &cp->c_fid, &tdcp->c_fid, nm, len, cred, td->td_proc); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(tdvp)->c_flags &= ~C_VATTR; VTOC(vp)->c_flags &= ~C_VATTR; CODADEBUG(CODA_LINK, myprintf(("in link result %d\n",error)); ) return(error); } int coda_rename(struct vop_rename_args *ap) { /* true args */ struct vnode *odvp = ap->a_fdvp; struct cnode *odcp = VTOC(odvp); struct componentname *fcnp = ap->a_fcnp; struct vnode *ndvp = ap->a_tdvp; struct cnode *ndcp = VTOC(ndvp); struct componentname *tcnp = ap->a_tcnp; struct ucred *cred = fcnp->cn_cred; struct thread *td = fcnp->cn_thread; /* true args */ int error; const char *fnm = fcnp->cn_nameptr; int flen = fcnp->cn_namelen; const char *tnm = tcnp->cn_nameptr; int tlen = tcnp->cn_namelen; MARK_ENTRY(CODA_RENAME_STATS); /* Hmmm. The vnodes are already looked up. Perhaps they are locked? This could be Bad. XXX */ #ifdef OLD_DIAGNOSTIC if ((fcnp->cn_cred != tcnp->cn_cred) || (fcnp->cn_thread != tcnp->cn_thread)) { panic("coda_rename: component names don't agree"); } #endif /* Check for rename involving control object. */ if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) { MARK_INT_FAIL(CODA_RENAME_STATS); return(EACCES); } /* Problem with moving directories -- need to flush entry for .. */ if (odvp != ndvp) { struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred); if (ovcp) { struct vnode *ovp = CTOV(ovcp); if ((ovp) && (ovp->v_type == VDIR)) /* If it's a directory */ coda_nc_zapfile(VTOC(ovp),"..", 2); } } /* Remove the entries for both source and target files */ coda_nc_zapfile(VTOC(odvp), fnm, flen); coda_nc_zapfile(VTOC(ndvp), tnm, tlen); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(odvp)->c_flags &= ~C_VATTR; VTOC(ndvp)->c_flags &= ~C_VATTR; if (flen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } if (tlen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, td->td_proc); exit: CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));) /* XXX - do we need to call cache pureg on the moved vnode? */ cache_purge(ap->a_fvp); /* Release parents first, then children. 
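 * Per the VOP_RENAME protocol the source directory and source vnode
 * are merely referenced and are vrele'd, while the target directory
 * and target vnode (if any) came in locked and are vput; the case
 * where the target vnode is the target directory itself is handled
 * specially below.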
*/ vrele(odvp); if (ap->a_tvp) { if (ap->a_tvp == ndvp) vrele(ndvp); else vput(ndvp); vput(ap->a_tvp); } else vput(ndvp); vrele(ap->a_fvp); return(error); } int coda_mkdir(struct vop_mkdir_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; register struct vattr *va = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; CodaFid VFid; struct vattr ova; MARK_ENTRY(CODA_MKDIR_STATS); /* Check for mkdir of target object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } if (len+1 > CODA_MAXNAMLEN) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, td->td_proc, &VFid, &ova); if (!error) { if (coda_find(&VFid) != NULL) panic("cnode existed for newly created directory!"); cp = make_coda_node(&VFid, dvp->v_mount, va->va_type); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); /* as a side effect, enter "." and ".." for the directory */ coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp)); coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp)); if (coda_attr_cache) { VTOC(*vpp)->c_vattr = ova; /* update the attr cache */ VTOC(*vpp)->c_flags |= C_VATTR; /* Valid attributes in cnode */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; CODADEBUG( CODA_MKDIR, myprintf(("mkdir: %s result %d\n", coda_f2s(&VFid), error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_MKDIR, myprintf(("mkdir error %d\n",error));) } return(error); } int coda_rmdir(struct vop_rmdir_args *ap) { /* true args */ struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; /* true args */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; MARK_ENTRY(CODA_RMDIR_STATS); /* Check for rmdir of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_RMDIR_STATS); return(ENOENT); } /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* * As a side effect of the rmdir, remove any entries for children of * the directory, especially "." and "..". 
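 * The victim directory's cnode is looked up first so that
 * coda_nc_zapParentfid() can purge every cached entry whose parent is
 * that directory; the directory's own entry is then zapped from its
 * parent and the parent's cached attributes are invalidated.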
*/ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL); /* Remove the file's entry from the CODA Name Cache */ coda_nc_zapfile(dcp, nm, len); /* Invalidate the parent's attr cache, the modification time has changed */ dcp->c_flags &= ~C_VATTR; error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, td->td_proc); CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); ) return(error); } int coda_symlink(struct vop_symlink_args *ap) { /* true args */ struct vnode *tdvp = ap->a_dvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct vattr *tva = ap->a_vap; char *path = ap->a_target; struct ucred *cred = cnp->cn_cred; struct thread *td = cnp->cn_thread; struct vnode **vpp = ap->a_vpp; /* locals */ int error; /* * XXX I'm assuming the following things about coda_symlink's * arguments: * t(foo) is the new name/parent/etc being created. * lname is the contents of the new symlink. */ char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; int plen = strlen(path); /* * Here's the strategy for the moment: perform the symlink, then * do a lookup to grab the resulting vnode. I know this requires * two communications with Venus for a new sybolic link, but * that's the way the ball bounces. I don't yet want to change * the way the Mach symlink works. When Mach support is * deprecated, we should change symlink so that the common case * returns the resultant vnode in a vpp argument. */ MARK_ENTRY(CODA_SYMLINK_STATS); /* Check for symlink of control object. */ if (IS_CTL_NAME(tdvp, nm, len)) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EACCES); } if (plen+1 > CODA_MAXPATHLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EINVAL); } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); error = EINVAL; goto exit; } error = venus_symlink(vtomi(tdvp), &tdcp->c_fid, path, plen, nm, len, tva, cred, td->td_proc); /* Invalidate the parent's attr cache, the modification time has changed */ tdcp->c_flags &= ~C_VATTR; if (error == 0) error = VOP_LOOKUP(tdvp, vpp, cnp); exit: CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); ) return(error); } /* * Read directory entries. */ int coda_readdir(struct vop_readdir_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; int *eofflag = ap->a_eofflag; u_long **cookies = ap->a_cookies; int *ncookies = ap->a_ncookies; struct thread *td = ap->a_uio->uio_td; /* upcall decl */ /* locals */ int error = 0; MARK_ENTRY(CODA_READDIR_STATS); CODADEBUG(CODA_READDIR, myprintf(("coda_readdir(%p, %d, %lld, %d)\n", (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for readdir of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READDIR_STATS); return(ENOENT); } { /* If directory is not already open do an "internal open" on it. */ int opened_internally = 0; if (cp->c_ovp == NULL) { opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, FREAD, cred, td, -1); printf("coda_readdir: Internally Opening %p\n", vp); if (error) { printf("coda_readdir: VOP_OPEN on container failed %d\n", error); return (error); } } /* Have UFS handle the call. 
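 * The readdir itself is forwarded to the container vnode cp->c_ovp;
 * when the directory was not already open, the internal VOP_OPEN above
 * is paired with an internal VOP_CLOSE after the lower readdir returns.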
*/ CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = %s, refcnt = %d\n", coda_f2s(&cp->c_fid), vp->v_usecount)); ) error = VOP_READDIR(cp->c_ovp, uiop, cred, eofflag, ncookies, cookies); if (error) MARK_INT_FAIL(CODA_READDIR_STATS); else MARK_INT_SAT(CODA_READDIR_STATS); /* Do an "internal close" if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, FREAD, cred, td); } } return(error); } /* * Convert from filesystem blocks to device blocks */ int coda_bmap(struct vop_bmap_args *ap) { /* XXX on the global proc */ /* true args */ struct vnode *vp __attribute__((unused)) = ap->a_vp; /* file's vnode */ daddr_t bn __attribute__((unused)) = ap->a_bn; /* fs block number */ struct bufobj **bop = ap->a_bop; /* RETURN bufobj of device */ daddr_t *bnp __attribute__((unused)) = ap->a_bnp; /* RETURN device block number */ struct thread *td __attribute__((unused)) = curthread; /* upcall decl */ /* locals */ int ret = 0; struct cnode *cp; cp = VTOC(vp); if (cp->c_ovp) { return EINVAL; ret = VOP_BMAP(cp->c_ovp, bn, bop, bnp, ap->a_runp, ap->a_runb); #if 0 printf("VOP_BMAP(cp->c_ovp %p, bn %p, bop %p, bnp %lld, ap->a_runp %p, ap->a_runb %p) = %d\n", cp->c_ovp, bn, bop, bnp, ap->a_runp, ap->a_runb, ret); #endif return ret; } else { #if 0 printf("coda_bmap: no container\n"); #endif return(EOPNOTSUPP); } } int coda_reclaim(struct vop_reclaim_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ /* * Forced unmount/flush will let vnodes with non zero use be destroyed! */ ENTRY; if (IS_UNMOUNTING(cp)) { #ifdef DEBUG if (VTOC(vp)->c_ovp) { if (IS_UNMOUNTING(cp)) printf("coda_reclaim: c_ovp not void: vp %p, cp %p\n", vp, cp); } #endif } else { #ifdef OLD_DIAGNOSTIC if (vrefcnt(vp) != 0) print("coda_reclaim: pushing active %p\n", vp); if (VTOC(vp)->c_ovp) { panic("coda_reclaim: c_ovp not void"); } #endif } cache_purge(vp); coda_free(VTOC(vp)); vp->v_data = NULL; vnode_destroy_vobject(vp); return (0); } int -coda_lock(struct _vop_lock_args *ap) +coda_lock(struct vop_lock1_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ ENTRY; if ((ap->a_flags & LK_INTERLOCK) == 0) { VI_LOCK(vp); ap->a_flags |= LK_INTERLOCK; } if (coda_lockdebug) { myprintf(("Attempting lock on %s\n", coda_f2s(&cp->c_fid))); } return (vop_stdlock(ap)); } int coda_unlock(struct vop_unlock_args *ap) { /* true args */ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* locals */ ENTRY; if (coda_lockdebug) { myprintf(("Attempting unlock on %s\n", coda_f2s(&cp->c_fid))); } return (vop_stdunlock(ap)); } int coda_islocked(struct vop_islocked_args *ap) { /* true args */ ENTRY; return (vop_stdislocked(ap)); } /* How one looks up a vnode given a device/inode pair: */ int coda_grab_vnode(struct cdev *dev, ino_t ino, struct vnode **vpp) { /* This is like VFS_VGET() or igetinode()! 
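 * The (device, inode) pair recorded at open time is resolved to the
 * container vnode via devtomp() and VFS_VGET().  The vnode comes back
 * locked, so callers unlock it themselves; roughly (a sketch taken
 * from the callers above):
 *
 *	error = coda_grab_vnode(dev, inode, &vp);
 *	if (error == 0)
 *		VOP_UNLOCK(vp, 0, td);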
*/ int error; struct mount *mp; if (!(mp = devtomp(dev))) { myprintf(("coda_grab_vnode: devtomp(%#lx) returns NULL\n", (u_long)dev2udev(dev))); return(ENXIO); } /* XXX - ensure that nonzero-return means failure */ error = VFS_VGET(mp,ino,LK_EXCLUSIVE,vpp); if (error) { myprintf(("coda_grab_vnode: iget/vget(%lx, %lu) returns %p, err %d\n", (u_long)dev2udev(dev), (u_long)ino, (void *)*vpp, error)); return(ENOENT); } return(0); } void print_vattr(struct vattr *attr) { char *typestr; switch (attr->va_type) { case VNON: typestr = "VNON"; break; case VREG: typestr = "VREG"; break; case VDIR: typestr = "VDIR"; break; case VBLK: typestr = "VBLK"; break; case VCHR: typestr = "VCHR"; break; case VLNK: typestr = "VLNK"; break; case VSOCK: typestr = "VSCK"; break; case VFIFO: typestr = "VFFO"; break; case VBAD: typestr = "VBAD"; break; default: typestr = "????"; break; } myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n", typestr, (int)attr->va_mode, (int)attr->va_uid, (int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev)); myprintf((" fileid %d nlink %d size %d blocksize %d bytes %d\n", (int)attr->va_fileid, (int)attr->va_nlink, (int)attr->va_size, (int)attr->va_blocksize,(int)attr->va_bytes)); myprintf((" gen %ld flags %ld vaflags %d\n", attr->va_gen, attr->va_flags, attr->va_vaflags)); myprintf((" atime sec %d nsec %d\n", (int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec)); myprintf((" mtime sec %d nsec %d\n", (int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec)); myprintf((" ctime sec %d nsec %d\n", (int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec)); } /* How to print a ucred */ void print_cred(struct ucred *cred) { int i; myprintf(("ref %d\tuid %d\n",cred->cr_ref,cred->cr_uid)); for (i=0; i < cred->cr_ngroups; i++) myprintf(("\tgroup %d: (%d)\n",i,cred->cr_groups[i])); myprintf(("\n")); } /* * Return a vnode for the given fid. * If no cnode exists for this fid create one and put it * in a table hashed by coda_f2i(). If the cnode for * this fid is already in the table return it (ref count is * incremented by coda_find. The cnode will be flushed from the * table when coda_inactive calls coda_unsave. 
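 * A typical caller (see coda_lookup, coda_create and coda_mkdir above)
 * uses it roughly as follows (a sketch taken from those callers):
 *
 *	cp = make_coda_node(&VFid, dvp->v_mount, vtype);
 *	*vpp = CTOV(cp);
 *
 * The reference obtained here is dropped again with vrele() once the
 * vnode falls out of use.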
*/ struct cnode * make_coda_node(CodaFid *fid, struct mount *vfsp, short type) { struct cnode *cp; int err; if ((cp = coda_find(fid)) == NULL) { struct vnode *vp; cp = coda_alloc(); cp->c_fid = *fid; err = getnewvnode("coda", vfsp, &coda_vnodeops, &vp); if (err) { panic("coda: getnewvnode returned error %d\n", err); } err = insmntque1(vp, vfsp, NULL, NULL); /* XXX: Too early for mpsafe fs */ if (err != 0) panic("coda: insmntque failed: error %d", err); vp->v_data = cp; vp->v_type = type; cp->c_vnode = vp; coda_save(cp); } else { vref(CTOV(cp)); } return cp; } int coda_pathconf(struct vop_pathconf_args *ap) { int error; register_t *retval; retval = ap->a_retval; error = 0; switch (ap->a_name) { case _PC_NAME_MAX: *retval = CODA_MAXNAMLEN; break; case _PC_PATH_MAX: *retval = CODA_MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } Index: head/sys/fs/coda/coda_vnops.h =================================================================== --- head/sys/fs/coda/coda_vnops.h (revision 169670) +++ head/sys/fs/coda/coda_vnops.h (revision 169671) @@ -1,86 +1,86 @@ /*- * * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/coda/coda_vnops.h,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ * $FreeBSD$ * */ /* * Mach Operating System * Copyright (c) 1990 Carnegie-Mellon University * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda filesystem at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. 
*/ /* NetBSD interfaces to the vnodeops */ vop_open_t coda_open; vop_close_t coda_close; vop_read_t coda_read; vop_write_t coda_write; vop_ioctl_t coda_ioctl; /* 1.3 int cfs_select(void *);*/ vop_getattr_t coda_getattr; vop_setattr_t coda_setattr; vop_access_t coda_access; int coda_abortop(void *); vop_readlink_t coda_readlink; vop_fsync_t coda_fsync; vop_inactive_t coda_inactive; vop_lookup_t coda_lookup; vop_create_t coda_create; vop_remove_t coda_remove; vop_link_t coda_link; vop_rename_t coda_rename; vop_mkdir_t coda_mkdir; vop_rmdir_t coda_rmdir; vop_symlink_t coda_symlink; vop_readdir_t coda_readdir; vop_bmap_t coda_bmap; vop_strategy_t coda_strategy; vop_reclaim_t coda_reclaim; -_vop_lock_t coda_lock; +vop_lock1_t coda_lock; vop_unlock_t coda_unlock; vop_islocked_t coda_islocked; int coda_vop_error(void *); int coda_vop_nop(void *); vop_pathconf_t coda_pathconf; int coda_rdwr(struct vnode *vp, struct uio *uiop, enum uio_rw rw, int ioflag, struct ucred *cred, struct thread *td); int coda_grab_vnode(struct cdev *dev, ino_t ino, struct vnode **vpp); void print_vattr(struct vattr *attr); void print_cred(struct ucred *cred); Index: head/sys/fs/nullfs/null_vnops.c =================================================================== --- head/sys/fs/nullfs/null_vnops.c (revision 169670) +++ head/sys/fs/nullfs/null_vnops.c (revision 169671) @@ -1,741 +1,741 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)null_vnops.c 8.6 (Berkeley) 5/27/95 * * Ancestors: * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 * ...and... * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project * * $FreeBSD$ */ /* * Null Layer * * (See mount_nullfs(8) for more information.) * * The null layer duplicates a portion of the filesystem * name space under a new name. In this respect, it is * similar to the loopback filesystem. 
It differs from
 * the loopback fs in two respects: it is implemented using
 * a stackable layers technique, and its "null-node"s stack above
 * all lower-layer vnodes, not just over directory vnodes.
 *
 * The null layer has two purposes. First, it serves as a demonstration
 * of layering by providing a layer which does nothing. (It actually
 * does everything the loopback filesystem does, which is slightly
 * more than nothing.) Second, the null layer can serve as a prototype
 * layer. Since it provides all necessary layer framework,
 * new filesystem layers can be created very easily by starting
 * with a null layer.
 *
 * The remainder of this man page examines the null layer as a basis
 * for constructing new layers.
 *
 *
 * INSTANTIATING NEW NULL LAYERS
 *
 * New null layers are created with mount_nullfs(8).
 * Mount_nullfs(8) takes two arguments, the pathname
 * of the lower vfs (target-pn) and the pathname where the null
 * layer will appear in the namespace (alias-pn). After
 * the null layer is put into place, the contents
 * of target-pn subtree will be aliased under alias-pn.
 *
 *
 * OPERATION OF A NULL LAYER
 *
 * The null layer is the minimum filesystem layer,
 * simply bypassing all possible operations to the lower layer
 * for processing there. The majority of its activity centers
 * on the bypass routine, through which nearly all vnode operations
 * pass.
 *
 * The bypass routine accepts arbitrary vnode operations for
 * handling by the lower layer. It begins by examining vnode
 * operation arguments and replacing any null-nodes by their
 * lower-layer equivalents. It then invokes the operation
 * on the lower layer. Finally, it replaces the null-nodes
 * in the arguments and, if a vnode is returned by the operation,
 * stacks a null-node on top of the returned vnode.
 *
 * Although bypass handles most operations, vop_getattr, vop_lock,
 * vop_unlock, vop_inactive, vop_reclaim, and vop_print are not
 * bypassed. Vop_getattr must change the fsid being returned.
 * Vop_lock and vop_unlock must handle any locking for the
 * current vnode as well as pass the lock request down.
 * Vop_inactive and vop_reclaim are not bypassed so that
 * they can handle freeing null-layer specific data. Vop_print
 * is not bypassed to avoid excessive debugging information.
 * Also, certain vnode operations change the locking state within
 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
 * and symlink). Ideally these operations should not change the
 * lock state, but should be changed to let the caller of the
 * function unlock them. Otherwise all intermediate vnode layers
 * (such as union, umapfs, etc) must catch these functions to do
 * the necessary locking at their layer.
 *
 *
 * INSTANTIATING VNODE STACKS
 *
 * Mounting associates the null layer with a lower layer,
 * in effect stacking two VFSes. Vnode stacks are instead
 * created on demand as files are accessed.
 *
 * The initial mount creates a single vnode stack for the
 * root of the new null layer. All other vnode stacks
 * are created as a result of vnode operations on
 * this or other null vnode stacks.
 *
 * New vnode stacks come into existence as a result of
 * an operation which returns a vnode.
 * The bypass routine stacks a null-node above the new
 * vnode before returning it to the caller.
 *
 * For example, imagine mounting a null layer with
 * "mount_nullfs /usr/include /dev/layer/null".
 * Changing directory to /dev/layer/null will assign
 * the root null-node (which was created when the null layer was mounted).
 * Now consider opening "sys".
A vop_lookup would be * done on the root null-node. This operation would bypass through * to the lower layer which would return a vnode representing * the UFS "sys". Null_bypass then builds a null-node * aliasing the UFS "sys" and returns this to the caller. * Later operations on the null-node "sys" will repeat this * process when constructing other vnode stacks. * * * CREATING OTHER FILE SYSTEM LAYERS * * One of the easiest ways to construct new filesystem layers is to make * a copy of the null layer, rename all files and variables, and * then begin modifing the copy. Sed can be used to easily rename * all variables. * * The umap layer is an example of a layer descended from the * null layer. * * * INVOKING OPERATIONS ON LOWER LAYERS * * There are two techniques to invoke operations on a lower layer * when the operation cannot be completely bypassed. Each method * is appropriate in different situations. In both cases, * it is the responsibility of the aliasing layer to make * the operation arguments "correct" for the lower layer * by mapping a vnode arguments to the lower layer. * * The first approach is to call the aliasing layer's bypass routine. * This method is most suitable when you wish to invoke the operation * currently being handled on the lower layer. It has the advantage * that the bypass routine already must do argument mapping. * An example of this is null_getattrs in the null layer. * * A second approach is to directly invoke vnode operations on * the lower layer with the VOP_OPERATIONNAME interface. * The advantage of this method is that it is easy to invoke * arbitrary operations on the lower layer. The disadvantage * is that vnode arguments must be manualy mapped. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, &null_bug_bypass, 0, ""); /* * This is the 10-Apr-92 bypass routine. * This version has been optimized for speed, throwing away some * safety checks. It should still always work, but it's not as * robust to programmer errors. * * In general, we map all vnodes going down and unmap them on the way back. * As an exception to this, vnodes can be marked "unmapped" by setting * the Nth bit in operation's vdesc_flags. * * Also, some BSD vnode operations have the side effect of vrele'ing * their arguments. With stacking, the reference counts are held * by the upper node, not the lower one, so we must handle these * side-effects here. This is not of concern in Sun-derived systems * since there are no such side-effects. * * This makes the following assumptions: * - only one returned vpp * - no INOUT vpp's (Sun's vop_open has one of these) * - the vnode operation vector of the first vnode should be used * to determine what implementation of the op should be invoked * - all mapped vnodes are of our vnode-type (NEEDSWORK: * problems on rmdir'ing mount points and renaming?) */ int null_bypass(struct vop_generic_args *ap) { struct vnode **this_vp_p; int error; struct vnode *old_vps[VDESC_MAX_VPS]; struct vnode **vps_p[VDESC_MAX_VPS]; struct vnode ***vppp; struct vnodeop_desc *descp = ap->a_desc; int reles, i; if (null_bug_bypass) printf ("null_bypass: %s\n", descp->vdesc_name); #ifdef DIAGNOSTIC /* * We require at least one vp. 
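 * The first vnode in the descriptor's vp map is what selects the lower
 * layer: after mapping, VCALL() dispatches through the vop vector of
 * that first mapped vnode, which is why the first vp must always be
 * one of ours and must always be mapped.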
*/ if (descp->vdesc_vp_offsets == NULL || descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) panic ("null_bypass: no vp's in map"); #endif /* * Map the vnodes going in. * Later, we'll invoke the operation based on * the first mapped vnode's operation vector. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ vps_p[i] = this_vp_p = VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap); /* * We're not guaranteed that any but the first vnode * are of our type. Check for and don't map any * that aren't. (We must always map first vp or vclean fails.) */ if (i && (*this_vp_p == NULLVP || (*this_vp_p)->v_op != &null_vnodeops)) { old_vps[i] = NULLVP; } else { old_vps[i] = *this_vp_p; *(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p); /* * XXX - Several operations have the side effect * of vrele'ing their vp's. We must account for * that. (This should go away in the future.) */ if (reles & VDESC_VP0_WILLRELE) VREF(*this_vp_p); } } /* * Call the operation on the lower layer * with the modified argument structure. */ if (vps_p[0] && *vps_p[0]) error = VCALL(ap); else { printf("null_bypass: no map for %s\n", descp->vdesc_name); error = EINVAL; } /* * Maintain the illusion of call-by-value * by restoring vnodes in the argument structure * to their original value. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ if (old_vps[i]) { *(vps_p[i]) = old_vps[i]; #if 0 if (reles & VDESC_VP0_WILLUNLOCK) VOP_UNLOCK(*(vps_p[i]), 0, curthread); #endif if (reles & VDESC_VP0_WILLRELE) vrele(*(vps_p[i])); } } /* * Map the possible out-going vpp * (Assumes that the lower layer always returns * a VREF'ed vpp unless it gets an error.) */ if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !(descp->vdesc_flags & VDESC_NOMAP_VPP) && !error) { /* * XXX - even though some ops have vpp returned vp's, * several ops actually vrele this before returning. * We must avoid these ops. * (This should go away when these ops are regularized.) */ if (descp->vdesc_flags & VDESC_VPP_WILLRELE) goto out; vppp = VOPARG_OFFSETTO(struct vnode***, descp->vdesc_vpp_offset,ap); if (*vppp) error = null_nodeget(old_vps[0]->v_mount, **vppp, *vppp); } out: return (error); } /* * We have to carry on the locking protocol on the null layer vnodes * as we progress through the tree. We also have to enforce read-only * if this layer is mounted read-only. */ static int null_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; int flags = cnp->cn_flags; struct vnode *vp, *ldvp, *lvp; int error; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); /* * Although it is possible to call null_bypass(), we'll do * a direct call to reduce overhead */ ldvp = NULLVPTOLOWERVP(dvp); vp = lvp = NULL; error = VOP_LOOKUP(ldvp, &lvp, cnp); if (error == EJUSTRETURN && (flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME)) error = EROFS; if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) { if (ldvp == lvp) { *ap->a_vpp = dvp; VREF(dvp); vrele(lvp); } else { error = null_nodeget(dvp->v_mount, lvp, &vp); if (error) { /* XXX Cleanup needed... 
*/ panic("null_nodeget failed"); } *ap->a_vpp = vp; } } return (error); } static int null_open(struct vop_open_args *ap) { int retval; struct vnode *vp, *ldvp; vp = ap->a_vp; ldvp = NULLVPTOLOWERVP(vp); retval = null_bypass(&ap->a_gen); if (retval == 0) vp->v_object = ldvp->v_object; return (retval); } /* * Setattr call. Disallow write attempts if the layer is mounted read-only. */ static int null_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); return (0); case VREG: case VLNK: default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); } } return (null_bypass((struct vop_generic_args *)ap)); } /* * We handle getattr only to change the fsid. */ static int null_getattr(struct vop_getattr_args *ap) { int error; if ((error = null_bypass((struct vop_generic_args *)ap)) != 0) return (error); ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; return (0); } /* * Handle to disallow write access if mounted read-only. */ static int null_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; mode_t mode = ap->a_mode; /* * Disallow write attempts on read-only layers; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } return (null_bypass((struct vop_generic_args *)ap)); } /* * We handle this to eliminate null FS to lower FS * file moving. Don't know why we don't allow this, * possibly we should. */ static int null_rename(struct vop_rename_args *ap) { struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tvp = ap->a_tvp; /* Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (EXDEV); } return (null_bypass((struct vop_generic_args *)ap)); } /* * We need to process our own vnode lock and then clear the * interlock flag as it applies only to our vnode, not the * vnodes below us on the stack. */ static int -null_lock(struct _vop_lock_args *ap) +null_lock(struct vop_lock1_args *ap) { struct vnode *vp = ap->a_vp; int flags = ap->a_flags; struct thread *td = ap->a_td; struct null_node *nn; struct vnode *lvp; int error; if ((flags & LK_INTERLOCK) == 0) { VI_LOCK(vp); ap->a_flags = flags |= LK_INTERLOCK; } nn = VTONULL(vp); /* * If we're still active we must ask the lower layer to * lock as ffs has special lock considerations in it's * vop lock. */ if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) { VI_LOCK_FLAGS(lvp, MTX_DUPOK); VI_UNLOCK(vp); /* * We have to hold the vnode here to solve a potential * reclaim race. If we're forcibly vgone'd while we * still have refs, a thread could be sleeping inside * the lowervp's vop_lock routine. 
When we vgone we will * drop our last ref to the lowervp, which would allow it * to be reclaimed. The lowervp could then be recycled, * in which case it is not legal to be sleeping in it's VOP. * We prevent it from being recycled by holding the vnode * here. */ vholdl(lvp); error = VOP_LOCK(lvp, flags, td); /* * We might have slept to get the lock and someone might have * clean our vnode already, switching vnode lock from one in * lowervp to v_lock in our own vnode structure. Handle this * case by reacquiring correct lock in requested mode. */ if (VTONULL(vp) == NULL && error == 0) { ap->a_flags &= ~(LK_TYPE_MASK | LK_INTERLOCK); switch (flags & LK_TYPE_MASK) { case LK_SHARED: ap->a_flags |= LK_SHARED; break; case LK_UPGRADE: case LK_EXCLUSIVE: ap->a_flags |= LK_EXCLUSIVE; break; default: panic("Unsupported lock request %d\n", ap->a_flags); } VOP_UNLOCK(lvp, 0, td); error = vop_stdlock(ap); } vdrop(lvp); } else error = vop_stdlock(ap); return (error); } /* * We need to process our own vnode unlock and then clear the * interlock flag as it applies only to our vnode, not the * vnodes below us on the stack. */ static int null_unlock(struct vop_unlock_args *ap) { struct vnode *vp = ap->a_vp; int flags = ap->a_flags; struct thread *td = ap->a_td; struct null_node *nn; struct vnode *lvp; int error; if ((flags & LK_INTERLOCK) != 0) { VI_UNLOCK(vp); ap->a_flags = flags &= ~LK_INTERLOCK; } nn = VTONULL(vp); if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) error = VOP_UNLOCK(lvp, flags, td); else error = vop_stdunlock(ap); return (error); } static int null_islocked(struct vop_islocked_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; return (lockstatus(vp->v_vnlock, td)); } /* * There is no way to tell that someone issued remove/rmdir operation * on the underlying filesystem. For now we just have to release lowevrp * as soon as possible. * * Note, we can't release any resources nor remove vnode from hash before * appropriate VXLOCK stuff is is done because other process can find this * vnode in hash during inactivation and may be sitting in vget() and waiting * for null_inactive to unlock vnode. Thus we will do all those in VOP_RECLAIM. */ static int null_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; vp->v_object = NULL; /* * If this is the last reference, then free up the vnode * so as not to tie up the lower vnodes. */ vrecycle(vp, td); return (0); } /* * Now, the VXLOCK is in force and we're free to destroy the null vnode. */ static int null_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct null_node *xp = VTONULL(vp); struct vnode *lowervp = xp->null_lowervp; struct lock *vnlock; if (lowervp) null_hashrem(xp); /* * Use the interlock to protect the clearing of v_data to * prevent faults in null_lock(). 
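 * null_lock() takes the vnode interlock before it reads v_data
 * (VTONULL), so clearing v_data and switching v_vnlock back to the
 * private v_lock while holding VI_LOCK keeps a concurrent locker from
 * dereferencing a freed null_node.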
*/ VI_LOCK(vp); vp->v_data = NULL; vp->v_object = NULL; vnlock = vp->v_vnlock; vp->v_vnlock = &vp->v_lock; if (lowervp) { lockmgr(vp->v_vnlock, LK_EXCLUSIVE|LK_INTERLOCK, VI_MTX(vp), curthread); vput(lowervp); } else panic("null_reclaim: reclaiming an node with now lowervp"); FREE(xp, M_NULLFSNODE); return (0); } static int null_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; printf("\tvp=%p, lowervp=%p\n", vp, NULLVPTOLOWERVP(vp)); return (0); } /* ARGSUSED */ static int null_getwritemount(struct vop_getwritemount_args *ap) { struct null_node *xp; struct vnode *lowervp; struct vnode *vp; vp = ap->a_vp; VI_LOCK(vp); xp = VTONULL(vp); if (xp && (lowervp = xp->null_lowervp)) { VI_LOCK_FLAGS(lowervp, MTX_DUPOK); VI_UNLOCK(vp); vholdl(lowervp); VI_UNLOCK(lowervp); VOP_GETWRITEMOUNT(lowervp, ap->a_mpp); vdrop(lowervp); } else { VI_UNLOCK(vp); *(ap->a_mpp) = NULL; } return (0); } static int null_vptofh(struct vop_vptofh_args *ap) { struct vnode *lvp; lvp = NULLVPTOLOWERVP(ap->a_vp); return VOP_VPTOFH(lvp, ap->a_fhp); } /* * Global vfs data structures */ struct vop_vector null_vnodeops = { .vop_bypass = null_bypass, .vop_access = null_access, .vop_bmap = VOP_EOPNOTSUPP, .vop_getattr = null_getattr, .vop_getwritemount = null_getwritemount, .vop_inactive = null_inactive, .vop_islocked = null_islocked, - ._vop_lock = null_lock, + .vop_lock1 = null_lock, .vop_lookup = null_lookup, .vop_open = null_open, .vop_print = null_print, .vop_reclaim = null_reclaim, .vop_rename = null_rename, .vop_setattr = null_setattr, .vop_strategy = VOP_EOPNOTSUPP, .vop_unlock = null_unlock, .vop_vptofh = null_vptofh, }; Index: head/sys/fs/umapfs/umap_vnops.c =================================================================== --- head/sys/fs/umapfs/umap_vnops.c (revision 169670) +++ head/sys/fs/umapfs/umap_vnops.c (revision 169671) @@ -1,534 +1,534 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)umap_vnops.c 8.6 (Berkeley) 5/22/95 * $FreeBSD$ */ /* * Umap Layer */ #include #include #include #include #include #include #include #include #include #include #include static int umap_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ SYSCTL_INT(_debug, OID_AUTO, umapfs_bug_bypass, CTLFLAG_RW, &umap_bug_bypass, 0, ""); static vop_generic_t umap_bypass; static vop_getattr_t umap_getattr; static vop_inactive_t umap_inactive; -static vop_lock_t umap_lock; +static vop_lock1_t umap_lock; static vop_print_t umap_print; static vop_reclaim_t umap_reclaim; static vop_rename_t umap_rename; static vop_unlock_t umap_unlock; static vop_vptofh_t umap_vptofh; /* * This is the 10-Apr-92 bypass routine. * See null_vnops.c:null_bypass for more details. */ static int umap_bypass(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { struct ucred **credpp = 0, *credp = 0; struct ucred *savecredp = 0, *savecompcredp = 0; struct ucred *compcredp = 0; struct vnode **this_vp_p; int error; struct vnode *old_vps[VDESC_MAX_VPS]; struct vnode *vp1 = 0; struct vnode **vps_p[VDESC_MAX_VPS]; struct vnode ***vppp; struct vnodeop_desc *descp = ap->a_desc; int reles, i; struct componentname **compnamepp = 0; if (umap_bug_bypass) printf ("umap_bypass: %s\n", descp->vdesc_name); #ifdef DIAGNOSTIC /* * We require at least one vp. */ if (descp->vdesc_vp_offsets == NULL || descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) panic ("umap_bypass: no vp's in map"); #endif /* * Map the vnodes going in. * Later, we'll invoke the operation based on * the first mapped vnode's operation vector. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ vps_p[i] = this_vp_p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i], ap); if (i == 0) { vp1 = *vps_p[0]; } /* * We're not guaranteed that any but the first vnode * are of our type. Check for and don't map any * that aren't. (Must map first vp or vclean fails.) */ if (i && (*this_vp_p)->v_op != umap_vnodeop_p) { old_vps[i] = NULL; } else { old_vps[i] = *this_vp_p; *(vps_p[i]) = UMAPVPTOLOWERVP(*this_vp_p); if (reles & 1) VREF(*this_vp_p); } } /* * Fix the credentials. (That's the purpose of this layer.) */ if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) { credpp = VOPARG_OFFSETTO(struct ucred**, descp->vdesc_cred_offset, ap); /* Save old values */ savecredp = (*credpp); if (savecredp != NOCRED) (*credpp) = crdup(savecredp); credp = *credpp; if (umap_bug_bypass && credp->cr_uid != 0) printf("umap_bypass: user was %lu, group %lu\n", (u_long)credp->cr_uid, (u_long)credp->cr_gid); /* Map all ids in the credential structure. */ umap_mapids(vp1->v_mount, credp); if (umap_bug_bypass && credp->cr_uid != 0) printf("umap_bypass: user now %lu, group %lu\n", (u_long)credp->cr_uid, (u_long)credp->cr_gid); } /* BSD often keeps a credential in the componentname structure * for speed. If there is one, it better get mapped, too. 
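* As with the argument credential above, the componentname credential
* is crdup()'d before being mapped and the original pointer is saved,
* so the caller's ucred is never modified; the copy is freed and the
* original restored on the way out of the bypass.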
*/ if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) { compnamepp = VOPARG_OFFSETTO(struct componentname**, descp->vdesc_componentname_offset, ap); compcredp = (*compnamepp)->cn_cred; savecompcredp = compcredp; if (savecompcredp != NOCRED) (*compnamepp)->cn_cred = crdup(savecompcredp); compcredp = (*compnamepp)->cn_cred; if (umap_bug_bypass && compcredp->cr_uid != 0) printf( "umap_bypass: component credit user was %lu, group %lu\n", (u_long)compcredp->cr_uid, (u_long)compcredp->cr_gid); /* Map all ids in the credential structure. */ umap_mapids(vp1->v_mount, compcredp); if (umap_bug_bypass && compcredp->cr_uid != 0) printf( "umap_bypass: component credit user now %lu, group %lu\n", (u_long)compcredp->cr_uid, (u_long)compcredp->cr_gid); } /* * Call the operation on the lower layer * with the modified argument structure. */ error = VCALL(ap); /* * Maintain the illusion of call-by-value * by restoring vnodes in the argument structure * to their original value. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ if (old_vps[i]) { *(vps_p[i]) = old_vps[i]; if (reles & 1) vrele(*(vps_p[i])); }; }; /* * Map the possible out-going vpp * (Assumes that the lower layer always returns * a VREF'ed vpp unless it gets an error.) */ if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !(descp->vdesc_flags & VDESC_NOMAP_VPP) && !error) { if (descp->vdesc_flags & VDESC_VPP_WILLRELE) goto out; vppp = VOPARG_OFFSETTO(struct vnode***, descp->vdesc_vpp_offset, ap); if (*vppp) error = umap_node_create(old_vps[0]->v_mount, **vppp, *vppp); }; out: /* * Free duplicate cred structure and restore old one. */ if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) { if (umap_bug_bypass && credp && credp->cr_uid != 0) printf("umap_bypass: returning-user was %lu\n", (u_long)credp->cr_uid); if (savecredp != NOCRED) { crfree(credp); (*credpp) = savecredp; if (umap_bug_bypass && credpp && (*credpp)->cr_uid != 0) printf( "umap_bypass: returning-user now %lu\n\n", (u_long)(*credpp)->cr_uid); } } if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) { if (umap_bug_bypass && compcredp && compcredp->cr_uid != 0) printf( "umap_bypass: returning-component-user was %lu\n", (u_long)compcredp->cr_uid); if (savecompcredp != NOCRED) { crfree(compcredp); (*compnamepp)->cn_cred = savecompcredp; if (umap_bug_bypass && credpp && (*credpp)->cr_uid != 0) printf( "umap_bypass: returning-component-user now %lu\n", (u_long)compcredp->cr_uid); } } return (error); } /* * We handle getattr to change the fsid. */ static int umap_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { short uid, gid; int error, tmpid, nentries, gnentries; u_long (*mapdata)[2], (*gmapdata)[2]; struct vnode **vp1p; struct vnodeop_desc *descp = ap->a_desc; error = umap_bypass((struct vop_generic_args *)ap); if (error) return (error); /* * Umap needs to map the uid and gid returned by a stat * into the proper values for this site. This involves * finding the returned uid in the mapping information, * translating it into the uid on the other end, * and filling in the proper field in the vattr * structure pointed to by ap->a_vap. The group * is easier, since currently all groups will be * translate to the NULLGROUP. 
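* This is the reverse of the mapping done in umap_bypass(): the ids
* returned by the lower layer's VOP_GETATTR() are run back through
* the mount's map tables with umap_reverse_findid(), and ids with no
* reverse entry degrade to NOBODY / NULLGROUP.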
*/ /* Find entry in map */ uid = ap->a_vap->va_uid; gid = ap->a_vap->va_gid; if (umap_bug_bypass) printf("umap_getattr: mapped uid = %d, mapped gid = %d\n", uid, gid); vp1p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap); nentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_nentries; mapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_mapdata); gnentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gnentries; gmapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gmapdata); /* Reverse map the uid for the vnode. Since it's a reverse map, we can't use umap_mapids() to do it. */ tmpid = umap_reverse_findid(uid, mapdata, nentries); if (tmpid != -1) { ap->a_vap->va_uid = (uid_t) tmpid; if (umap_bug_bypass) printf("umap_getattr: original uid = %d\n", uid); } else ap->a_vap->va_uid = (uid_t) NOBODY; /* Reverse map the gid for the vnode. */ tmpid = umap_reverse_findid(gid, gmapdata, gnentries); if (tmpid != -1) { ap->a_vap->va_gid = (gid_t) tmpid; if (umap_bug_bypass) printf("umap_getattr: original gid = %d\n", gid); } else ap->a_vap->va_gid = (gid_t) NULLGROUP; return (0); } /* * We need to process our own vnode lock and then clear the * interlock flag as it applies only to our vnode, not the * vnodes below us on the stack. */ static int umap_lock(ap) - struct vop_lock_args /* { + struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; } */ *ap; { /* * vop_nolock no longer exists. I could have pasted the code * in so that it compiles, but that would be doing our users a * great disservice. umapfs is about 5 years behind the nullfs * code that it is derived from. The stub locking here guarantees * a deadlock the moment a VOP_INACTIVE arrives. There is no point * pasting the code that makes it compile either, because that just * makes it Even More Wrong. */ vop_nolock(ap); if ((ap->a_flags & LK_TYPE_MASK) == LK_DRAIN) return (0); ap->a_flags &= ~LK_INTERLOCK; return (null_bypass((struct vop_generic_args *)ap)); } /* * We need to process our own vnode unlock and then clear the * interlock flag as it applies only to our vnode, not the * vnodes below us on the stack. */ static int umap_unlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; } */ *ap; { vop_nounlock(ap); ap->a_flags &= ~LK_INTERLOCK; return (null_bypass((struct vop_generic_args *)ap)); } static int umap_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct umap_node *xp = VTOUMAP(vp); struct vnode *lowervp = xp->umap_lowervp; /* * Do nothing (and _don't_ bypass). * Wait to vrele lowervp until reclaim, * so that until then our umap_node is in the * cache and reusable. * */ VOP_INACTIVE(lowervp, ap->a_td); return (0); } static int umap_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct umap_node *xp = VTOUMAP(vp); struct vnode *lowervp = xp->umap_lowervp; /* After this assignment, this node will not be re-used. 
*/ xp->umap_lowervp = NULL; LIST_REMOVE(xp, umap_hash); FREE(vp->v_data, M_TEMP); vp->v_data = NULL; vp->v_object = NULL; vrele(lowervp); return (0); } static int umap_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; printf("\tvp=%p, lowervp=%p\n", vp, UMAPVPTOLOWERVP(vp)); return (0); } static int umap_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { int error; struct componentname *compnamep; struct ucred *compcredp, *savecompcredp; struct vnode *vp; /* * Rename is irregular, having two componentname structures. * We need to map the cre in the second structure, * and then bypass takes care of the rest. */ vp = ap->a_fdvp; compnamep = ap->a_tcnp; compcredp = compnamep->cn_cred; savecompcredp = compcredp; compcredp = compnamep->cn_cred = crdup(savecompcredp); if (umap_bug_bypass && compcredp->cr_uid != 0) printf( "umap_rename: rename component credit user was %lu, group %lu\n", (u_long)compcredp->cr_uid, (u_long)compcredp->cr_gid); /* Map all ids in the credential structure. */ umap_mapids(vp->v_mount, compcredp); if (umap_bug_bypass && compcredp->cr_uid != 0) printf( "umap_rename: rename component credit user now %lu, group %lu\n", (u_long)compcredp->cr_uid, (u_long)compcredp->cr_gid); error = umap_bypass((struct vop_generic_args *)ap); /* Restore the additional mapped componentname cred structure. */ crfree(compcredp); compnamep->cn_cred = savecompcredp; return error; } static int umap_vptofh(ap) struct vop_vptofh_args /* { struct vnode *a_vp; struct fid *a_fhp; } */ *ap; { struct vnode *lvp; lvp = UMAPVPTOLOWERVP(ap->a_vp); return (VOP_VPTOFH(lvp, ap->a_fhp)); } /* * Global vfs data structures */ /* * XXX - strategy, bwrite are hand coded currently. They should * go away with a merged buffer/block cache. * */ static struct vop_vector umap_vnodeops = { .vop_default = umap_bypass, .vop_getattr = umap_getattr, .vop_inactive = umap_inactive, - .vop_lock = umap_lock, + .vop_lock1 = umap_lock, .vop_print = umap_print, .vop_reclaim = umap_reclaim, .vop_rename = umap_rename, .vop_unlock = umap_unlock, .vop_vptofh = umap_vptofh, }; Index: head/sys/fs/unionfs/union_vnops.c =================================================================== --- head/sys/fs/unionfs/union_vnops.c (revision 169670) +++ head/sys/fs/unionfs/union_vnops.c (revision 169671) @@ -1,2285 +1,2285 @@ /*- * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry. * Copyright (c) 1992, 1993, 1994, 1995 * The Regents of the University of California. * Copyright (c) 2005, 2006 Masanori Ozawa , ONGS Inc. * Copyright (c) 2006 Daichi Goto * All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_vnops.c 8.32 (Berkeley) 6/23/95 * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if 0 #define UNIONFS_INTERNAL_DEBUG(msg, args...) printf(msg, ## args) #define UNIONFS_IDBG_RENAME #else #define UNIONFS_INTERNAL_DEBUG(msg, args...) #endif /* lockmgr lock <-> reverse table */ struct lk_lr_table { int lock; int revlock; }; static struct lk_lr_table un_llt[] = { {LK_SHARED, LK_RELEASE}, {LK_EXCLUSIVE, LK_RELEASE}, {LK_UPGRADE, LK_DOWNGRADE}, {LK_EXCLUPGRADE, LK_DOWNGRADE}, {LK_DOWNGRADE, LK_UPGRADE}, {0, 0} }; static int unionfs_lookup(struct vop_lookup_args *ap) { int iswhiteout; int lockflag; int error , uerror, lerror; u_long nameiop; u_long cnflags, cnflagsbk; struct unionfs_node *dunp; struct vnode *dvp, *udvp, *ldvp, *vp, *uvp, *lvp, *dtmpvp; struct vattr va; struct componentname *cnp; struct thread *td; iswhiteout = 0; lockflag = 0; error = uerror = lerror = ENOENT; cnp = ap->a_cnp; nameiop = cnp->cn_nameiop; cnflags = cnp->cn_flags; dvp = ap->a_dvp; dunp = VTOUNIONFS(dvp); udvp = dunp->un_uppervp; ldvp = dunp->un_lowervp; vp = uvp = lvp = NULLVP; td = curthread; *(ap->a_vpp) = NULLVP; UNIONFS_INTERNAL_DEBUG("unionfs_lookup: enter: nameiop=%ld, flags=%lx, path=%s\n", nameiop, cnflags, cnp->cn_nameptr); if (dvp->v_type != VDIR) return (ENOTDIR); /* * If read-only and op is not LOOKUP, will return EROFS. */ if ((cnflags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && LOOKUP != nameiop) return (EROFS); /* * lookup dotdot */ if (cnflags & ISDOTDOT) { if (LOOKUP != nameiop && udvp == NULLVP) return (EROFS); if (udvp != NULLVP) { dtmpvp = udvp; if (ldvp != NULLVP) VOP_UNLOCK(ldvp, 0, td); } else dtmpvp = ldvp; error = VOP_LOOKUP(dtmpvp, &vp, cnp); if (dtmpvp == udvp && ldvp != NULLVP) { VOP_UNLOCK(udvp, 0, td); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); } if (error == 0) { /* * Exchange lock and reference from vp to * dunp->un_dvp. vp is upper/lower vnode, but it * will need to return the unionfs vnode. 
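* That is, drop the lock and reference obtained on the real ".."
* vnode and hand back a referenced dunp->un_dvp instead, re-locked in
* the mode the caller expects (exclusive for DELETE/RENAME, otherwise
* whatever cn_lkflags asked for).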
*/ if (nameiop == DELETE || nameiop == RENAME || (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(vp, 0, td); vrele(vp); VOP_UNLOCK(dvp, 0, td); *(ap->a_vpp) = dunp->un_dvp; vref(dunp->un_dvp); if (nameiop == DELETE || nameiop == RENAME) vn_lock(dunp->un_dvp, LK_EXCLUSIVE | LK_RETRY, td); else if (cnp->cn_lkflags & LK_TYPE_MASK) vn_lock(dunp->un_dvp, cnp->cn_lkflags | LK_RETRY, td); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); } UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error); return (error); } /* * lookup upper layer */ if (udvp != NULLVP) { uerror = VOP_LOOKUP(udvp, &uvp, cnp); if (uerror == 0) { if (udvp == uvp) { /* is dot */ vrele(uvp); *(ap->a_vpp) = dvp; vref(dvp); UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", uerror); return (uerror); } if (nameiop == DELETE || nameiop == RENAME || (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(uvp, 0, td); } /* check whiteout */ if (uerror == ENOENT || uerror == EJUSTRETURN) if (cnp->cn_flags & ISWHITEOUT) iswhiteout = 1; /* don't lookup lower */ if (iswhiteout == 0 && ldvp != NULLVP) if (VOP_GETATTR(udvp, &va, cnp->cn_cred, td) == 0 && (va.va_flags & OPAQUE)) iswhiteout = 1; /* don't lookup lower */ #if 0 UNIONFS_INTERNAL_DEBUG("unionfs_lookup: debug: whiteout=%d, path=%s\n", iswhiteout, cnp->cn_nameptr); #endif } /* * lookup lower layer */ if (ldvp != NULLVP && !(cnflags & DOWHITEOUT) && iswhiteout == 0) { /* always op is LOOKUP */ cnp->cn_nameiop = LOOKUP; cnflagsbk = cnp->cn_flags; cnp->cn_flags = cnflags; lerror = VOP_LOOKUP(ldvp, &lvp, cnp); cnp->cn_nameiop = nameiop; if (udvp != NULLVP && (uerror == 0 || uerror == EJUSTRETURN)) cnp->cn_flags = cnflagsbk; if (lerror == 0) { if (ldvp == lvp) { /* is dot */ if (uvp != NULLVP) vrele(uvp); /* no need? */ vrele(lvp); *(ap->a_vpp) = dvp; vref(dvp); UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", lerror); return (lerror); } if (cnp->cn_lkflags & LK_TYPE_MASK) VOP_UNLOCK(lvp, 0, td); } } /* * check lookup result */ if (uvp == NULLVP && lvp == NULLVP) { UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", (udvp != NULLVP ? uerror : lerror)); return (udvp != NULLVP ? uerror : lerror); } /* * check vnode type */ if (uvp != NULLVP && lvp != NULLVP && uvp->v_type != lvp->v_type) { vrele(lvp); lvp = NULLVP; } /* * check shadow dir */ if (uerror != 0 && uerror != EJUSTRETURN && udvp != NULLVP && lerror == 0 && lvp != NULLVP && lvp->v_type == VDIR && !(dvp->v_mount->mnt_flag & MNT_RDONLY) && (1 < cnp->cn_namelen || '.' != *(cnp->cn_nameptr))) { /* get unionfs vnode in order to create a new shadow dir. */ error = unionfs_nodeget(dvp->v_mount, NULLVP, lvp, dvp, &vp, cnp, td); if (error != 0) goto unionfs_lookup_out; if (LK_SHARED == (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(vp, 0, td); if (LK_EXCLUSIVE != VOP_ISLOCKED(vp, td)) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); lockflag = 1; } error = unionfs_mkshadowdir(MOUNTTOUNIONFSMOUNT(dvp->v_mount), udvp, VTOUNIONFS(vp), cnp, td); if (lockflag != 0) VOP_UNLOCK(vp, 0, td); if (error != 0) { UNIONFSDEBUG("unionfs_lookup: Unable to create shadow dir."); if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) vput(vp); else vrele(vp); goto unionfs_lookup_out; } if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_SHARED) vn_lock(vp, LK_SHARED | LK_RETRY, td); } /* * get unionfs vnode. 
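* That is, wrap whichever real vnodes were found (upper, lower, or
* both) in a single unionfs node via unionfs_nodeget(), so the caller
* always gets one vnode back regardless of which layer the name was
* actually found in.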
*/ else { if (uvp != NULLVP) error = uerror; else error = lerror; if (error != 0) goto unionfs_lookup_out; error = unionfs_nodeget(dvp->v_mount, uvp, lvp, dvp, &vp, cnp, td); if (error != 0) { UNIONFSDEBUG("unionfs_lookup: Unable to create unionfs vnode."); goto unionfs_lookup_out; } if ((nameiop == DELETE || nameiop == RENAME) && (cnp->cn_lkflags & LK_TYPE_MASK) == 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } *(ap->a_vpp) = vp; unionfs_lookup_out: if (uvp != NULLVP) vrele(uvp); if (lvp != NULLVP) vrele(lvp); UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error); return (error); } static int unionfs_create(struct vop_create_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_create: enter\n"); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; td = curthread; udvp = dunp->un_uppervp; error = EROFS; if (udvp != NULLVP) { if ((error = VOP_CREATE(udvp, &vp, cnp, ap->a_vap)) == 0) { VOP_UNLOCK(vp, 0, td); error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP, ap->a_dvp, ap->a_vpp, cnp, td); vrele(vp); } } UNIONFS_INTERNAL_DEBUG("unionfs_create: leave (%d)\n", error); return (error); } static int unionfs_whiteout(struct vop_whiteout_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: enter\n"); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; udvp = dunp->un_uppervp; error = EOPNOTSUPP; if (udvp != NULLVP) { switch (ap->a_flags) { case CREATE: case DELETE: case LOOKUP: error = VOP_WHITEOUT(udvp, cnp, ap->a_flags); break; default: error = EINVAL; break; } } UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: leave (%d)\n", error); return (error); } static int unionfs_mknod(struct vop_mknod_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_mknod: enter\n"); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; td = curthread; udvp = dunp->un_uppervp; error = EROFS; if (udvp != NULLVP) { if ((error = VOP_MKNOD(udvp, &vp, cnp, ap->a_vap)) == 0) { VOP_UNLOCK(vp, 0, td); error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP, ap->a_dvp, ap->a_vpp, cnp, td); vrele(vp); } } UNIONFS_INTERNAL_DEBUG("unionfs_mknod: leave (%d)\n", error); return (error); } static int unionfs_open(struct vop_open_args *ap) { int error; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *uvp; struct vnode *lvp; struct vnode *targetvp; struct ucred *cred; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_open: enter\n"); error = 0; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; targetvp = NULLVP; cred = ap->a_cred; td = ap->a_td; unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt > 0 || unsp->uns_upper_opencnt > 0) { /* vnode is already opend. 
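* Reuse the layer that is already open, preferring the upper vnode.
* An already-open lower regular file is still not acceptable when
* FWRITE is requested; targetvp is left at NULLVP so the code below
* either switches to the upper vnode or copies the file up first.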
*/ if (unsp->uns_upper_opencnt > 0) targetvp = uvp; else targetvp = lvp; if (targetvp == lvp && (ap->a_mode & FWRITE) && lvp->v_type == VREG) targetvp = NULLVP; } if (targetvp == NULLVP) { if (uvp == NULLVP) { if ((ap->a_mode & FWRITE) && lvp->v_type == VREG) { error = unionfs_copyfile(unp, !(ap->a_mode & O_TRUNC), cred, td); if (error != 0) goto unionfs_open_abort; targetvp = uvp = unp->un_uppervp; } else targetvp = lvp; } else targetvp = uvp; } error = VOP_OPEN(targetvp, ap->a_mode, cred, td, ap->a_fdidx); if (error == 0) { if (targetvp == uvp) { if (uvp->v_type == VDIR && lvp != NULLVP && unsp->uns_lower_opencnt <= 0) { /* open lower for readdir */ error = VOP_OPEN(lvp, FREAD, cred, td, -1); if (error != 0) { VOP_CLOSE(uvp, ap->a_mode, cred, td); goto unionfs_open_abort; } unsp->uns_node_flag |= UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt++; } unsp->uns_upper_opencnt++; } else { unsp->uns_lower_opencnt++; unsp->uns_lower_openmode = ap->a_mode; unsp->uns_lower_fdidx = ap->a_fdidx; } ap->a_vp->v_object = targetvp->v_object; } unionfs_open_abort: if (error != 0) unionfs_tryrem_node_status(unp, td, unsp); UNIONFS_INTERNAL_DEBUG("unionfs_open: leave (%d)\n", error); return (error); } static int unionfs_close(struct vop_close_args *ap) { int error; int locked; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct ucred *cred; struct thread *td; struct vnode *ovp; UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n"); locked = 0; unp = VTOUNIONFS(ap->a_vp); cred = ap->a_cred; td = ap->a_td; if (VOP_ISLOCKED(ap->a_vp, td) != LK_EXCLUSIVE) { vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, td); locked = 1; } unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0) { #ifdef DIAGNOSTIC printf("unionfs_close: warning: open count is 0\n"); #endif if (unp->un_uppervp != NULLVP) ovp = unp->un_uppervp; else ovp = unp->un_lowervp; } else if (unsp->uns_upper_opencnt > 0) ovp = unp->un_uppervp; else ovp = unp->un_lowervp; error = VOP_CLOSE(ovp, ap->a_fflag, cred, td); if (error != 0) goto unionfs_close_abort; ap->a_vp->v_object = ovp->v_object; if (ovp == unp->un_uppervp) { unsp->uns_upper_opencnt--; if (unsp->uns_upper_opencnt == 0) { if (unsp->uns_node_flag & UNS_OPENL_4_READDIR) { VOP_CLOSE(unp->un_lowervp, FREAD, cred, td); unsp->uns_node_flag &= ~UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt--; } if (unsp->uns_lower_opencnt > 0) ap->a_vp->v_object = unp->un_lowervp->v_object; } } else unsp->uns_lower_opencnt--; unionfs_close_abort: unionfs_tryrem_node_status(unp, td, unsp); if (locked != 0) VOP_UNLOCK(ap->a_vp, 0, td); UNIONFS_INTERNAL_DEBUG("unionfs_close: leave (%d)\n", error); return (error); } /* * Check the access mode toward shadow file/dir. */ static int unionfs_check_corrected_access(u_short mode, struct vattr *va, struct ucred *cred) { int count; uid_t uid; /* upper side vnode's uid */ gid_t gid; /* upper side vnode's gid */ u_short vmode; /* upper side vnode's mode */ gid_t *gp; u_short mask; mask = 0; uid = va->va_uid; gid = va->va_gid; vmode = va->va_mode; /* check owner */ if (cred->cr_uid == uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return ((vmode & mask) == mask ? 0 : EACCES); } /* check group */ count = 0; gp = cred->cr_groups; for (; count < cred->cr_ngroups; count++, gp++) { if (gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return ((vmode & mask) == mask ? 
0 : EACCES); } } /* check other */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return ((vmode & mask) == mask ? 0 : EACCES); } static int unionfs_access(struct vop_access_args *ap) { struct unionfs_mount *ump; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; int mode; int error; UNIONFS_INTERNAL_DEBUG("unionfs_access: enter\n"); ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; mode = ap->a_mode; error = EACCES; if ((mode & VWRITE) && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (ap->a_vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } if (uvp != NULLVP) { error = VOP_ACCESS(uvp, mode, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error); return (error); } if (lvp != NULLVP) { if (mode & VWRITE) { if (ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY) { switch (ap->a_vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } else if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) { /* check shadow file/dir */ if (ump->um_copymode != UNIONFS_TRANSPARENT) { error = unionfs_create_uppervattr(ump, lvp, &va, ap->a_cred, td); if (error != 0) return (error); error = unionfs_check_corrected_access( mode, &va, ap->a_cred); if (error != 0) return (error); } } mode &= ~VWRITE; mode |= VREAD; /* will copy to upper */ } error = VOP_ACCESS(lvp, mode, ap->a_cred, td); } UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error); return (error); } static int unionfs_getattr(struct vop_getattr_args *ap) { int error; struct unionfs_node *unp; struct unionfs_mount *ump; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; UNIONFS_INTERNAL_DEBUG("unionfs_getattr: enter\n"); unp = VTOUNIONFS(ap->a_vp); ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (uvp != NULLVP) { if ((error = VOP_GETATTR(uvp, ap->a_vap, ap->a_cred, td)) == 0) ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; UNIONFS_INTERNAL_DEBUG("unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n", ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error); return (error); } error = VOP_GETATTR(lvp, ap->a_vap, ap->a_cred, td); if (error == 0 && !(ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY)) { /* correct the attr toward shadow file/dir. 
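* When only the lower layer exists, report the mode/uid/gid the file
* or directory would receive once copied up (the shadow attributes),
* so its apparent ownership does not change after the first write
* triggers a copy-up.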
*/ if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) { unionfs_create_uppervattr_core(ump, ap->a_vap, &va, td); ap->a_vap->va_mode = va.va_mode; ap->a_vap->va_uid = va.va_uid; ap->a_vap->va_gid = va.va_gid; } } if (error == 0) ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; UNIONFS_INTERNAL_DEBUG("unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n", ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error); return (error); } static int unionfs_setattr(struct vop_setattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr *vap; UNIONFS_INTERNAL_DEBUG("unionfs_setattr: enter\n"); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; vap = ap->a_vap; if ((ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) && (vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL)) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { error = unionfs_copyfile(unp, (vap->va_size != 0), ap->a_cred, td); if (error != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETATTR(uvp, vap, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_setattr: leave (%d)\n", error); return (error); } static int unionfs_read(struct vop_read_args *ap) { int error; struct unionfs_node *unp; struct vnode *tvp; /* UNIONFS_INTERNAL_DEBUG("unionfs_read: enter\n"); */ unp = VTOUNIONFS(ap->a_vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_READ(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred); /* UNIONFS_INTERNAL_DEBUG("unionfs_read: leave (%d)\n", error); */ return (error); } static int unionfs_write(struct vop_write_args *ap) { int error; struct unionfs_node *unp; struct vnode *tvp; /* UNIONFS_INTERNAL_DEBUG("unionfs_write: enter\n"); */ unp = VTOUNIONFS(ap->a_vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_WRITE(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred); /* UNIONFS_INTERNAL_DEBUG("unionfs_write: leave (%d)\n", error); */ return (error); } static int unionfs_lease(struct vop_lease_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; UNIONFS_INTERNAL_DEBUG("unionfs_lease: enter\n"); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_LEASE(vp, ap->a_td, ap->a_cred, ap->a_flag); UNIONFS_INTERNAL_DEBUG("unionfs_lease: lease (%d)\n", error); return (error); } static int unionfs_ioctl(struct vop_ioctl_args *ap) { int error; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: enter\n"); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, ap->a_td); unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); VOP_UNLOCK(ap->a_vp, 0, ap->a_td); if (ovp == NULLVP) return (EBADF); error = VOP_IOCTL(ovp, ap->a_command, ap->a_data, ap->a_fflag, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: lease (%d)\n", error); return (error); } static int unionfs_poll(struct vop_poll_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, ap->a_td); unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? 
unp->un_uppervp : unp->un_lowervp); VOP_UNLOCK(ap->a_vp, 0, ap->a_td); if (ovp == NULLVP) return (EBADF); return (VOP_POLL(ovp, ap->a_events, ap->a_cred, ap->a_td)); } static int unionfs_fsync(struct vop_fsync_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); if (ovp == NULLVP) return (EBADF); return (VOP_FSYNC(ovp, ap->a_waitfor, ap->a_td)); } static int unionfs_remove(struct vop_remove_args *ap) { int error; struct unionfs_node *dunp; struct unionfs_node *unp; struct vnode *udvp; struct vnode *uvp; struct vnode *lvp; struct componentname *cnp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_remove: enter\n"); error = 0; dunp = VTOUNIONFS(ap->a_dvp); unp = VTOUNIONFS(ap->a_vp); udvp = dunp->un_uppervp; uvp = unp->un_uppervp; lvp = unp->un_lowervp; cnp = ap->a_cnp; td = curthread; if (udvp == NULLVP) return (EROFS); if (uvp != NULLVP) { cnp->cn_flags |= DOWHITEOUT; error = VOP_REMOVE(udvp, uvp, cnp); } else if (lvp != NULLVP) error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path); UNIONFS_INTERNAL_DEBUG("unionfs_remove: leave (%d)\n", error); return (error); } static int unionfs_link(struct vop_link_args *ap) { int error; int needrelookup; struct unionfs_node *dunp; struct unionfs_node *unp; struct vnode *udvp; struct vnode *uvp; struct componentname *cnp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_link: enter\n"); error = 0; needrelookup = 0; dunp = VTOUNIONFS(ap->a_tdvp); unp = NULL; udvp = dunp->un_uppervp; uvp = NULLVP; cnp = ap->a_cnp; td = curthread; if (udvp == NULLVP) return (EROFS); if (ap->a_vp->v_op != &unionfs_vnodeops) uvp = ap->a_vp; else { unp = VTOUNIONFS(ap->a_vp); if (unp->un_uppervp == NULLVP) { if (ap->a_vp->v_type != VREG) return (EOPNOTSUPP); error = unionfs_copyfile(unp, 1, cnp->cn_cred, td); if (error != 0) return (error); needrelookup = 1; } uvp = unp->un_uppervp; } if (needrelookup != 0) error = unionfs_relookup_for_create(ap->a_tdvp, cnp, td); if (error == 0) error = VOP_LINK(udvp, uvp, cnp); UNIONFS_INTERNAL_DEBUG("unionfs_link: leave (%d)\n", error); return (error); } static int unionfs_rename(struct vop_rename_args *ap) { int error; struct vnode *fdvp; struct vnode *fvp; struct componentname *fcnp; struct vnode *tdvp; struct vnode *tvp; struct componentname *tcnp; struct vnode *ltdvp; struct vnode *ltvp; struct thread *td; /* rename target vnodes */ struct vnode *rfdvp; struct vnode *rfvp; struct vnode *rtdvp; struct vnode *rtvp; int needrelookup; struct unionfs_mount *ump; struct unionfs_node *unp; UNIONFS_INTERNAL_DEBUG("unionfs_rename: enter\n"); error = 0; fdvp = ap->a_fdvp; fvp = ap->a_fvp; fcnp = ap->a_fcnp; tdvp = ap->a_tdvp; tvp = ap->a_tvp; tcnp = ap->a_tcnp; ltdvp = NULLVP; ltvp = NULLVP; td = curthread; rfdvp = fdvp; rfvp = fvp; rtdvp = tdvp; rtvp = tvp; needrelookup = 0; #ifdef DIAGNOSTIC if (!(fcnp->cn_flags & HASBUF) || !(tcnp->cn_flags & HASBUF)) panic("unionfs_rename: no name"); #endif /* check for cross device rename */ if (fvp->v_mount != tdvp->v_mount || (tvp != NULLVP && fvp->v_mount != tvp->v_mount)) { error = EXDEV; goto unionfs_rename_abort; } /* Renaming a file to itself has no effect. */ if (fvp == tvp) goto unionfs_rename_abort; /* * from/to vnode is unionfs node. 
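* Translate every unionfs vnode involved to its upper vnode (copying
* a regular file up, or creating a shadow directory, when only the
* lower layer exists) and then issue the real VOP_RENAME() entirely
* on the upper filesystem via rfdvp/rfvp/rtdvp/rtvp.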
*/ unp = VTOUNIONFS(fdvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("fdvp=%p, ufdvp=%p, lfdvp=%p\n", fdvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) { error = ENODEV; goto unionfs_rename_abort; } rfdvp = unp->un_uppervp; vref(rfdvp); unp = VTOUNIONFS(fvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("fvp=%p, ufvp=%p, lfvp=%p\n", fvp, unp->un_uppervp, unp->un_lowervp); #endif ump = MOUNTTOUNIONFSMOUNT(fvp->v_mount); if (unp->un_uppervp == NULLVP) { switch (fvp->v_type) { case VREG: if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0) goto unionfs_rename_abort; error = unionfs_copyfile(unp, 1, fcnp->cn_cred, td); VOP_UNLOCK(fvp, 0, td); if (error != 0) goto unionfs_rename_abort; break; case VDIR: if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0) goto unionfs_rename_abort; error = unionfs_mkshadowdir(ump, rfdvp, unp, fcnp, td); VOP_UNLOCK(fvp, 0, td); if (error != 0) goto unionfs_rename_abort; break; default: error = ENODEV; goto unionfs_rename_abort; } needrelookup = 1; } if (unp->un_lowervp != NULLVP) fcnp->cn_flags |= DOWHITEOUT; rfvp = unp->un_uppervp; vref(rfvp); unp = VTOUNIONFS(tdvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("tdvp=%p, utdvp=%p, ltdvp=%p\n", tdvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) { error = ENODEV; goto unionfs_rename_abort; } rtdvp = unp->un_uppervp; ltdvp = unp->un_lowervp; vref(rtdvp); if (tdvp == tvp) { rtvp = rtdvp; vref(rtvp); } else if (tvp != NULLVP) { unp = VTOUNIONFS(tvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("tvp=%p, utvp=%p, ltvp=%p\n", tvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) rtvp = NULLVP; else { if (tvp->v_type == VDIR) { error = EINVAL; goto unionfs_rename_abort; } rtvp = unp->un_uppervp; ltvp = unp->un_lowervp; vref(rtvp); } } if (needrelookup != 0) { if ((error = vn_lock(fdvp, LK_EXCLUSIVE, td)) != 0) goto unionfs_rename_abort; error = unionfs_relookup_for_delete(fdvp, fcnp, td); VOP_UNLOCK(fdvp, 0, td); if (error != 0) goto unionfs_rename_abort; /* Locke of tvp is canceled in order to avoid recursive lock. 
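* (tvp is unlocked across unionfs_relookup_for_rename() and re-locked
* exclusively afterward, so the relookup cannot end up trying to take
* a lock this thread is already holding.)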
*/ if (tvp != NULLVP && tvp != tdvp) VOP_UNLOCK(tvp, 0, td); error = unionfs_relookup_for_rename(tdvp, tcnp, td); if (tvp != NULLVP && tvp != tdvp) vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td); if (error != 0) goto unionfs_rename_abort; } error = VOP_RENAME(rfdvp, rfvp, fcnp, rtdvp, rtvp, tcnp); if (fdvp != rfdvp) vrele(fdvp); if (fvp != rfvp) vrele(fvp); if (tdvp != rtdvp) vrele(tdvp); if (tvp != rtvp && tvp != NULLVP) { if (rtvp == NULLVP) vput(tvp); else vrele(tvp); } if (ltdvp != NULLVP) VOP_UNLOCK(ltdvp, 0, td); if (ltvp != NULLVP) VOP_UNLOCK(ltvp, 0, td); UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error); return (error); unionfs_rename_abort: if (fdvp != rfdvp) vrele(rfdvp); if (fvp != rfvp) vrele(rfvp); if (tdvp != rtdvp) vrele(rtdvp); vput(tdvp); if (tvp != rtvp && rtvp != NULLVP) vrele(rtvp); if (tvp != NULLVP) { if (tdvp != tvp) vput(tvp); else vrele(tvp); } vrele(fdvp); vrele(fvp); UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error); return (error); } static int unionfs_mkdir(struct vop_mkdir_args *ap) { int error; int lkflags; struct unionfs_node *dunp; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *uvp; struct vattr va; UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: enter\n"); error = EROFS; dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; lkflags = cnp->cn_lkflags; td = curthread; udvp = dunp->un_uppervp; if (udvp != NULLVP) { /* check opaque */ if (!(cnp->cn_flags & ISWHITEOUT)) { error = VOP_GETATTR(udvp, &va, cnp->cn_cred, td); if (error != 0) return (error); if (va.va_flags & OPAQUE) cnp->cn_flags |= ISWHITEOUT; } if ((error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap)) == 0) { VOP_UNLOCK(uvp, 0, td); cnp->cn_lkflags = LK_EXCLUSIVE; error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP, ap->a_dvp, ap->a_vpp, cnp, td); cnp->cn_lkflags = lkflags; vrele(uvp); } } UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: leave (%d)\n", error); return (error); } static int unionfs_rmdir(struct vop_rmdir_args *ap) { int error; struct unionfs_node *dunp; struct unionfs_node *unp; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *uvp; struct vnode *lvp; UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: enter\n"); error = 0; dunp = VTOUNIONFS(ap->a_dvp); unp = VTOUNIONFS(ap->a_vp); cnp = ap->a_cnp; td = curthread; udvp = dunp->un_uppervp; uvp = unp->un_uppervp; lvp = unp->un_lowervp; if (udvp == NULLVP) return (EROFS); if (udvp == uvp) return (EOPNOTSUPP); if (uvp != NULLVP) { if (lvp != NULLVP) { error = unionfs_check_rmdir(ap->a_vp, cnp->cn_cred, td); if (error != 0) return (error); } cnp->cn_flags |= DOWHITEOUT; error = VOP_RMDIR(udvp, uvp, cnp); } else if (lvp != NULLVP) error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path); UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: leave (%d)\n", error); return (error); } static int unionfs_symlink(struct vop_symlink_args *ap) { int error; int lkflags; struct unionfs_node *dunp; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *uvp; UNIONFS_INTERNAL_DEBUG("unionfs_symlink: enter\n"); error = EROFS; dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; lkflags = cnp->cn_lkflags; td = curthread; udvp = dunp->un_uppervp; if (udvp != NULLVP) { error = VOP_SYMLINK(udvp, &uvp, cnp, ap->a_vap, ap->a_target); if (error == 0) { VOP_UNLOCK(uvp, 0, td); cnp->cn_lkflags = LK_EXCLUSIVE; error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP, ap->a_dvp, ap->a_vpp, cnp, td); cnp->cn_lkflags = lkflags; vrele(uvp); } } UNIONFS_INTERNAL_DEBUG("unionfs_symlink: leave (%d)\n", error); return (error); 
} static int unionfs_readdir(struct vop_readdir_args *ap) { int error; int eofflag; int locked; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct uio *uio; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; int ncookies_bk; u_long *cookies_bk; UNIONFS_INTERNAL_DEBUG("unionfs_readdir: enter\n"); error = 0; eofflag = 0; locked = 0; unp = VTOUNIONFS(ap->a_vp); uio = ap->a_uio; uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = uio->uio_td; ncookies_bk = 0; cookies_bk = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); /* check opaque */ if (uvp != NULLVP && lvp != NULLVP) { if ((error = VOP_GETATTR(uvp, &va, ap->a_cred, td)) != 0) return (error); if (va.va_flags & OPAQUE) lvp = NULLVP; } if (VOP_ISLOCKED(ap->a_vp, td) != LK_EXCLUSIVE) { vn_lock(ap->a_vp, LK_UPGRADE | LK_RETRY, td); locked = 1; } unionfs_get_node_status(unp, curthread, &unsp); if (locked == 1) vn_lock(ap->a_vp, LK_DOWNGRADE | LK_RETRY, td); /* upper only */ if (uvp != NULLVP && lvp == NULLVP) { if (unsp->uns_upper_opencnt <= 0) error = EBADF; else { error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); unsp->uns_readdir_status = 0; } goto unionfs_readdir_exit; } /* lower only */ if (uvp == NULLVP && lvp != NULLVP) { if (unsp->uns_lower_opencnt <= 0) error = EBADF; else { error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); unsp->uns_readdir_status = 2; } goto unionfs_readdir_exit; } /* * readdir upper and lower */ if (unsp->uns_lower_opencnt <= 0 || unsp->uns_upper_opencnt <= 0) { error = EBADF; goto unionfs_readdir_exit; } if (uio->uio_offset == 0) unsp->uns_readdir_status = 0; if (unsp->uns_readdir_status == 0) { /* read upper */ error = VOP_READDIR(uvp, uio, ap->a_cred, &eofflag, ap->a_ncookies, ap->a_cookies); if (error != 0 || eofflag == 0) { UNIONFS_INTERNAL_DEBUG("unionfs_readdir: leave (%d)\n", error); return (error); } unsp->uns_readdir_status = 1; /* * ufs(and other fs) needs size of uio_resid larger than * DIRBLKSIZ. * size of DIRBLKSIZ equals DEV_BSIZE. * (see: ufs/ufs/ufs_vnops.c ufs_readdir func , ufs/ufs/dir.h) */ if (uio->uio_resid <= (uio->uio_resid & (DEV_BSIZE -1))) { UNIONFS_INTERNAL_DEBUG("unionfs_readdir: leave (%d)\n", error); return (0); } /* * backup cookies * It prepares to readdir in lower. 
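* The cookie array produced by the upper readdir (used by callers
* such as the NFS server to resume a directory scan) is stashed in
* cookies_bk/ncookies_bk, so that after the lower readdir it can be
* concatenated with the lower layer's cookies into a single array
* returned to the caller.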
*/ if (ap->a_ncookies != NULL) { ncookies_bk = *(ap->a_ncookies); *(ap->a_ncookies) = 0; } if (ap->a_cookies != NULL) { cookies_bk = *(ap->a_cookies); *(ap->a_cookies) = NULL; } } /* initialize for readdir in lower */ if (unsp->uns_readdir_status == 1) { unsp->uns_readdir_status = 2; uio->uio_offset = 0; } if (lvp == NULLVP) { error = EBADF; goto unionfs_readdir_exit; } /* read lower */ error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); if (cookies_bk != NULL) { /* merge cookies */ int size; u_long *newcookies, *pos; size = *(ap->a_ncookies) + ncookies_bk; newcookies = (u_long *) malloc(size * sizeof(u_long), M_TEMP, M_WAITOK); pos = newcookies; memcpy(pos, cookies_bk, ncookies_bk * sizeof(u_long)); pos += ncookies_bk * sizeof(u_long); memcpy(pos, *(ap->a_cookies), *(ap->a_ncookies) * sizeof(u_long)); free(cookies_bk, M_TEMP); free(*(ap->a_cookies), M_TEMP); *(ap->a_ncookies) = size; *(ap->a_cookies) = newcookies; } unionfs_readdir_exit: UNIONFS_INTERNAL_DEBUG("unionfs_readdir: leave (%d)\n", error); return (error); } static int unionfs_readlink(struct vop_readlink_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; UNIONFS_INTERNAL_DEBUG("unionfs_readlink: enter\n"); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_READLINK(vp, ap->a_uio, ap->a_cred); UNIONFS_INTERNAL_DEBUG("unionfs_readlink: leave (%d)\n", error); return (error); } static int unionfs_getwritemount(struct vop_getwritemount_args *ap) { int error; struct vnode *uvp; struct vnode *vp; UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: enter\n"); error = 0; vp = ap->a_vp; if (vp == NULLVP || (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EACCES); uvp = UNIONFSVPTOUPPERVP(vp); if (uvp == NULLVP && VREG == vp->v_type) uvp = UNIONFSVPTOUPPERVP(VTOUNIONFS(vp)->un_dvp); if (uvp != NULLVP) error = VOP_GETWRITEMOUNT(uvp, ap->a_mpp); else { VI_LOCK(vp); if (vp->v_iflag & VI_FREE) error = EOPNOTSUPP; else error = EACCES; VI_UNLOCK(vp); } UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: leave (%d)\n", error); return (error); } static int unionfs_inactive(struct vop_inactive_args *ap) { struct unionfs_node *unp; unp = VTOUNIONFS(ap->a_vp); if (unp == NULL || !(unp->un_flag & UNIONFS_CACHED)) vgone(ap->a_vp); return (0); } static int unionfs_reclaim(struct vop_reclaim_args *ap) { /* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: enter\n"); */ unionfs_hashrem(ap->a_vp, ap->a_td); /* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: leave\n"); */ return (0); } static int unionfs_print(struct vop_print_args *ap) { struct unionfs_node *unp; /* struct unionfs_node_status *unsp; */ unp = VTOUNIONFS(ap->a_vp); /* unionfs_get_node_status(unp, curthread, &unsp); */ printf("unionfs_vp=%p, uppervp=%p, lowervp=%p\n", ap->a_vp, unp->un_uppervp, unp->un_lowervp); /* printf("unionfs opencnt: uppervp=%d, lowervp=%d\n", unsp->uns_upper_opencnt, unsp->uns_lower_opencnt); */ if (unp->un_uppervp != NULLVP) vprint("unionfs: upper", unp->un_uppervp); if (unp->un_lowervp != NULLVP) vprint("unionfs: lower", unp->un_lowervp); return (0); } static int unionfs_get_llt_revlock(int flags) { int count; flags &= LK_TYPE_MASK; for (count = 0; un_llt[count].lock != 0; count++) { if (flags == un_llt[count].lock) { return un_llt[count].revlock; } } return 0; } static int -unionfs_lock(struct _vop_lock_args *ap) +unionfs_lock(struct vop_lock1_args *ap) { int error; int flags; int revlock; int uhold; struct unionfs_mount *ump; struct unionfs_node *unp; struct vnode *vp; struct vnode 
*uvp; struct vnode *lvp; struct thread *td; error = 0; uhold = 0; flags = ap->a_flags; vp = ap->a_vp; td = ap->a_td; if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK)) return (VOP_UNLOCK(vp, flags, td)); if ((revlock = unionfs_get_llt_revlock(flags)) == 0) panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK); if (!(flags & LK_INTERLOCK)) VI_LOCK(vp); ump = MOUNTTOUNIONFSMOUNT(vp->v_mount); unp = VTOUNIONFS(vp); if (NULL == unp) goto unionfs_lock_null_vnode; lvp = unp->un_lowervp; uvp = unp->un_uppervp; /* * Sometimes, lower or upper is already exclusive locked. * (ex. vfs_domount: mounted vnode is already locked.) */ if ((flags & LK_TYPE_MASK) == LK_EXCLUSIVE && vp == ump->um_rootvp) flags |= LK_CANRECURSE; if (lvp != NULLVP) { VI_LOCK_FLAGS(lvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(lvp); VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_LOCK(lvp, flags, td); VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp == NULL) { if (error == 0) VOP_UNLOCK(lvp, 0, td); VI_UNLOCK(vp); vdrop(lvp); return (vop_stdlock(ap)); } } if (error == 0 && uvp != NULLVP) { VI_LOCK_FLAGS(uvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(uvp); uhold = 1; VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_LOCK(uvp, flags, td); VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp == NULL) { if (error == 0) { VOP_UNLOCK(uvp, 0, td); if (lvp != NULLVP) VOP_UNLOCK(lvp, 0, td); } VI_UNLOCK(vp); if (lvp != NULLVP) vdrop(lvp); vdrop(uvp); return (vop_stdlock(ap)); } if (error != 0 && lvp != NULLVP) vn_lock(lvp, revlock | LK_RETRY, td); } VI_UNLOCK(vp); if (lvp != NULLVP) vdrop(lvp); if (uhold != 0) vdrop(uvp); return (error); unionfs_lock_null_vnode: ap->a_flags |= LK_INTERLOCK; return (vop_stdlock(ap)); } static int unionfs_unlock(struct vop_unlock_args *ap) { int error; int flags; int mtxlkflag; int uhold; struct vnode *vp; struct vnode *lvp; struct vnode *uvp; struct unionfs_node *unp; error = 0; mtxlkflag = 0; uhold = 0; flags = ap->a_flags | LK_RELEASE; vp = ap->a_vp; if (flags & LK_INTERLOCK) mtxlkflag = 1; else if (mtx_owned(VI_MTX(vp)) == 0) { VI_LOCK(vp); mtxlkflag = 2; } unp = VTOUNIONFS(vp); if (unp == NULL) goto unionfs_unlock_null_vnode; lvp = unp->un_lowervp; uvp = unp->un_uppervp; if (lvp != NULLVP) { VI_LOCK_FLAGS(lvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(lvp); VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_UNLOCK(lvp, flags, ap->a_td); VI_LOCK(vp); } if (error == 0 && uvp != NULLVP) { VI_LOCK_FLAGS(uvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(uvp); uhold = 1; VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_UNLOCK(uvp, flags, ap->a_td); VI_LOCK(vp); } VI_UNLOCK(vp); if (lvp != NULLVP) vdrop(lvp); if (uhold != 0) vdrop(uvp); if (mtxlkflag == 0) VI_LOCK(vp); return error; unionfs_unlock_null_vnode: if (mtxlkflag == 2) VI_UNLOCK(vp); return (vop_stdunlock(ap)); } static int unionfs_pathconf(struct vop_pathconf_args *ap) { struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? 
unp->un_uppervp : unp->un_lowervp); return (VOP_PATHCONF(vp, ap->a_name, ap->a_retval)); } static int unionfs_advlock(struct vop_advlock_args *ap) { int error; struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *vp; struct vnode *uvp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_advlock: enter\n"); vp = ap->a_vp; td = curthread; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; if (uvp == NULLVP) { error = unionfs_copyfile(unp, 1, td->td_ucred, td); if (error != 0) goto unionfs_advlock_abort; uvp = unp->un_uppervp; unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt > 0) { /* try reopen the vnode */ error = VOP_OPEN(uvp, unsp->uns_lower_openmode, td->td_ucred, td, unsp->uns_lower_fdidx); if (error) goto unionfs_advlock_abort; unsp->uns_upper_opencnt++; VOP_CLOSE(unp->un_lowervp, unsp->uns_lower_openmode, td->td_ucred, td); unsp->uns_lower_opencnt--; } } VOP_UNLOCK(vp, 0, td); error = VOP_ADVLOCK(uvp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags); UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error); return error; unionfs_advlock_abort: VOP_UNLOCK(vp, 0, td); UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error); return error; } static int unionfs_strategy(struct vop_strategy_args *ap) { struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); #ifdef DIAGNOSTIC if (vp == NULLVP) panic("unionfs_strategy: nullvp"); if (ap->a_bp->b_iocmd == BIO_WRITE && vp == unp->un_lowervp) panic("unionfs_strategy: writing to lowervp"); #endif return (VOP_STRATEGY(vp, ap->a_bp)); } static int unionfs_getacl(struct vop_getacl_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); UNIONFS_INTERNAL_DEBUG("unionfs_getacl: enter\n"); error = VOP_GETACL(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_getacl: leave (%d)\n", error); return (error); } static int unionfs_setacl(struct vop_setacl_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_setacl: enter\n"); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETACL(uvp, ap->a_type, ap->a_aclp, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_setacl: leave (%d)\n", error); return (error); } static int unionfs_aclcheck(struct vop_aclcheck_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: enter\n"); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_ACLCHECK(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: leave (%d)\n", error); return (error); } static int unionfs_openextattr(struct vop_openextattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? 
unp->un_uppervp : unp->un_lowervp); if ((vp == unp->un_uppervp && (unp->un_flag & UNIONFS_OPENEXTU)) || (vp == unp->un_lowervp && (unp->un_flag & UNIONFS_OPENEXTL))) return (EBUSY); error = VOP_OPENEXTATTR(vp, ap->a_cred, ap->a_td); if (error == 0) { if (vp == unp->un_uppervp) unp->un_flag |= UNIONFS_OPENEXTU; else unp->un_flag |= UNIONFS_OPENEXTL; } return (error); } static int unionfs_closeextattr(struct vop_closeextattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) vp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) vp = unp->un_lowervp; if (vp == NULLVP) return (EOPNOTSUPP); error = VOP_CLOSEEXTATTR(vp, ap->a_commit, ap->a_cred, ap->a_td); if (error == 0) { if (vp == unp->un_uppervp) unp->un_flag &= ~UNIONFS_OPENEXTU; else unp->un_flag &= ~UNIONFS_OPENEXTL; } return (error); } static int unionfs_getextattr(struct vop_getextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) vp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) vp = unp->un_lowervp; if (vp == NULLVP) return (EOPNOTSUPP); return (VOP_GETEXTATTR(vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td)); } static int unionfs_setextattr(struct vop_setextattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct vnode *ovp; struct ucred *cred; struct thread *td; error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; ovp = NULLVP; cred = ap->a_cred; td = ap->a_td; UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: enter (un_flag=%x)\n", unp->un_flag); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (unp->un_flag & UNIONFS_OPENEXTU) ovp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) ovp = unp->un_lowervp; if (ovp == NULLVP) return (EOPNOTSUPP); if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && (error = unionfs_copyfile(unp, 1, cred, td)) != 0) { unionfs_setextattr_reopen: if ((unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC panic("unionfs: VOP_OPENEXTATTR failed"); #endif unp->un_flag &= ~UNIONFS_OPENEXTL; } goto unionfs_setextattr_abort; } uvp = unp->un_uppervp; if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0) goto unionfs_setextattr_reopen; unp->un_flag &= ~UNIONFS_OPENEXTL; unp->un_flag |= UNIONFS_OPENEXTU; ovp = uvp; } if (ovp == uvp) error = VOP_SETEXTATTR(ovp, ap->a_attrnamespace, ap->a_name, ap->a_uio, cred, td); unionfs_setextattr_abort: UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: leave (%d)\n", error); return (error); } static int unionfs_listextattr(struct vop_listextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; unp = VTOUNIONFS(ap->a_vp); vp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) vp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) vp = unp->un_lowervp; if (vp == NULLVP) return (EOPNOTSUPP); return (VOP_LISTEXTATTR(vp, ap->a_attrnamespace, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td)); } static int unionfs_deleteextattr(struct vop_deleteextattr_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct vnode *ovp; struct ucred *cred; struct thread *td; error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; ovp = NULLVP; cred = ap->a_cred; td = ap->a_td; UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: 
enter (un_flag=%x)\n", unp->un_flag); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (unp->un_flag & UNIONFS_OPENEXTU) ovp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) ovp = unp->un_lowervp; if (ovp == NULLVP) return (EOPNOTSUPP); if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && (error = unionfs_copyfile(unp, 1, cred, td)) != 0) { unionfs_deleteextattr_reopen: if ((unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC panic("unionfs: VOP_OPENEXTATTR failed"); #endif unp->un_flag &= ~UNIONFS_OPENEXTL; } goto unionfs_deleteextattr_abort; } uvp = unp->un_uppervp; if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0) goto unionfs_deleteextattr_reopen; unp->un_flag &= ~UNIONFS_OPENEXTL; unp->un_flag |= UNIONFS_OPENEXTU; ovp = uvp; } if (ovp == uvp) error = VOP_DELETEEXTATTR(ovp, ap->a_attrnamespace, ap->a_name, ap->a_cred, ap->a_td); unionfs_deleteextattr_abort: UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: leave (%d)\n", error); return (error); } static int unionfs_setlabel(struct vop_setlabel_args *ap) { int error; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: enter\n"); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETLABEL(uvp, ap->a_label, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: leave (%d)\n", error); return (error); } static int unionfs_vptofh(struct vop_vptofh_args *ap) { return (EOPNOTSUPP); } struct vop_vector unionfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = unionfs_access, .vop_aclcheck = unionfs_aclcheck, .vop_advlock = unionfs_advlock, .vop_bmap = VOP_EOPNOTSUPP, .vop_close = unionfs_close, .vop_closeextattr = unionfs_closeextattr, .vop_create = unionfs_create, .vop_deleteextattr = unionfs_deleteextattr, .vop_fsync = unionfs_fsync, .vop_getacl = unionfs_getacl, .vop_getattr = unionfs_getattr, .vop_getextattr = unionfs_getextattr, .vop_getwritemount = unionfs_getwritemount, .vop_inactive = unionfs_inactive, .vop_ioctl = unionfs_ioctl, .vop_lease = unionfs_lease, .vop_link = unionfs_link, .vop_listextattr = unionfs_listextattr, - ._vop_lock = unionfs_lock, + .vop_lock1 = unionfs_lock, .vop_lookup = unionfs_lookup, .vop_mkdir = unionfs_mkdir, .vop_mknod = unionfs_mknod, .vop_open = unionfs_open, .vop_openextattr = unionfs_openextattr, .vop_pathconf = unionfs_pathconf, .vop_poll = unionfs_poll, .vop_print = unionfs_print, .vop_read = unionfs_read, .vop_readdir = unionfs_readdir, .vop_readlink = unionfs_readlink, .vop_reclaim = unionfs_reclaim, .vop_remove = unionfs_remove, .vop_rename = unionfs_rename, .vop_rmdir = unionfs_rmdir, .vop_setacl = unionfs_setacl, .vop_setattr = unionfs_setattr, .vop_setextattr = unionfs_setextattr, .vop_setlabel = unionfs_setlabel, .vop_strategy = unionfs_strategy, .vop_symlink = unionfs_symlink, .vop_unlock = unionfs_unlock, .vop_whiteout = unionfs_whiteout, .vop_write = unionfs_write, .vop_vptofh = unionfs_vptofh, }; Index: head/sys/kern/vfs_default.c =================================================================== --- head/sys/kern/vfs_default.c (revision 169670) +++ head/sys/kern/vfs_default.c (revision 169671) @@ -1,677 +1,677 @@ 
/*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed * to Berkeley by John Heidemann of the UCLA Ficus project. * * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vop_nolookup(struct vop_lookup_args *); static int vop_nostrategy(struct vop_strategy_args *); /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. * * If there is no specific entry here, we will return EOPNOTSUPP. * */ struct vop_vector default_vnodeops = { .vop_default = NULL, .vop_bypass = VOP_EOPNOTSUPP, .vop_advlock = VOP_EINVAL, .vop_bmap = vop_stdbmap, .vop_close = VOP_NULL, .vop_fsync = VOP_NULL, .vop_getpages = vop_stdgetpages, .vop_getwritemount = vop_stdgetwritemount, .vop_inactive = VOP_NULL, .vop_ioctl = VOP_ENOTTY, .vop_kqfilter = vop_stdkqfilter, .vop_islocked = vop_stdislocked, .vop_lease = VOP_NULL, - ._vop_lock = vop_stdlock, + .vop_lock1 = vop_stdlock, .vop_lookup = vop_nolookup, .vop_open = VOP_NULL, .vop_pathconf = VOP_EINVAL, .vop_poll = vop_nopoll, .vop_putpages = vop_stdputpages, .vop_readlink = VOP_EINVAL, .vop_revoke = VOP_PANIC, .vop_strategy = vop_nostrategy, .vop_unlock = vop_stdunlock, .vop_vptofh = vop_stdvptofh, }; /* * Series of placeholder functions for various error returns for * VOPs. */ int vop_eopnotsupp(struct vop_generic_args *ap) { /* printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); */ return (EOPNOTSUPP); } int vop_ebadf(struct vop_generic_args *ap) { return (EBADF); } int vop_enotty(struct vop_generic_args *ap) { return (ENOTTY); } int vop_einval(struct vop_generic_args *ap) { return (EINVAL); } int vop_null(struct vop_generic_args *ap) { return (0); } /* * Helper function to panic on some bad VOPs in some filesystems. 
*/ int vop_panic(struct vop_generic_args *ap) { panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); } /* * vop_std and vop_no are default functions for use by * filesystems that need the "default reasonable" implementation for a * particular operation. * * The documentation for the operations they implement exists (if it exists) * in the VOP_(9) manpage (all uppercase). */ /* * Default vop for filesystems that do not support name lookup */ static int vop_nolookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * vop_nostrategy: * * Strategy routine for VFS devices that have none. * * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy * routine. Typically this is done for a BIO_READ strategy call. * Typically B_INVAL is assumed to already be clear prior to a write * and should not be cleared manually unless you just made the buffer * invalid. BIO_ERROR should be cleared either way. */ static int vop_nostrategy (struct vop_strategy_args *ap) { printf("No strategy for buffer at %p\n", ap->a_bp); vprint("vnode", ap->a_vp); ap->a_bp->b_ioflags |= BIO_ERROR; ap->a_bp->b_error = EOPNOTSUPP; bufdone(ap->a_bp); return (EOPNOTSUPP); } /* * vop_stdpathconf: * * Standard implementation of POSIX pathconf, to get information about limits * for a filesystem. * Override per filesystem for the case where the filesystem has smaller * limits. */ int vop_stdpathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_MAX_CANON: *ap->a_retval = MAX_CANON; return (0); case _PC_MAX_INPUT: *ap->a_retval = MAX_INPUT; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Standard lock, unlock and islocked functions. */ int vop_stdlock(ap) - struct _vop_lock_args /* { + struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; char *file; int line; } */ *ap; { struct vnode *vp = ap->a_vp; return (_lockmgr(vp->v_vnlock, ap->a_flags, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line)); } /* See above. */ int vop_stdunlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; return (lockmgr(vp->v_vnlock, ap->a_flags | LK_RELEASE, VI_MTX(vp), ap->a_td)); } /* See above. */ int vop_stdislocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { return (lockstatus(ap->a_vp->v_vnlock, ap->a_td)); } /* * Return true for select/poll. */ int vop_nopoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { /* * Return true for read/write. If the user asked for something * special, return POLLNVAL, so that clients have a way of * determining reliably whether or not the extended * functionality is present without hard-coding knowledge * of specific filesystem implementations. * Stay in sync with kern_conf.c::no_poll(). 
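/*
 * Illustrative sketch (invented names, userspace pthreads): vop_stdlock()
 * and vop_stdunlock() above operate on vp->v_vnlock, a pointer that
 * normally refers to the vnode's own embedded lock but that a stacking
 * filesystem may point at another vnode's lock so both layers share one
 * lock.  A minimal analogue of that indirection:
 */
#include <pthread.h>
#include <stdio.h>

struct demo_vnode {
	pthread_mutex_t	 dv_lock;	/* embedded lock (like v_lock) */
	pthread_mutex_t	*dv_vnlock;	/* active lock (like v_vnlock) */
	const char	*dv_name;
};

static void
demo_vnode_init(struct demo_vnode *vp, const char *name)
{
	pthread_mutex_init(&vp->dv_lock, NULL);
	vp->dv_vnlock = &vp->dv_lock;	/* default: use our own lock */
	vp->dv_name = name;
}

/* Analogue of vop_stdlock()/vop_stdunlock(): always go through the pointer. */
static void
demo_stdlock(struct demo_vnode *vp)
{
	pthread_mutex_lock(vp->dv_vnlock);
}

static void
demo_stdunlock(struct demo_vnode *vp)
{
	pthread_mutex_unlock(vp->dv_vnlock);
}

int
main(void)
{
	struct demo_vnode lower, upper;

	demo_vnode_init(&lower, "lower");
	demo_vnode_init(&upper, "upper");

	/* A stacked layer can share the lower vnode's lock. */
	upper.dv_vnlock = lower.dv_vnlock;

	demo_stdlock(&upper);	/* actually acquires lower's lock */
	printf("%s locked via %s's lock\n", upper.dv_name, lower.dv_name);
	demo_stdunlock(&upper);
	return (0);
}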
*/ if (ap->a_events & ~POLLSTANDARD) return (POLLNVAL); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Implement poll for local filesystems that support it. */ int vop_stdpoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { if (ap->a_events & ~POLLSTANDARD) return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Return our mount point, as we will take charge of the writes. */ int vop_stdgetwritemount(ap) struct vop_getwritemount_args /* { struct vnode *a_vp; struct mount **a_mpp; } */ *ap; { struct mount *mp; /* * XXX Since this is called unlocked we may be recycled while * attempting to ref the mount. If this is the case or mountpoint * will be set to NULL. We only have to prevent this call from * returning with a ref to an incorrect mountpoint. It is not * harmful to return with a ref to our previous mountpoint. */ mp = ap->a_vp->v_mount; if (mp != NULL) { vfs_ref(mp); if (mp != ap->a_vp->v_mount) { vfs_rel(mp); mp = NULL; } } *(ap->a_mpp) = mp; return (0); } /* XXX Needs good comment and VOP_BMAP(9) manpage */ int vop_stdbmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } int vop_stdfsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct buf *bp; struct bufobj *bo; struct buf *nbp; int error = 0; int maxretry = 1000; /* large, arbitrarily chosen */ VI_LOCK(vp); loop1: /* * MARK/SCAN initialization to avoid infinite loops. */ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { bp->b_vflags &= ~BV_SCANNED; bp->b_error = 0; } /* * Flush all dirty buffers associated with a vnode. */ loop2: TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; VI_UNLOCK(vp); KASSERT(bp->b_bufobj == &vp->v_bufobj, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, &vp->v_bufobj)); if ((bp->b_flags & B_DELWRI) == 0) panic("fsync: not dirty"); if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { vfs_bio_awrite(bp); } else { bremfree(bp); bawrite(bp); } VI_LOCK(vp); goto loop2; } /* * If synchronous the caller expects us to completely resolve all * dirty buffers in the system. Wait for in-progress I/O to * complete (which could include background bitmap writes), then * retry if dirty blocks still exist. */ if (ap->a_waitfor == MNT_WAIT) { bo = &vp->v_bufobj; bufobj_wwait(bo, 0, 0); if (bo->bo_dirty.bv_cnt > 0) { /* * If we are unable to write any of these buffers * then we fail now rather than trying endlessly * to write them out. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) if ((error = bp->b_error) == 0) continue; if (error == 0 && --maxretry >= 0) goto loop1; error = EAGAIN; } } VI_UNLOCK(vp); if (error == EAGAIN) vprint("fsync: giving up on dirty", vp); return (error); } /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). 
*/ int vop_stdgetpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; vm_ooffset_t a_offset; } */ *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } int vop_stdkqfilter(struct vop_kqfilter_args *ap) { return vfs_kqfilter(ap); } /* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */ int vop_stdputpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; } */ *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } int vop_stdvptofh(struct vop_vptofh_args *ap) { return (EOPNOTSUPP); } /* * vfs default ops * used to fill the vfs function table to get reasonable default return values. */ int vfs_stdroot (mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { return (EOPNOTSUPP); } int vfs_stdstatfs (mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { return (EOPNOTSUPP); } int vfs_stdquotactl (mp, cmds, uid, arg, td) struct mount *mp; int cmds; uid_t uid; void *arg; struct thread *td; { return (EOPNOTSUPP); } int vfs_stdsync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { struct vnode *vp, *mvp; int error, lockreq, allerror = 0; lockreq = LK_EXCLUSIVE | LK_INTERLOCK; if (waitfor != MNT_WAIT) lockreq |= LK_NOWAIT; /* * Force stale buffer cache information to be flushed. */ MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); if ((error = vget(vp, lockreq, td)) != 0) { MNT_ILOCK(mp); if (error == ENOENT) { MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; /* Do not turn this into vput. td is not always curthread. */ VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (allerror); } int vfs_stdnosync (mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { return (0); } int vfs_stdvget (mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdfhtovp (mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdinit (vfsp) struct vfsconf *vfsp; { return (0); } int vfs_stduninit (vfsp) struct vfsconf *vfsp; { return(0); } int vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td) struct mount *mp; int cmd; struct vnode *filename_vp; int attrnamespace; const char *attrname; struct thread *td; { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0, td); return (EOPNOTSUPP); } int vfs_stdsysctl(mp, op, req) struct mount *mp; fsctlop_t op; struct sysctl_req *req; { return (EOPNOTSUPP); } /* end of vfs default ops */ Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 169670) +++ head/sys/kern/vfs_subr.c (revision 169671) @@ -1,3967 +1,3967 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure"); static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void vbusy(struct vnode *vp); static void vinactive(struct vnode *, struct thread *); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void v_upgrade_usecount(struct vnode *); static void vfree(struct vnode *); static void vnlru_free(int); static void vdestroy(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static int vfs_knllocked(void *arg); /* * Enable Giant pushdown based on whether or not the vm is mpsafe in this * build. Without mpsafevm the buffer cache can not run Giant free. */ int mpsafe_vfs = 1; TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs); SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0, "MPSAFE VFS"); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed * vnode. */ static unsigned long numvnodes; SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. 
*/ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. */ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. 
* Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* * Macros to control when a vnode is freed and recycled. All require * the vnode interlock. */ #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt) /* * Initialize the vnode management data structures. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX 100000 #endif static void vntblinit(void *dummy __unused) { /* * Desiredvnodes is a function of the physical memory size and * the kernel's heap size. Specifically, desiredvnodes scales * in proportion to the physical memory size until two fifths * of the kernel's heap size is consumed by vnodes and vm * objects. 
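/*
 * Worked example (all input numbers are made up for illustration) of the
 * desiredvnodes sizing described in the comment above and computed just
 * below in vntblinit(): the smaller of a process/page based bound and the
 * bound that keeps vnodes plus VM objects to two fifths of the kernel
 * heap, clamped to MAXVNODES_MAX, with wantfreevnodes set to a quarter
 * of the result.
 */
#include <stdio.h>

#define	MAXVNODES_MAX	100000

static long
lmin(long a, long b)
{
	return (a < b ? a : b);
}

int
main(void)
{
	long maxproc = 6164;		/* assumed value */
	long page_count = 262144;	/* assumed: 1 GB of 4 KB pages */
	long vm_kmem_size = 400L << 20;	/* assumed: 400 MB kernel heap */
	long per_vnode_cost = 648;	/* assumed sizeof(vm_object)+sizeof(vnode) */
	long desiredvnodes, wantfreevnodes;

	desiredvnodes = lmin(maxproc + page_count / 4,
	    2 * vm_kmem_size / (5 * per_vnode_cost));
	if (desiredvnodes > MAXVNODES_MAX)
		desiredvnodes = MAXVNODES_MAX;
	wantfreevnodes = desiredvnodes / 4;

	printf("desiredvnodes=%ld wantfreevnodes=%ld\n",
	    desiredvnodes, wantfreevnodes);
	return (0);
}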
*/ desiredvnodes = min(maxproc + VMCNT_GET(page_count) / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(struct mount *mp, int flags, struct mtx *interlkp, struct thread *td) { int lkflags; MNT_ILOCK(mp); MNT_REF(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); mp->mnt_kern_flag |= MNTK_MWAIT; /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0); MNT_REL(mp); MNT_IUNLOCK(mp); if (interlkp) mtx_lock(interlkp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); lkflags = LK_SHARED | LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp, struct thread *td) { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); vfs_rel(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; /* * If the thread is jailed, but this is not a jail-friendly file * system, deny immediately. */ if (jailed(td->td_ucred) && !(mp->mnt_vfc->vfc_flags & VFCF_JAIL)) return (EPERM); /* * If the file system was mounted outside a jail and a jailed thread * tries to access it, deny immediately. */ if (!jailed(mp->mnt_cred) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted inside different jail that the jail of * the calling thread, deny immediately. */ if (jailed(mp->mnt_cred) && jailed(td->td_ucred) && mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) { return (EPERM); } if ((mp->mnt_flag & MNT_USER) == 0 || mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). 
Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static u_int16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct thread *td; struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. 
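/*
 * Illustrative userspace sketch of the vfs.timestamp_precision policy
 * implemented by vfs_timestamp() above: the kernel uses time_second,
 * getnanotime(), microtime() and nanotime(); here a single clock_gettime()
 * stands in for all of them, so the TSP_HZ case is only approximated.
 */
#include <stdio.h>
#include <time.h>

enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static void
demo_timestamp(struct timespec *tsp, int precision)
{
	clock_gettime(CLOCK_REALTIME, tsp);
	switch (precision) {
	case TSP_SEC:
		tsp->tv_nsec = 0;			/* seconds only */
		break;
	case TSP_USEC:
		tsp->tv_nsec -= tsp->tv_nsec % 1000;	/* truncate to usec */
		break;
	case TSP_HZ:					/* approximated here */
	case TSP_NSEC:
	default:
		break;					/* full precision */
	}
}

int
main(void)
{
	struct timespec ts;
	int p;

	for (p = TSP_SEC; p <= TSP_NSEC; p++) {
		demo_timestamp(&ts, p);
		printf("precision %d: %lld.%09ld\n", p,
		    (long long)ts.tv_sec, ts.tv_nsec);
	}
	return (0);
}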
*/ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; trigger = VMCNT_GET(page_count) * 2 / usevnodes; done = 0; td = curthread; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT, td)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK, td); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); vgonel(vp); VOP_UNLOCK(vp, 0, td); vdropl(vp); done++; next_iter_mntunlocked: if ((count % 256) != 0) goto relock_mnt; goto yield; next_iter: if ((count % 256) != 0) continue; MNT_IUNLOCK(mp); yield: uio_yield(); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; int vfslocked; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if we can't get the interlock. */ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); continue; } VNASSERT(VCANRECYCLE(vp), vp, ("vp inconsistent on freelist")); freevnodes--; vp->v_iflag &= ~VI_FREE; vholdl(vp); mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vtryrecycle(vp); VFS_UNLOCK_GIANT(vfslocked); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. 
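/*
 * Illustrative sketch of the scan-and-rotate pattern used by vlrureclaim()
 * above: examine about a tenth of the per-mount vnode list, move each
 * examined entry to the tail so successive passes cycle through the whole
 * list, and only reclaim entries that look idle.  Names and the "idle"
 * test are invented, and there is no locking here.
 */
#include <stdio.h>
#include <sys/queue.h>

struct demo_vnode {
	int	usecount;
	int	resident_pages;
	TAILQ_ENTRY(demo_vnode) link;
};

TAILQ_HEAD(demo_list, demo_vnode);

static int
demo_reclaim_pass(struct demo_list *list, int listsize, int trigger)
{
	struct demo_vnode *vp;
	int count, done;

	done = 0;
	for (count = listsize / 10 + 1; count != 0; count--) {
		vp = TAILQ_FIRST(list);
		if (vp == NULL)
			break;
		/* Rotate: whatever happens, this entry is revisited last. */
		TAILQ_REMOVE(list, vp, link);
		TAILQ_INSERT_TAIL(list, vp, link);
		/* Skip busy vnodes and ones caching many resident pages. */
		if (vp->usecount != 0 || vp->resident_pages > trigger)
			continue;
		done++;		/* the real code would vgone() it here */
	}
	return (done);
}

int
main(void)
{
	struct demo_list list;
	struct demo_vnode vnodes[30];
	int i;

	TAILQ_INIT(&list);
	for (i = 0; i < 30; i++) {
		vnodes[i].usecount = (i % 3 == 0);	/* every third is busy */
		vnodes[i].resident_pages = i;
		TAILQ_INSERT_TAIL(&list, &vnodes[i], link);
	}
	printf("reclaimed %d of %d examined vnodes\n",
	    demo_reclaim_pass(&list, 30, 20), 30 / 10 + 1);
	return (0);
}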
*/ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done; struct proc *p = vnlruproc; struct thread *td = FIRST_THREAD_IN_PROC(p); mtx_lock(&Giant); EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); for (;;) { kthread_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { int vfsunlocked; if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (!VFS_NEEDSGIANT(mp)) { mtx_unlock(&Giant); vfsunlocked = 1; } else vfsunlocked = 0; done += vlrureclaim(mp); if (vfsunlocked) mtx_lock(&Giant); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (done == 0) { EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10); #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else uio_yield(); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) /* * Routines having to do with the management of the vnode table. */ static void vdestroy(struct vnode *vp) { struct bufobj *bo; CTR1(KTR_VFS, "vdestroy vp %p", vp); mtx_lock(&vnode_free_list_mtx); numvnodes--; mtx_unlock(&vnode_free_list_mtx); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VI_UNLOCK(vp); #ifdef MAC mac_destroy_vnode(vp); #endif if (vp->v_pollinfo != NULL) { knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note); mtx_destroy(&vp->v_pollinfo->vpi_lock); uma_zfree(vnodepoll_zone, vp->v_pollinfo); } #ifdef INVARIANTS /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); uma_zfree(vnode_zone, vp); } /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). 
*/ static int vtryrecycle(struct vnode *vp) { struct thread *td = curthread; struct mount *vnmp; CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0) return (EWOULDBLOCK); /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0, td); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) vgonel(vp); VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp); return (0); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp = NULL; struct bufobj *bo; mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); /* * Wait for available vnodes. */ if (numvnodes > desiredvnodes) { if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) { /* * File system is beeing suspended, we cannot risk a * deadlock here, so allocate new vnode anyway. */ if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); goto alloc; } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (numvnodes > desiredvnodes) { mtx_unlock(&vnode_free_list_mtx); return (ENFILE); } #endif } alloc: numvnodes++; mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; bo->bo_mtx = &vp->v_interlock; bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = 0; #ifdef MAC mac_init_vnode(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_associate_vnode_singlelabel(mp, vp); else if (mp == NULL) printf("NULL mp in getnewvnode()\n"); #endif if (mp != NULL) { bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp); *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. 
*/ static void delmntque(struct vnode *vp) { struct mount *mp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { struct thread *td; td = curthread; /* XXX ? */ vp->v_data = NULL; vp->v_op = &dead_vnodeops; /* XXX non mp-safe fs may still call insmntque with vnode unlocked */ if (!VOP_ISLOCKED(vp, td)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && mp->mnt_nvnodelistsize == 0) { MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_UNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); VM_OBJECT_UNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. 
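/*
 * Illustrative, slightly simplified sketch (invented names, a pthread
 * mutex instead of the mount interlock) of the insmntque1() pattern above:
 * new objects are linked to their container under its lock, but once
 * teardown has started the insertion is refused and the caller-supplied
 * destructor cleans up the half-constructed object.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct demo_mount {
	pthread_mutex_t	lock;
	int		unmounting;
	int		nvnodes;
};

static int
demo_insmntque(struct demo_mount *mp, void *vp, void (*dtr)(void *))
{
	pthread_mutex_lock(&mp->lock);
	if (mp->unmounting) {
		pthread_mutex_unlock(&mp->lock);
		if (dtr != NULL)
			dtr(vp);	/* undo the partial setup */
		return (EBUSY);
	}
	mp->nvnodes++;			/* a real version also links vp */
	pthread_mutex_unlock(&mp->lock);
	return (0);
}

static void
demo_dtr(void *vp)
{
	printf("destroying rejected vnode %p\n", vp);
}

int
main(void)
{
	struct demo_mount mp = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
	int dummy;

	printf("insert -> %d\n", demo_insmntque(&mp, &dummy, demo_dtr));
	mp.unmounting = 1;
	printf("insert during unmount -> %d\n",
	    demo_insmntque(&mp, &dummy, demo_dtr));
	return (0);
}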
*/ int vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo) { CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_LOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length); /* * Round up to the *next* lbn. 
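/*
 * Worked example of the "round up to the next lbn" step that vtruncbuf()
 * performs just below: the first logical block lying entirely beyond the
 * new length is obtained with a ceiling division.
 */
#include <stdio.h>

static long
demo_trunclbn(long length, long blksize)
{
	return ((length + blksize - 1) / blksize);
}

int
main(void)
{
	/* With 16 KB blocks: length 0 -> lbn 0, 1 -> 1, 16384 -> 1, 16385 -> 2. */
	long lengths[] = { 0, 1, 16384, 16385 };
	size_t i;

	for (i = 0; i < sizeof(lengths) / sizeof(lengths[0]); i++)
		printf("length %ld -> trunclbn %ld\n", lengths[i],
		    demo_trunclbn(lengths[i], 16384));
	return (0);
}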
*/ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: VI_LOCK(vp); bo = &vp->v_bufobj; anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } VI_LOCK(vp); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } VI_LOCK(vp); } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); VI_LOCK(vp); goto restartsync; } } bufobj_wwait(bo, 0, 0); VI_UNLOCK(vp); vnode_pager_setsize(vp, length); return (0); } /* * buf_splay() - splay tree core for the clean/dirty list of buffers in * a vnode. * * NOTE: We have to deal with the special case of a background bitmap * buffer, a situation where two buffers will have the same logical * block offset. We want (1) only the foreground buffer to be accessed * in a lookup and (2) must differentiate between the foreground and * background buffer in the splay tree algorithm because the splay * tree cannot normally handle multiple entities with the same 'index'. * We accomplish this by adding differentiating flags to the splay tree's * numerical domain. */ static struct buf * buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) { struct buf dummy; struct buf *lefttreemax, *righttreemin, *y; if (root == NULL) return (NULL); lefttreemax = righttreemin = &dummy; for (;;) { if (lblkno < root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_left) == NULL) break; if (lblkno < y->b_lblkno) { /* Rotate right. */ root->b_left = y->b_right; y->b_right = root; root = y; if ((y = root->b_left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->b_left = root; righttreemin = root; } else if (lblkno > root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_right) == NULL) break; if (lblkno > y->b_lblkno) { /* Rotate left. */ root->b_right = y->b_left; y->b_left = root; root = y; if ((y = root->b_right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->b_right = root; lefttreemax = root; } else { break; } root = y; } /* Assemble the new root. 
*/ lefttreemax->b_right = root->b_left; righttreemin->b_left = root->b_right; root->b_left = dummy.b_right; root->b_right = dummy.b_left; return (root); } static void buf_vlist_remove(struct buf *bp) { struct buf *root; struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_LOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; if (bp != bv->bv_root) { root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); KASSERT(root == bp, ("splay lookup failed in remove")); } if (bp->b_left == NULL) { root = bp->b_right; } else { root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left); root->b_right = bp->b_right; } bv->bv_root = root; TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list using a * splay tree algorithm. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct buf *root; struct bufv *bv; ASSERT_BO_LOCKED(bo); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); if (root == NULL) { bp->b_left = NULL; bp->b_right = NULL; TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); } else if (bp->b_lblkno < root->b_lblkno || (bp->b_lblkno == root->b_lblkno && (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { bp->b_left = root->b_left; bp->b_right = root; root->b_left = NULL; TAILQ_INSERT_BEFORE(root, bp, b_bobufs); } else { bp->b_right = root->b_right; bp->b_left = root; root->b_right = NULL; TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs); } bv->bv_cnt++; bv->bv_root = bp; } /* * Lookup a buffer using the splay tree. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in. * * During a "make buildworld" the desired buffer is found at one of * the roots more than 60% of the time. Thus, checking both roots * before performing either splay eliminates unnecessary splays on the * first tree splayed. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); if ((bp = bo->bo_clean.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_dirty.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_clean.bv_root) != NULL) { bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } if ((bp = bo->bo_dirty.bv_root) != NULL) { bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } return (NULL); } /* * Associate a buffer with a vnode. 
*/ void bgetvp(struct vnode *vp, struct buf *bp) { VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); ASSERT_VI_LOCKED(vp, "bgetvp"); vholdl(vp); if (VFS_NEEDSGIANT(vp->v_mount) || vp->v_bufobj.bo_flag & BO_NEEDSGIANT) bp->b_flags |= B_NEEDSGIANT; bp->b_vp = vp; bp->b_bufobj = &vp->v_bufobj; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_flags &= ~B_NEEDSGIANT; bp->b_vp = NULL; bp->b_bufobj = NULL; vdropl(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_LOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) static int sync_vnode(struct bufobj *bo, struct thread *td) { struct vnode *vp; struct mount *mp; vp = bo->__bo_vnode; /* XXX */ if (VOP_ISLOCKED(vp, NULL) != 0) return (1); if (VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (1); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VI_LOCK(vp); if ((bo->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(bo, syncdelay); } vdropl(vp); mtx_lock(&sync_mtx); return (0); } /* * System filesystem synchronizer daemon. 
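/*
 * Illustrative sketch of the syncer delay wheel used by
 * vn_syncer_add_to_worklist() above and drained by sched_sync() below:
 * work scheduled "delay" ticks in the future goes into slot
 * (delayno + delay) masked into a power-of-two ring, and each tick
 * processes one slot.  The real code keeps per-slot lists of bufobjs and
 * holds sync_mtx; here each slot is just a counter.
 */
#include <stdio.h>

#define	DEMO_MAXDELAY	32			/* must be a power of two */
#define	DEMO_MASK	(DEMO_MAXDELAY - 1)

static int demo_slots[DEMO_MAXDELAY];
static int demo_delayno;

static void
demo_add_to_worklist(int delay)
{
	if (delay > DEMO_MAXDELAY - 2)
		delay = DEMO_MAXDELAY - 2;
	demo_slots[(demo_delayno + delay) & DEMO_MASK]++;
}

static void
demo_tick(void)
{
	int slot = demo_delayno;

	demo_delayno = (demo_delayno + 1) & DEMO_MASK;
	if (demo_slots[slot] != 0)
		printf("tick %d: processing %d item(s)\n", slot,
		    demo_slots[slot]);
	demo_slots[slot] = 0;
}

int
main(void)
{
	int i;

	demo_add_to_worklist(3);	/* due in three ticks */
	demo_add_to_worklist(15);	/* due in fifteen ticks */
	for (i = 0; i < DEMO_MAXDELAY; i++)
		demo_tick();
	return (0);
}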
*/ static void sched_sync(void) { struct synclist *next; struct synclist *slp; struct bufobj *bo; long starttime; struct thread *td = FIRST_THREAD_IN_PROC(updateproc); static int dummychan; int last_work_seen; int net_worklist_len; int syncer_final_iter; int first_printf; int error; mtx_lock(&Giant); last_work_seen = 0; syncer_final_iter = 0; first_printf = 1; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kthread_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while ((bo = LIST_FIRST(slp)) != NULL) { error = sync_vnode(bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING) msleep(&dummychan, &sync_mtx, PPAUSE, "syncfnl", hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) msleep(&lbolt, &sync_mtx, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. 
* We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { struct thread *td; int ret = 0; td = FIRST_THREAD_IN_PROC(updateproc); sleepq_remove(td, &lbolt); mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { struct thread *td; if (howto & RB_NOSYNC) return; td = FIRST_THREAD_IN_PROC(updateproc); sleepq_remove(td, &lbolt); mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); kproc_shutdown(arg, howto); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ VI_LOCK(vp); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif VI_UNLOCK(vp); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. */ static void v_incr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } vholdl(vp); } /* * Turn a holdcnt into a use+holdcnt such that only one call to * v_decr_usecount is needed. 
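 *
 * In other words, vget() first takes a bare hold (so the vnode cannot be
 * recycled while it sleeps on the vnode lock) and only afterwards converts
 * that hold into a full use reference.  A toy model of the pairing, with
 * hypothetical names (this is not the kernel code):
 *
struct toyvnode {
        int     usecount;
        int     holdcnt;
};

static void
toy_hold(struct toyvnode *vp)                   // cheap: only pins the memory
{
        vp->holdcnt++;
}

static void
toy_upgrade_usecount(struct toyvnode *vp)       // keeps the hold, adds a use
{
        vp->usecount++;
}

static void
toy_decr_usecount(struct toyvnode *vp)          // one call undoes both steps
{
        vp->usecount--;
        vp->holdcnt--;                          // the real code drops this via vdropl()
}
 *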
*/ static void v_upgrade_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. */ static void v_decr_useonly(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set if the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new filesystem type). */ int vget(struct vnode *vp, int flags, struct thread *td) { int oweinact; int oldflags; int error; error = 0; oldflags = flags; oweinact = 0; VFS_ASSERT_GIANT(vp->v_mount); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); /* * If the inactive call was deferred because vput() was called * with a shared lock, we have to do it here before another thread * gets a reference to data that should be dead. */ if (vp->v_iflag & VI_OWEINACT) { if (flags & LK_NOWAIT) { VI_UNLOCK(vp); return (EBUSY); } flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; oweinact = 1; } vholdl(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { vdrop(vp); return (error); } VI_LOCK(vp); /* Upgrade our holdcnt to a usecount. */ v_upgrade_usecount(vp); if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); if (oweinact) { if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VI_UNLOCK(vp); if ((oldflags & LK_TYPE_MASK) == 0) VOP_UNLOCK(vp, 0, td); } else VI_UNLOCK(vp); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. 
*/ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { struct thread *td = curthread; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vrele: missed vn_close")); if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif VI_UNLOCK(vp); panic("vrele: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ vp->v_iflag |= VI_OWEINACT; if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0, td); } else { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; } vdropl(vp); } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() aquires the lock internally.) */ void vput(struct vnode *vp) { struct thread *td = curthread; /* XXX */ int error; KASSERT(vp != NULL, ("vput: null vp")); ASSERT_VOP_LOCKED(vp, "vput"); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vput: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { VOP_UNLOCK(vp, 0, td); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); vp->v_iflag |= VI_OWEINACT; if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td); VI_LOCK(vp); if (error) { if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; goto done; } } if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0, td); done: vdropl(vp); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } void vholdl(struct vnode *vp) { vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we will free it if it has been vgone'd otherwise it is * placed on the free list. 
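 *
 * A minimal model of that decision, with invented names (illustration only,
 * not the kernel code):
 *
#include <assert.h>
#include <stdbool.h>

struct demo_vnode {
        int     holdcnt;
        bool    doomed;                 // set by the equivalent of vgone()
        bool    on_freelist;
};

static void
demo_drop(struct demo_vnode *vp)
{
        assert(vp->holdcnt > 0);
        if (--vp->holdcnt > 0)
                return;                 // somebody else still holds it
        if (vp->doomed)
                ;                       // real code: vdestroy() frees the vnode
        else
                vp->on_freelist = true; // real code: vfree() queues it for reuse
}
 *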
*/ void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, "vdropl"); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; if (vp->v_holdcnt == 0) { if (vp->v_iflag & VI_DOOMED) { vdestroy(vp); return; } else vfree(vp); } VI_UNLOCK(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ static void vinactive(struct vnode *vp, struct thread *td) { ASSERT_VOP_LOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush( struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR1(KTR_VFS, "vflush: mp %p", mp); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0) return (error); vput(rootvp); } MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); vholdl(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td); if (error) { vdrop(vp); MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0, td); vdrop(vp); MNT_ILOCK(mp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. 
*/ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td); vgone(rootvp); VOP_UNLOCK(rootvp, 0, td); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) return (EBUSY); for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp, struct thread *td) { int recycled; ASSERT_VOP_LOCKED(vp, "vrecycle"); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; CTR1(KTR_VFS, "vgonel: vp %p", vp); ASSERT_VOP_LOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0) vinvalbuf(vp, 0, td, 0, 0); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. 
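 *
 * The flag dump in vn_printf() below uses a small idiom worth noting: every
 * matching flag is appended as "|NAME" and the result is printed starting at
 * buf + 1, which silently discards the leading '|'.  A self-contained sketch
 * of the trick (the flag names here are invented):
 *
#include <stdio.h>
#include <string.h>

#define F_ROOT          0x01
#define F_TEXT          0x02
#define F_SYSTEM        0x04

static void
print_flags(int flags)
{
        char buf[64];

        buf[0] = '\0';
        buf[1] = '\0';                  // keeps buf + 1 valid when no flag is set
        if (flags & F_ROOT)
                strcat(buf, "|ROOT");
        if (flags & F_TEXT)
                strcat(buf, "|TEXT");
        if (flags & F_SYSTEM)
                strcat(buf, "|SYSTEM");
        printf("flags (%s)\n", buf + 1);
}

int
main(void)
{
        print_flags(F_ROOT | F_SYSTEM); // prints: flags (ROOT|SYSTEM)
        return (0);
}
 *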
*/ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[96]; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strcat(buf, "|VV_ROOT"); if (vp->v_vflag & VV_TEXT) strcat(buf, "|VV_TEXT"); if (vp->v_vflag & VV_SYSTEM) strcat(buf, "|VV_SYSTEM"); if (vp->v_vflag & VV_DELETED) strcat(buf, "|VV_DELETED"); if (vp->v_iflag & VI_DOOMED) strcat(buf, "|VI_DOOMED"); if (vp->v_iflag & VI_FREE) strcat(buf, "|VI_FREE"); printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count); printf(" "); lockmgr_printinfo(vp->v_vnlock); printf("\n"); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp, *nmp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ printf("Locked vnodes\n"); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp, NULL)) vprint("", vp); } nmp = TAILQ_NEXT(mp, mnt_list); } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static void vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) { strcpy(xvfsp->vfc_name, vfsp->vfc_name); xvfsp->vfc_typenum = vfsp->vfc_typenum; xvfsp->vfc_refcount = vfsp->vfc_refcount; xvfsp->vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp->vfc_vfsops = NULL; xvfsp->vfc_next = NULL; } /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; struct xvfsconf xvfsp; int error; error = 0; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); if (error) break; } return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; struct xvfsconf xvfsp; printf("WARNING: userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct thread *td = req->td; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp, td); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp; struct thread *td; int error; KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. 
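 *
 * The traversal below walks the mount list from the tail, so the last
 * filesystem mounted is the first one unmounted.  A toy model with the
 * <sys/queue.h> TAILQ macros (note that the real loop relies on dounmount()
 * to unlink the entry on success and only removes it by hand on failure):
 *
#include <sys/queue.h>
#include <stdio.h>

struct toymount {
        const char *path;
        TAILQ_ENTRY(toymount) link;
};
TAILQ_HEAD(toylist, toymount);

int
main(void)
{
        struct toylist list = TAILQ_HEAD_INITIALIZER(list);
        struct toymount root = { .path = "/" };
        struct toymount usr = { .path = "/usr" };
        struct toymount var = { .path = "/var" };
        struct toymount *mp;

        TAILQ_INSERT_TAIL(&list, &root, link);          // mounted first
        TAILQ_INSERT_TAIL(&list, &usr, link);
        TAILQ_INSERT_TAIL(&list, &var, link);           // mounted last

        while (!TAILQ_EMPTY(&list)) {
                mp = TAILQ_LAST(&list, toylist);        // reverse order of mounting
                printf("unmount %s\n", mp->path);
                TAILQ_REMOVE(&list, mp, link);
        }
        return (0);
}
 *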
*/ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OBJDIRTY) && (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { MNT_IUNLOCK(mp); if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); MNT_ILOCK(mp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_UNLOCK(obj); } vput(vp); } MNT_ILOCK(mp); } else VI_UNLOCK(vp); } MNT_IUNLOCK(mp); } /* * Mark a vnode as free, putting it up for recycling. */ static void vfree(struct vnode *vp) { CTR1(KTR_VFS, "vfree vp %p", vp); ASSERT_VI_LOCKED(vp, "vfree"); mtx_lock(&vnode_free_list_mtx); VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp, ("vfree: Freeing doomed vnode")); if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); } /* * Opposite of vfree() - mark a vnode as in use. */ static void vbusy(struct vnode *vp) { CTR1(KTR_VFS, "vbusy vp %p", vp); ASSERT_VI_LOCKED(vp, "vbusy"); VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed.")); mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); mtx_unlock(&vnode_free_list_mtx); } /* * Initalize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); if (vp->v_pollinfo != NULL) { uma_zfree(vnodepoll_zone, vi); return; } vp->v_pollinfo = vi; mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knllocked); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) 
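 *
 * Reduced to its core, the routine below is "report anything already
 * pending, otherwise remember the interest".  A userland sketch of that
 * pattern, using a hypothetical pollinfo structure guarded by a pthread
 * mutex instead of the kernel's vpi_lock:
 *
#include <pthread.h>

struct demo_pollinfo {
        pthread_mutex_t lock;
        int             events;         // interest recorded for a later wakeup
        int             revents;        // events that have already fired
};

static int
demo_pollrecord(struct demo_pollinfo *pi, int events)
{
        int ready;

        pthread_mutex_lock(&pi->lock);
        ready = pi->revents & events;
        if (ready != 0)
                pi->revents &= ~ready;  // consume only what the caller asked for
        else
                pi->events |= events;   // nothing pending: record the interest
        pthread_mutex_unlock(&pi->lock);
        return (ready);                 // non-zero means "already ready"
}
 *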
*/ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { if (vp->v_pollinfo == NULL) v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return events; } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return 0; } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ - ._vop_lock = vop_stdlock, /* lock */ + .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque failed"); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } VI_LOCK(vp); vn_syncer_add_to_worklist(&vp->v_bufobj, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; mtx_unlock(&sync_mtx); VI_UNLOCK(vp); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct thread *td = ap->a_td; int error; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. 
*/ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp, td); return (0); } MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY, td); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); vfs_unbusy(mp, td); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; VI_LOCK(vp); bo = &vp->v_bufobj; vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } VI_UNLOCK(vp); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; error = 0; dev_lock(); if (vp->v_type != VCHR) error = ENOTBLK; else if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. * * The ifdef'd CAPABILITIES version is here for reference, but is not * actually used. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, mode_t acc_mode, struct ucred *cred, int *privused) { mode_t dac_granted; mode_t priv_granted; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. 
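 *
 * Before the privilege fallback, note the shape of the discretionary check
 * above: exactly one class (owner, then group, then other) is consulted, and
 * the request succeeds only if every requested bit is granted by that class.
 * A self-contained userland sketch of just that part, with hypothetical
 * DEMO_VREAD/DEMO_VWRITE/DEMO_VEXEC bits (the real routine also handles
 * VADMIN, VAPPEND, supplementary groups and the privilege escape below):
 *
#include <sys/types.h>
#include <sys/stat.h>
#include <stdbool.h>

enum { DEMO_VREAD = 1, DEMO_VWRITE = 2, DEMO_VEXEC = 4 };

static bool
demo_dac_ok(mode_t file_mode, uid_t file_uid, gid_t file_gid,
    uid_t uid, gid_t gid, int acc_mode)
{
        int granted = 0;

        if (uid == file_uid) {                  // owner class only
                if (file_mode & S_IRUSR) granted |= DEMO_VREAD;
                if (file_mode & S_IWUSR) granted |= DEMO_VWRITE;
                if (file_mode & S_IXUSR) granted |= DEMO_VEXEC;
        } else if (gid == file_gid) {           // first matching class wins
                if (file_mode & S_IRGRP) granted |= DEMO_VREAD;
                if (file_mode & S_IWGRP) granted |= DEMO_VWRITE;
                if (file_mode & S_IXGRP) granted |= DEMO_VEXEC;
        } else {                                // everyone else
                if (file_mode & S_IROTH) granted |= DEMO_VREAD;
                if (file_mode & S_IWOTH) granted |= DEMO_VWRITE;
                if (file_mode & S_IXOTH) granted |= DEMO_VEXEC;
        }
        return ((acc_mode & granted) == acc_mode);
}
 *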
*/ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, SUSER_ALLOWJAIL)) priv_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_EXEC, SUSER_ALLOWJAIL)) priv_granted |= VEXEC; } if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, SUSER_ALLOWJAIL)) priv_granted |= VREAD; if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, SUSER_ALLOWJAIL)) priv_granted |= (VWRITE | VAPPEND); if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, SUSER_ALLOWJAIL)) priv_granted |= VADMIN; if ((acc_mode & (priv_granted | dac_granted)) == acc_mode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, int access) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, access, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter("lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp != a->a_fvp) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (BUF_REFCNT(bp) < 1) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter("lock violation"); } #endif } void vop_lookup_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); #endif } void vop_lookup_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; struct vnode *vp; a = ap; dvp = a->a_dvp; vp = *(a->a_vpp); ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); if (!rc) ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)"); #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS - struct _vop_lock_args *a = ap; + struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS - struct _vop_lock_args *a = ap; + struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init(&fs_knlist, NULL, NULL, NULL, NULL); } /* XXX - correct order? 
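 *
 * The vop_*_post() hooks above are what deliver NOTE_WRITE, NOTE_DELETE,
 * NOTE_RENAME and friends to userland kqueue watchers.  For reference, a
 * small consumer of those notes (ordinary EVFILT_VNODE usage, not part of
 * this change):
 *
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
        struct kevent change, event;
        int kq, fd;

        if (argc < 2)
                return (1);
        fd = open(argv[1], O_RDONLY);
        kq = kqueue();
        if (fd == -1 || kq == -1)
                return (1);
        EV_SET(&change, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
            NOTE_WRITE | NOTE_DELETE | NOTE_RENAME | NOTE_ATTRIB, 0, NULL);
        if (kevent(kq, &change, 1, &event, 1, NULL) > 0)   // blocks for one event
                printf("vnode event, fflags 0x%x\n", event.fflags);
        return (0);
}
 *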
*/ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { 1, NULL, filt_vfsdetach, filt_vfsread }; static struct filterops vfswrite_filtops = { 1, NULL, filt_vfsdetach, filt_vfswrite }; static struct filterops vfsvnode_filtops = { 1, NULL, filt_vfsdetach, filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0, curthread); } static int vfs_knllocked(void *arg) { struct vnode *vp = arg; return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE); } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; if (vp->v_pollinfo == NULL) v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. 
*/ if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread)) return (0); kn->kn_data = va.va_size - kn->kn_fp->f_offset; return (kn->kn_data != 0); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_fflags != 0); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VA_MARK_ATIME. This functionality is used by execve * and mmap, so we want to avoid the synchronous I/O implied by * directly setting va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct thread *td) { struct vattr atimeattr; if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) { VATTR_NULL(&atimeattr); atimeattr.va_vaflags |= VA_MARK_ATIME; (void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td); } } Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c (revision 169670) +++ head/sys/kern/vfs_vnops.c (revision 169671) @@ -1,1251 +1,1251 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_stat_t vn_statfile; static fo_close_t vn_closefile; struct fileops vnops = { .fo_read = vn_read, .fo_write = vn_write, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; int vn_open(ndp, flagp, cmode, fdidx) struct nameidata *ndp; int *flagp, cmode, fdidx; { struct thread *td = ndp->ni_cnd.cn_thread; return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx)); } /* * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. */ int vn_open_cred(ndp, flagp, cmode, cred, fdidx) struct nameidata *ndp; int *flagp, cmode; struct ucred *cred; int fdidx; { struct vnode *vp; struct mount *mp; struct thread *td = ndp->ni_cnd.cn_thread; struct vattr vat; struct vattr *vap = &vat; int mode, fmode, error; int vfslocked, mpsafe; mpsafe = ndp->ni_cnd.cn_flags & MPSAFE; restart: vfslocked = 0; fmode = *flagp; if (fmode & O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; bwillwrite(); if ((error = namei(ndp)) != 0) return (error); vfslocked = NDHASGIANT(ndp); if (!mpsafe) ndp->ni_cnd.cn_flags &= ~MPSAFE; if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_check_vnode_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) { #endif VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); #ifdef MAC } #endif vput(ndp->ni_dvp); vn_finished_write(mp); if (error) { VFS_UNLOCK_GIANT(vfslocked); NDFREE(ndp, NDF_ONLY_PNBUF); return (error); } fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = ISOPEN | ((fmode & O_NOFOLLOW) ? 
NOFOLLOW : FOLLOW) | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1; if ((error = namei(ndp)) != 0) return (error); if (!mpsafe) ndp->ni_cnd.cn_flags &= ~MPSAFE; vfslocked = NDHASGIANT(ndp); vp = ndp->ni_vp; } if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (fmode & O_APPEND) mode |= VAPPEND; #ifdef MAC error = mac_check_vnode_open(cred, vp, mode); if (error) goto bad; #endif if ((fmode & O_CREAT) == 0) { if (mode & VWRITE) { error = vn_writechk(vp); if (error) goto bad; } if (mode) { error = VOP_ACCESS(vp, mode, cred, td); if (error) goto bad; } } if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0) goto bad; if (fmode & FWRITE) vp->v_writecount++; *flagp = fmode; ASSERT_VOP_LOCKED(vp, "vn_open_cred"); if (!mpsafe) VFS_UNLOCK_GIANT(vfslocked); return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); *flagp = fmode; ndp->ni_vp = NULL; return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ int vn_writechk(vp) register struct vnode *vp; { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (vp->v_vflag & VV_TEXT) return (ETXTBSY); return (0); } /* * Vnode close call */ int vn_close(vp, flags, file_cred, td) register struct vnode *vp; int flags; struct ucred *file_cred; struct thread *td; { struct mount *mp; int error; VFS_ASSERT_GIANT(vp->v_mount); vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (flags & FWRITE) { VNASSERT(vp->v_writecount > 0, vp, ("vn_close: negative writecount")); vp->v_writecount--; } error = VOP_CLOSE(vp, flags, file_cred, td); vput(vp); vn_finished_write(mp); return (error); } /* * Sequential heuristic - detect sequential operation */ static __inline int sequential_heuristic(struct uio *uio, struct file *fp) { if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || uio->uio_offset == fp->f_nextoff) { /* * XXX we assume that the filesystem block size is * the default. Not true, but still gives us a pretty * good indicator of how sequential the read operations * are. */ fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; if (fp->f_seqcount > IO_SEQMAX) fp->f_seqcount = IO_SEQMAX; return(fp->f_seqcount << IO_SEQSHIFT); } /* * Not sequential, quick draw-down of seqcount */ if (fp->f_seqcount > 1) fp->f_seqcount = 1; else fp->f_seqcount = 0; return(0); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, aresid, td) enum uio_rw rw; struct vnode *vp; void *base; int len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *active_cred; struct ucred *file_cred; int *aresid; struct thread *td; { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; int error; VFS_ASSERT_GIANT(vp->v_mount); if ((ioflg & IO_NODELOCKED) == 0) { mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } else { /* * XXX This should be LK_SHARED but I don't trust VFS * enough to leave it like that until it has been * reviewed further. 
*/ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_check_vnode_read(active_cred, file_cred, vp); else error = mac_check_vnode_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred) cred = file_cred; else cred = active_cred; if (rw == UIO_READ) error = VOP_READ(vp, &auio, ioflg, cred); else error = VOP_WRITE(vp, &auio, ioflg, cred); } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { if (rw == UIO_WRITE && vp->v_type != VCHR) vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, aresid, td) enum uio_rw rw; struct vnode *vp; void *base; size_t len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *active_cred; struct ucred *file_cred; size_t *aresid; struct thread *td; { int error = 0; int iaresid; VFS_ASSERT_GIANT(vp->v_mount); do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. */ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; uio_yield(); } while (len); if (aresid) *aresid = len + iaresid; return (error); } /* * File table vnode read routine. */ static int vn_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct vnode *vp; int error, ioflag; int vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; vfslocked = VFS_LOCK_GIANT(vp->v_mount); VOP_LEASE(vp, td, fp->f_cred, LEASE_READ); /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. 
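 *
 * The FOFFSET_LOCKED/FOFFSET_LOCK_WAITING pair implements a tiny sleepable
 * lock dedicated to f_offset: the holder sets LOCKED, contenders set WAITING
 * and sleep, and the unlocker issues a wakeup only when WAITING is set.  A
 * userland sketch of the same handoff with pthreads (names are invented):
 *
#include <pthread.h>

#define DEMO_LOCKED     0x1
#define DEMO_WAITING    0x2

struct demo_foffset {
        pthread_mutex_t mtx;
        pthread_cond_t  cv;
        int             flags;
        long long       offset;
};

static void
demo_foffset_lock(struct demo_foffset *f)
{
        pthread_mutex_lock(&f->mtx);
        while (f->flags & DEMO_LOCKED) {
                f->flags |= DEMO_WAITING;       // ask the holder to wake us
                pthread_cond_wait(&f->cv, &f->mtx);
        }
        f->flags |= DEMO_LOCKED;
        pthread_mutex_unlock(&f->mtx);
}

static void
demo_foffset_unlock(struct demo_foffset *f)
{
        pthread_mutex_lock(&f->mtx);
        if (f->flags & DEMO_WAITING)
                pthread_cond_broadcast(&f->cv);
        f->flags = 0;                   // clears LOCKED and WAITING, as vn_read() does
        pthread_mutex_unlock(&f->mtx);
}
 *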
*/ if ((flags & FOF_OFFSET) == 0) { FILE_LOCK(fp); while(fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; msleep(&fp->f_vnread_flags,fp->f_mtxp,PUSER -1,"vnread offlock",0); } fp->f_vnread_flags |= FOFFSET_LOCKED; FILE_UNLOCK(fp); vn_lock(vp, LK_SHARED | LK_RETRY, td); uio->uio_offset = fp->f_offset; } else vn_lock(vp, LK_SHARED | LK_RETRY, td); ioflag |= sequential_heuristic(uio, fp); #ifdef MAC error = mac_check_vnode_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); if ((flags & FOF_OFFSET) == 0) { fp->f_offset = uio->uio_offset; FILE_LOCK(fp); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; FILE_UNLOCK(fp); } fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table vnode write routine. */ static int vn_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct vnode *vp; struct mount *mp; int error, ioflag; int vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; mp = NULL; if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto unlock; VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; ioflag |= sequential_heuristic(uio, fp); #ifdef MAC error = mac_check_vnode_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, td); if (vp->v_type != VCHR) vn_finished_write(mp); unlock: VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table vnode stat routine. 
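 *
 * vn_statfile() is not called directly; like vn_read() and vn_write()
 * above it is reached through the vnode file ops table, roughly as
 * follows (the initializer shown is an assumption about that table's
 * layout, not a copy of it):
 *
 *	struct fileops vnops = {
 *		.fo_read = vn_read,
 *		.fo_write = vn_write,
 *		.fo_stat = vn_statfile,
 *		...
 *	};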
*/ static int vn_statfile(fp, sb, active_cred, td) struct file *fp; struct stat *sb; struct ucred *active_cred; struct thread *td; { struct vnode *vp = fp->f_vnode; int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = vn_stat(vp, sb, active_cred, fp->f_cred, td); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Stat a vnode; implementation for the stat syscall */ int vn_stat(vp, sb, active_cred, file_cred, td) struct vnode *vp; register struct stat *sb; struct ucred *active_cred; struct ucred *file_cred; struct thread *td; { struct vattr vattr; register struct vattr *vap; int error; u_short mode; #ifdef MAC error = mac_check_vnode_stat(active_cred, file_cred, vp); if (error) return (error); #endif vap = &vattr; error = VOP_GETATTR(vp, vap, active_cred, td); if (error) return (error); /* * Zero the spare stat fields */ bzero(sb, sizeof *sb); /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; /* This is a cosmetic change, symlinks do not have a mode. */ if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) sb->st_mode &= ~ACCESSPERMS; /* 0000 */ else sb->st_mode |= ACCESSPERMS; /* 0777 */ break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); }; sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) return (EOVERFLOW); sb->st_size = vap->va_size; sb->st_atimespec = vap->va_atime; sb->st_mtimespec = vap->va_mtime; sb->st_ctimespec = vap->va_ctime; sb->st_birthtimespec = vap->va_birthtime; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. In some filesystem types, this may vary from file * to file" * Default to PAGE_SIZE after much discussion. * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct. */ sb->st_blksize = PAGE_SIZE; sb->st_flags = vap->va_flags; if (priv_check(td, PRIV_VFS_GENERATION)) sb->st_gen = 0; else sb->st_gen = vap->va_gen; sb->st_blocks = vap->va_bytes / S_BLKSIZE; return (0); } /* * File table vnode ioctl routine. */ static int vn_ioctl(fp, com, data, active_cred, td) struct file *fp; u_long com; void *data; struct ucred *active_cred; struct thread *td; { struct vnode *vp = fp->f_vnode; struct vattr vattr; int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = ENOTTY; switch (vp->v_type) { case VREG: case VDIR: if (com == FIONREAD) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETATTR(vp, &vattr, active_cred, td); VOP_UNLOCK(vp, 0, td); if (!error) *(int *)data = vattr.va_size - fp->f_offset; } if (com == FIONBIO || com == FIOASYNC) /* XXX */ error = 0; else error = VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td); break; default: break; } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table vnode poll routine. 
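 *
 * Note that the locking protocol for VOP_POLL() is "U U U" (see the poll
 * entry in vnode_if.src below), so the vnode is locked here only
 * transiently, and only for the MAC check.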
*/ static int vn_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct vnode *vp; int vfslocked; int error; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); #ifdef MAC vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = mac_check_vnode_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp, 0, td); if (!error) #endif error = VOP_POLL(vp, events, fp->f_cred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Check that the vnode is still valid, and if so * acquire requested lock. */ int _vn_lock(struct vnode *vp, int flags, struct thread *td, char *file, int line) { int error; do { if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); if ((flags & LK_NOWAIT || (flags & LK_TYPE_MASK) == 0) && vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); return (ENOENT); } /* * Just polling to check validity. */ if ((flags & LK_TYPE_MASK) == 0) { VI_UNLOCK(vp); return (0); } /* * lockmgr drops interlock before it will return for * any reason. So force the code above to relock it. */ - error = _VOP_LOCK(vp, flags | LK_INTERLOCK, td, file, line); + error = VOP_LOCK1(vp, flags | LK_INTERLOCK, td, file, line); flags &= ~LK_INTERLOCK; KASSERT((flags & LK_RETRY) == 0 || error == 0, ("LK_RETRY set with incompatible flags %d\n", flags)); /* * Callers specify LK_RETRY if they wish to get dead vnodes. * If RETRY is not set, we return ENOENT instead. */ if (error == 0 && vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) { VOP_UNLOCK(vp, 0, td); error = ENOENT; break; } } while (flags & LK_RETRY && error != 0); return (error); } /* * File table vnode close routine. */ static int vn_closefile(fp, td) struct file *fp; struct thread *td; { struct vnode *vp; struct flock lf; int vfslocked; int error; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); } fp->f_ops = &badfileops; error = vn_close(vp, fp->f_flag, fp->f_cred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ int vn_start_write(vp, mpp, flags) struct vnode *vp; struct mount **mpp; int flags; { struct mount *mp; int error; error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); /* * Check on status of suspension. */ while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if (flags & V_NOWAIT) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | (flags & PCATCH), "suspfs", 0); if (error) goto unlock; } if (flags & V_XSLEEP) goto unlock; mp->mnt_writeopcount++; unlock: MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. 
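 *
 * For reference, the primary-suspension bracket that ordinary writers
 * use looks like this (it is the pattern vn_rdwr() and vn_write() above
 * follow):
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 *	error = VOP_WRITE(vp, &auio, ioflag, cred);
 *	VOP_UNLOCK(vp, 0, td);
 *	vn_finished_write(mp);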
*/ int vn_write_suspend_wait(vp, mp, flags) struct vnode *vp; struct mount *mp; int flags; { int error; if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) { if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if (mp == NULL) return (0); MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) { MNT_REL(mp); MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0); vfs_rel(mp); return (error); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(vp, mpp, flags) struct vnode *vp; struct mount **mpp; int flags; { struct mount *mp; int error; retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_REL(mp); MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. */ void vn_finished_write(mp) struct mount *mp; { if (mp == NULL) return; MNT_ILOCK(mp); mp->mnt_writeopcount--; if (mp->mnt_writeopcount < 0) panic("vn_finished_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_writeopcount <= 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(mp) struct mount *mp; { if (mp == NULL) return; MNT_ILOCK(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(mp) struct mount *mp; { struct thread *td = curthread; int error; MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_SUSPEND) { MNT_IUNLOCK(mp); return (0); } mp->mnt_kern_flag |= MNTK_SUSPEND; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0) vfs_write_resume(mp); return (error); } /* * Request a filesystem to resume write operations. 
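 *
 * vfs_write_suspend() and vfs_write_resume() are meant to be used as a
 * pair by code that needs a quiescent filesystem (snapshot creation is
 * the expected consumer, although that lives outside this file),
 * roughly:
 *
 *	if ((error = vfs_write_suspend(mp)) != 0)
 *		return (error);
 *	... operate on the write-suspended filesystem ...
 *	vfs_write_resume(mp);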
*/ void vfs_write_resume(mp) struct mount *mp; { MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); } MNT_IUNLOCK(mp); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = VOP_KQFILTER(fp->f_vnode, kn); VFS_UNLOCK_GIANT(vfslocked); return error; } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp, 0, td); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } Index: head/sys/kern/vnode_if.src =================================================================== --- head/sys/kern/vnode_if.src (revision 169670) +++ head/sys/kern/vnode_if.src (revision 169671) @@ -1,589 +1,589 @@ #- # Copyright (c) 1992, 1993 # The Regents of the University of California. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. 
Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 4. Neither the name of the University nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 # $FreeBSD$ # # # Above each of the vop descriptors in lines starting with %% # is a specification of the locking protocol used by each vop call. # The first column is the name of the variable, the remaining three # columns are in, out and error respectively. The "in" column defines # the lock state on input, the "out" column defines the state on succesful # return, and the "error" column defines the locking state on error exit. # # The locking value can take the following values: # L: locked; not converted to type of lock. # A: any lock type. # S: locked with shared lock. # E: locked with exclusive lock for this process. # O: locked with exclusive lock for other process. # U: unlocked. # -: not applicable. vnode does not yet (or no longer) exists. # =: the same on input and output, may be either L or U. # X: locked if not nil. # # The paramater named "vpp" is assumed to be always used with double # indirection (**vpp) and that name is hard-codeed in vnode_if.awk ! # # Lines starting with %! specify a pre or post-condition function # to call before/after the vop call. # # If other such parameters are introduced, they have to be added to # the AWK script at the head of the definition of "add_debug_code()". # vop_islocked { IN struct vnode *vp; IN struct thread *td; }; %% lookup dvp L ? ? %% lookup vpp - L - %! lookup pre vop_lookup_pre %! lookup post vop_lookup_post # XXX - the lookup locking protocol defies simple description and depends # on the flags and operation fields in the (cnp) structure. Note # especially that *vpp may equal dvp and both may be locked. vop_lookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; %% cachedlookup dvp L ? ? %% cachedlookup vpp - L - # This must be an exact copy of lookup. See kern/vfs_cache.c for details. vop_cachedlookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; %% create dvp E E E %% create vpp - L - %! 
create post vop_create_post vop_create { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% whiteout dvp E E E vop_whiteout { IN struct vnode *dvp; IN struct componentname *cnp; IN int flags; }; %% mknod dvp E E E %% mknod vpp - L - %! mknod post vop_mknod_post vop_mknod { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% open vp L L L vop_open { IN struct vnode *vp; IN int mode; IN struct ucred *cred; IN struct thread *td; IN int fdidx; }; %% close vp E E E vop_close { IN struct vnode *vp; IN int fflag; IN struct ucred *cred; IN struct thread *td; }; %% access vp L L L vop_access { IN struct vnode *vp; IN int mode; IN struct ucred *cred; IN struct thread *td; }; %% getattr vp L L L vop_getattr { IN struct vnode *vp; OUT struct vattr *vap; IN struct ucred *cred; IN struct thread *td; }; %% setattr vp E E E %! setattr post vop_setattr_post vop_setattr { IN struct vnode *vp; IN struct vattr *vap; IN struct ucred *cred; IN struct thread *td; }; %% read vp L L L vop_read { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% write vp E E E %! write pre VOP_WRITE_PRE %! write post VOP_WRITE_POST vop_write { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% lease vp = = = vop_lease { IN struct vnode *vp; IN struct thread *td; IN struct ucred *cred; IN int flag; }; %% ioctl vp U U U vop_ioctl { IN struct vnode *vp; IN u_long command; IN void *data; IN int fflag; IN struct ucred *cred; IN struct thread *td; }; %% poll vp U U U vop_poll { IN struct vnode *vp; IN int events; IN struct ucred *cred; IN struct thread *td; }; %% kqfilter vp U U U vop_kqfilter { IN struct vnode *vp; IN struct knote *kn; }; %% revoke vp L L L vop_revoke { IN struct vnode *vp; IN int flags; }; %% fsync vp E E E vop_fsync { IN struct vnode *vp; IN int waitfor; IN struct thread *td; }; %% remove dvp E E E %% remove vp E E E %! remove post vop_remove_post vop_remove { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; %% link tdvp E E E %% link vp E E E %! link post vop_link_post vop_link { IN struct vnode *tdvp; IN struct vnode *vp; IN struct componentname *cnp; }; %! rename pre vop_rename_pre %! rename post vop_rename_post vop_rename { IN WILLRELE struct vnode *fdvp; IN WILLRELE struct vnode *fvp; IN struct componentname *fcnp; IN WILLRELE struct vnode *tdvp; IN WILLRELE struct vnode *tvp; IN struct componentname *tcnp; }; %% mkdir dvp E E E %% mkdir vpp - E - %! mkdir post vop_mkdir_post vop_mkdir { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% rmdir dvp E E E %% rmdir vp E E E %! rmdir post vop_rmdir_post vop_rmdir { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; %% symlink dvp E E E %% symlink vpp - E - %! symlink post vop_symlink_post vop_symlink { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; IN char *target; }; %% readdir vp L L L vop_readdir { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; INOUT int *eofflag; OUT int *ncookies; INOUT u_long **cookies; }; %% readlink vp L L L vop_readlink { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; }; %% inactive vp E E E vop_inactive { IN struct vnode *vp; IN struct thread *td; }; %% reclaim vp E E E vop_reclaim { IN struct vnode *vp; IN struct thread *td; }; -%! lock pre vop_lock_pre -%! 
lock post vop_lock_post +%! lock1 pre vop_lock_pre +%! lock1 post vop_lock_post -_vop_lock { +vop_lock1 { IN struct vnode *vp; IN int flags; IN struct thread *td; IN char *file; IN int line; }; %! unlock pre vop_unlock_pre %! unlock post vop_unlock_post vop_unlock { IN struct vnode *vp; IN int flags; IN struct thread *td; }; %% bmap vp L L L vop_bmap { IN struct vnode *vp; IN daddr_t bn; OUT struct bufobj **bop; IN daddr_t *bnp; OUT int *runp; OUT int *runb; }; %% strategy vp L L L %! strategy pre vop_strategy_pre vop_strategy { IN struct vnode *vp; IN struct buf *bp; }; %% getwritemount vp = = = vop_getwritemount { IN struct vnode *vp; OUT struct mount **mpp; }; %% print vp = = = vop_print { IN struct vnode *vp; }; %% pathconf vp L L L vop_pathconf { IN struct vnode *vp; IN int name; OUT register_t *retval; }; %% advlock vp U U U vop_advlock { IN struct vnode *vp; IN void *id; IN int op; IN struct flock *fl; IN int flags; }; %% reallocblks vp E E E vop_reallocblks { IN struct vnode *vp; IN struct cluster_save *buflist; }; %% getpages vp L L L vop_getpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int reqpage; IN vm_ooffset_t offset; }; %% putpages vp E E E vop_putpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int sync; IN int *rtvals; IN vm_ooffset_t offset; }; %% getacl vp L L L vop_getacl { IN struct vnode *vp; IN acl_type_t type; OUT struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% setacl vp E E E vop_setacl { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% aclcheck vp = = = vop_aclcheck { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% closeextattr vp L L L vop_closeextattr { IN struct vnode *vp; IN int commit; IN struct ucred *cred; IN struct thread *td; }; %% getextattr vp L L L vop_getextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; OUT size_t *size; IN struct ucred *cred; IN struct thread *td; }; %% listextattr vp L L L vop_listextattr { IN struct vnode *vp; IN int attrnamespace; INOUT struct uio *uio; OUT size_t *size; IN struct ucred *cred; IN struct thread *td; }; %% openextattr vp L L L vop_openextattr { IN struct vnode *vp; IN struct ucred *cred; IN struct thread *td; }; %% deleteextattr vp E E E vop_deleteextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; IN struct ucred *cred; IN struct thread *td; }; %% setextattr vp E E E vop_setextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; IN struct ucred *cred; IN struct thread *td; }; %% setlabel vp E E E vop_setlabel { IN struct vnode *vp; IN struct label *label; IN struct ucred *cred; IN struct thread *td; }; %% setlabel vp = = = vop_vptofh { IN struct vnode *vp; IN struct fid *fhp; }; Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 169670) +++ head/sys/sys/vnode.h (revision 169671) @@ -1,745 +1,745 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ /* * XXX - compatability until lockmgr() goes away or all the #includes are * updated. */ #include #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * f - freelist mutex * G - Giant * i - interlock * m - mntvnodes mutex * p - pollinfo lock * s - spechash mutex * S - syncer mutex * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). * * XXX Not all fields are locked yet and some fields that are marked are not * locked consistently. This is a work in progress. Requires Giant! */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) 
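 *
 * A filesystem normally recovers its private per-node state through
 * v_data, e.g. (illustrative, UFS-style):
 *
 *	struct inode *ip = (struct inode *)vp->v_data;
 *
 * while v_op points at the vop_vector handed to getnewvnode().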
*/ enum vtype v_type; /* u vnode type */ const char *v_tag; /* u type of underlying data */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. * See #defines below for renaming to v_* namespace. */ union { struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ struct socket *vu_socket; /* v unix domain net (VSOCK) */ struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ } v_un; /* * vfs_hash: (mount + inode) -> vnode hash. */ LIST_ENTRY(vnode) v_hashlist; u_int v_hash; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct vnode *v_dd; /* c .. vnode */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ int v_holdcnt; /* i prevents recycling. */ int v_usecount; /* i ref count of users */ u_long v_iflag; /* i vnode flags (see below) */ u_long v_vflag; /* v vnode flags */ int v_writecount; /* v ref count of writers */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_freelist; /* f vnode freelist */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define v_mountedhere v_un.vu_mount #define v_socket v_un.vu_socket #define v_rdev v_un.vu_cdev #define v_fifoinfo v_un.vu_fifoinfo /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* socket, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), (a)); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, 1) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. 
* VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag */ #define VI_MOUNT 0x0020 /* Mount in progress */ #define VI_AGE 0x0040 /* Insert vnode at head of free list */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ #define VI_OBJDIRTY 0x0400 /* object might be dirty */ #define VI_DOINGINACT 0x0800 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x1000 /* Need to call inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_TEXT 0x0020 /* vnode is a pure text prototype */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ dev_t va_fsid; /* filesystem id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_MARK_ATIME 0x04 /* setting atime for execve/mmap */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Modes. Some values same as Ixxx entries from inode.h for now. 
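 *
 * These bits are or'ed together into the mode argument of VOP_ACCESS();
 * vn_open_cred() above, for instance, builds VREAD | VWRITE | VAPPEND
 * from the open flags before calling
 *
 *	error = VOP_ACCESS(vp, mode, cred, td);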
*/ #define VEXEC 000100 /* execute/search permission */ #define VWRITE 000200 /* write permission */ #define VREAD 000400 /* read permission */ #define VSVTX 001000 /* save swapped text even after use */ #define VSGID 002000 /* set group id on execution */ #define VSUID 004000 /* set user id on execution */ #define VADMIN 010000 /* permission to administer */ #define VSTAT 020000 /* permission to retrieve attrs */ #define VAPPEND 040000 /* permission to write/append */ #define VALLPERM (VEXEC | VWRITE | VREAD | VADMIN | VSTAT | VAPPEND) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern int async_io_version; /* 0 or POSIX version of AIO i'face */ extern int desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ extern void (*lease_updatetime)(int deltat); #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. 
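 *
 * A stacking filesystem's bypass routine might use it along these lines
 * (a sketch only; the routine name is hypothetical and the descriptor
 * fields are defined just below):
 *
 *	static int
 *	sample_bypass(struct vop_generic_args *ap)
 *	{
 *		printf("bypassing %s\n", ap->a_desc->vdesc_name);
 *		return (vop_eopnotsupp(ap));
 *	}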
*/ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str); #endif void assert_vop_locked(struct vnode *vp, const char *str); #if 0 void assert_vop_slocked(struct vnode *vp, const char *str); #endif void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_locked_other((vp), (str)) #endif #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) assert_vop_slocked((vp), (str)) #endif #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) #define ASSERT_VI_UNLOCKED(vp, str) #define ASSERT_VOP_ELOCKED(vp, str) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) #endif #define ASSERT_VOP_LOCKED(vp, str) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) #endif #define ASSERT_VOP_UNLOCKED(vp, str) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ #include "vnode_if.h" /* * Public vnode manipulation functions.
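 *
 * A typical in-kernel consumer strings several of these together; a
 * rough sketch (NDINIT() comes from namei.h and its exact argument list
 * is assumed here):
 *
 *	struct nameidata nd;
 *	int error, flags = FREAD, resid;
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
 *	if ((error = vn_open(&nd, &flags, 0, -1)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, (off_t)0,
 *	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td);
 *	VOP_UNLOCK(nd.ni_vp, 0, td);
 *	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);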
*/ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vnode; extern int (*lease_check_hook)(struct vop_lease_args *); /* cache_* may belong in namei.h. */ void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); void cache_purge(struct vnode *vp); void cache_purgevfs(struct mount *mp); int cache_leaf_test(struct vnode *vp); int change_dir(struct vnode *vp, struct thread *td); int change_root(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void cvtnstat(struct stat *sb, struct nstat *nsb); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int lease_check(struct vop_lease_args *ap); int speedup_syncer(void); #define textvp_fullpath(p, rb, rfb) \ vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb) int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, mode_t acc_mode, struct ucred *cred, int *privused); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused); void vattr_null(struct vattr *vap); int vcount(struct vnode *vp); void vdrop(struct vnode *); void vdropl(struct vnode *); void vfs_add_vnodeops(const void *); void vfs_rm_vnodeops(const void *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int lockflag, struct thread *td); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdl(struct vnode *); int vinvalbuf(struct vnode *vp, int save, struct thread *td, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize); void vn_printf(struct vnode *vp, const char *fmt, ...) 
__printflike(2,3); #define vprint(label, vp) vn_printf((vp), "%s\n", (label)) int vrecycle(struct vnode *vp, struct thread *td); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_isdisk(struct vnode *vp, int *errp); int _vn_lock(struct vnode *vp, int flags, struct thread *td, char *file, int line); #define vn_lock(vp, flags, td) _vn_lock(vp, flags, td, __FILE__, __LINE__) int vn_open(struct nameidata *ndp, int *flagp, int cmode, int fdidx); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, struct ucred *cred, int fdidx); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, int *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_write_suspend_wait(struct vnode *vp, struct mount *mp, int flags); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp); int vfs_write_suspend(struct mount *mp); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); -int vop_stdlock(struct _vop_lock_args *); +int vop_stdlock(struct vop_lock1_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); /* These are called from within the actual VOPS. 
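 *
 * That is, the wrappers generated by vnode_if.awk from vnode_if.src call
 * them around the operation itself; schematically (an approximation of
 * the generated code, not a copy of it):
 *
 *	int
 *	VOP_LOCK1_APV(struct vop_vector *vop, struct vop_lock1_args *a)
 *	{
 *		int rc;
 *
 *		vop_lock_pre(a);
 *		rc = vop->vop_lock1(a);
 *		vop_lock_post(a, rc);
 *		return (rc);
 *	}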
*/ void vop_create_post(void *a, int rc); void vop_link_post(void *a, int rc); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_post(void *a, int rc); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_post(void *a, int rc); void vop_strategy_pre(void *a); void vop_symlink_post(void *a, int rc); void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error, osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred, \ curthread); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? NOTE_EXTEND : 0)); \ } -#define VOP_LOCK(vp, flags, td) _VOP_LOCK(vp, flags, td, __FILE__, __LINE__) +#define VOP_LOCK(vp, flags, td) VOP_LOCK1(vp, flags, td, __FILE__, __LINE__) void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); int vrefcnt(struct vnode *vp); void v_addpollinfo(struct vnode *vp); int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct thread *td); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/ffs/ffs_vnops.c =================================================================== --- head/sys/ufs/ffs/ffs_vnops.c (revision 169670) +++ head/sys/ufs/ffs/ffs_vnops.c (revision 169671) @@ -1,1736 +1,1736 @@ /*- * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... 
* @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_ffs.h" #ifdef DIRECTIO extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); #endif static vop_fsync_t ffs_fsync; -static _vop_lock_t ffs_lock; +static vop_lock1_t ffs_lock; static vop_getpages_t ffs_getpages; static vop_read_t ffs_read; static vop_write_t ffs_write; static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred); static vop_strategy_t ffsext_strategy; static vop_closeextattr_t ffs_closeextattr; static vop_deleteextattr_t ffs_deleteextattr; static vop_getextattr_t ffs_getextattr; static vop_listextattr_t ffs_listextattr; static vop_openextattr_t ffs_openextattr; static vop_setextattr_t ffs_setextattr; static vop_vptofh_t ffs_vptofh; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops1 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_getpages = ffs_getpages, - ._vop_lock = ffs_lock, + .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops1 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */ .vop_vptofh = ffs_vptofh, }; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops2 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_getpages = ffs_getpages, - ._vop_lock = ffs_lock, + .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops2 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, - ._vop_lock = ffs_lock, + .vop_lock1 = ffs_lock, .vop_reallocblks = ffs_reallocblks, .vop_strategy = ffsext_strategy, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; /* * Synch an open file. */ /* ARGSUSED */ static int ffs_fsync(struct vop_fsync_args *ap) { int error; error = ffs_syncvnode(ap->a_vp, ap->a_waitfor); if (error) return (error); if (ap->a_waitfor == MNT_WAIT && (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP)) error = softdep_fsync(ap->a_vp); return (error); } int ffs_syncvnode(struct vnode *vp, int waitfor) { struct inode *ip = VTOI(vp); struct buf *bp; struct buf *nbp; int s, error, wait, passes, skipmeta; ufs_lbn_t lbn; wait = (waitfor == MNT_WAIT); lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); /* * Flush all dirty buffers associated with a vnode. 
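 *
 * The loop below makes up to NIADDR + 1 passes over the dirty buffer
 * list: on a synchronous request the first pass skips metadata buffers
 * (skipmeta) and a second pass then writes them, while later passes
 * retry anything that was redirtied (e.g. by soft updates dependencies)
 * until the pass budget is exhausted.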
*/ passes = NIADDR + 1; skipmeta = 0; if (wait) skipmeta = 1; s = splbio(); VI_LOCK(vp); loop: TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { /* * Reasons to skip this buffer: it has already been considered * on this pass, this pass is the first time through on a * synchronous flush request and the buffer being considered * is metadata, the buffer has dependencies that will cause * it to be redirtied and it has not already been deferred, * or it is already being written. */ if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if ((skipmeta == 1 && bp->b_lblkno < 0)) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; VI_UNLOCK(vp); if (!wait && !LIST_EMPTY(&bp->b_dep) && (bp->b_flags & B_DEFERRED) == 0 && buf_countdeps(bp, 0)) { bp->b_flags |= B_DEFERRED; BUF_UNLOCK(bp); VI_LOCK(vp); continue; } if ((bp->b_flags & B_DELWRI) == 0) panic("ffs_fsync: not dirty"); /* * If this is a synchronous flush request, or it is not a * file or device, start the write on this buffer immediatly. */ if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) { /* * On our final pass through, do all I/O synchronously * so that we can find out if our flush is failing * because of write errors. */ if (passes > 0 || !wait) { if ((bp->b_flags & B_CLUSTEROK) && !wait) { (void) vfs_bio_awrite(bp); } else { bremfree(bp); splx(s); (void) bawrite(bp); s = splbio(); } } else { bremfree(bp); splx(s); if ((error = bwrite(bp)) != 0) return (error); s = splbio(); } } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) { /* * If the buffer is for data that has been truncated * off the file, then throw it away. */ bremfree(bp); bp->b_flags |= B_INVAL | B_NOCACHE; splx(s); brelse(bp); s = splbio(); } else vfs_bio_awrite(bp); /* * Since we may have slept during the I/O, we need * to start from a known point. */ VI_LOCK(vp); nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd); } /* * If we were asked to do this synchronously, then go back for * another pass, this time doing the metadata. */ if (skipmeta) { skipmeta = 0; goto loop; } if (wait) { bufobj_wwait(&vp->v_bufobj, 3, 0); VI_UNLOCK(vp); /* * Ensure that any filesystem metatdata associated * with the vnode has been written. */ splx(s); if ((error = softdep_sync_metadata(vp)) != 0) return (error); s = splbio(); VI_LOCK(vp); if (vp->v_bufobj.bo_dirty.bv_cnt > 0) { /* * Block devices associated with filesystems may * have new I/O requests posted for them even if * the vnode is locked, so no amount of trying will * get them clean. Thus we give block devices a * good effort, then just give up. For all other file * types, go around and try again until it is clean. */ if (passes > 0) { passes -= 1; goto loop; } #ifdef DIAGNOSTIC if (!vn_isdisk(vp, NULL)) vprint("ffs_fsync: dirty", vp); #endif } } VI_UNLOCK(vp); splx(s); return (ffs_update(vp, wait)); } static int ffs_lock(ap) - struct _vop_lock_args /* { + struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; char *file; int line; } */ *ap; { #ifndef NO_FFS_SNAPSHOT struct vnode *vp; int flags; struct lock *lkp; int result; switch (ap->a_flags & LK_TYPE_MASK) { case LK_SHARED: case LK_UPGRADE: case LK_EXCLUSIVE: vp = ap->a_vp; flags = ap->a_flags; for (;;) { /* * vnode interlock must be held to ensure that * the possibly external lock isn't freed, * e.g. when mutating from snapshot file vnode * to regular file vnode. 
*/ if ((flags & LK_INTERLOCK) == 0) { VI_LOCK(vp); flags |= LK_INTERLOCK; } lkp = vp->v_vnlock; result = _lockmgr(lkp, flags, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line); if (lkp == vp->v_vnlock || result != 0) break; /* * Apparent success, except that the vnode * mutated between snapshot file vnode and * regular file vnode while this process * slept. The lock currently held is not the * right lock. Release it, and try to get the * new lock. */ (void) _lockmgr(lkp, LK_RELEASE, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line); if ((flags & LK_TYPE_MASK) == LK_UPGRADE) flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; flags &= ~LK_INTERLOCK; } break; default: - result = _VOP_LOCK_APV(&ufs_vnodeops, ap); + result = VOP_LOCK1_APV(&ufs_vnodeops, ap); } return (result); #else - return (_VOP_LOCK_APV(&ufs_vnodeops, ap)); + return (VOP_LOCK1_APV(&ufs_vnodeops, ap)); #endif } /* * Vnode op for reading. */ /* ARGSUSED */ static int ffs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct inode *ip; struct uio *uio; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error, orig_resid; int seqcount; int ioflag; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extread(vp, uio, ioflag)); #else panic("ffs_read+IO_EXT"); #endif #ifdef DIRECTIO if ((ioflag & IO_DIRECT) != 0) { int workdone; error = ffs_rawread(vp, uio, &workdone); if (error != 0 || workdone != 0) return error; } #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("ffs_read: mode"); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("ffs_read: short symlink"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("ffs_read: type %d", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); fs = ip->i_fs; if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->fs_maxfilesize) return (EOVERFLOW); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) { /* * Don't do readahead if this is the end of the file. */ error = bread(vp, lbn, size, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, * grab as much as we can. * * XXX This may not be a win if we are not * doing sequential access. 
*/ error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then * if we appear to be acting sequentially, * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ int nextsize = blksize(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. */ error = bread(vp, lbn, size, NOCRED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * If IO_DIRECT then set B_DIRECT for the buffer. This * will cause us to attempt to release the buffer later on * and will cause the buffer cache to attempt to free the * underlying pages. */ if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { /* * If there are no dependencies, and it's VMIO, * then we don't need the buf, mark it available * for freeing. The VM has the data. */ bp->b_flags |= B_RELBUF; brelse(bp); } else { /* * Otherwise let whoever * made the request take care of * freeing it. We just queue * it onto another list. */ bqrelse(bp); } } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) { if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bqrelse(bp); } } if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) { VI_LOCK(vp); ip->i_flag |= IN_ACCESS; VI_UNLOCK(vp); } return (error); } /* * Vnode op for writing. 
*/ static int ffs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct uio *uio; struct inode *ip; struct fs *fs; struct buf *bp; struct thread *td; ufs_lbn_t lbn; off_t osize; int seqcount; int blkoffset, error, flags, ioflag, resid, size, xfersize; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); #else panic("ffs_write+IO_EXT"); #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("ffs_write: mode"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: panic("ffs_write: dir write"); break; default: panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, (int)uio->uio_offset, (int)uio->uio_resid ); } KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); fs = ip->i_fs; if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ td = uio->uio_td; if (vp->v_type == VREG && td != NULL) { PROC_LOCK(td->td_proc); if (uio->uio_offset + uio->uio_resid > lim_cur(td->td_proc, RLIMIT_FSIZE)) { psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } PROC_UNLOCK(td->td_proc); } resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; /* XXX is uio->uio_offset the right thing here? */ error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ap->a_cred, flags, &bp); if (error != 0) break; /* * If the buffer is not valid we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. */ if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) vfs_bio_clrbuf(bp); if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; DIP_SET(ip, i_size, ip->i_size); } size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; } /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. 
Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->fs_bsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, ip->i_size, seqcount); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, SUSER_ALLOWJAIL)) { ip->i_mode &= ~(ISUID | ISGID); DIP_SET(ip, i_mode, ip->i_mode); } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * get page routine */ static int ffs_getpages(ap) struct vop_getpages_args *ap; { int i; vm_page_t mreq; int pcount; pcount = round_page(ap->a_count) / PAGE_SIZE; mreq = ap->a_m[ap->a_reqpage]; /* * if ANY DEV_BSIZE blocks are valid on a large filesystem block, * then the entire page is valid. Since the page may be mapped, * user programs might reference data beyond the actual end of file * occuring within the page. We have to zero that data. */ VM_OBJECT_LOCK(mreq->object); if (mreq->valid) { if (mreq->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(mreq, TRUE); vm_page_lock_queues(); for (i = 0; i < pcount; i++) { if (i != ap->a_reqpage) { vm_page_free(ap->a_m[i]); } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(mreq->object); return VM_PAGER_OK; } VM_OBJECT_UNLOCK(mreq->object); return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } /* * Extended attribute area reading. */ static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error, orig_resid; ip = VTOI(vp); fs = ip->i_fs; dp = ip->i_din2; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extread: mode"); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = sblksize(fs, dp->di_extsize, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) 
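 *
 * Spelled out (a restatement of the arithmetic just below):
 *
 *	blkoffset = blkoff(fs, uio->uio_offset);	-- offset within block
 *	xfersize  = fs->fs_bsize - blkoffset;		-- rest of the block
 *	xfersize  = min(xfersize, uio->uio_resid);	-- caller wants less
 *	xfersize  = min(xfersize, bytesinfile);		-- EA area ends sooner
 *
 * With an 8K block size, for example, an offset of 8448 gives a
 * blkoffset of 256 and an initial xfersize of 7936 before clamping.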
*/ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= dp->di_extsize) { /* * Don't do readahead if this is the end of the info. */ error = bread(vp, -1 - lbn, size, NOCRED, &bp); } else { /* * If we have a second block, then * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ int nextsize = sblksize(fs, dp->di_extsize, nextlbn); nextlbn = -1 - nextlbn; error = breadn(vp, -1 - lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * If IO_DIRECT then set B_DIRECT for the buffer. This * will cause us to attempt to release the buffer later on * and will cause the buffer cache to attempt to free the * underlying pages. */ if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { /* * If there are no dependencies, and it's VMIO, * then we don't need the buf, mark it available * for freeing. The VM has the data. */ bp->b_flags |= B_RELBUF; brelse(bp); } else { /* * Otherwise let whoever * made the request take care of * freeing it. We just queue * it onto another list. */ bqrelse(bp); } } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) { if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bqrelse(bp); } } if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) { VI_LOCK(vp); ip->i_flag |= IN_ACCESS; VI_UNLOCK(vp); } return (error); } /* * Extended attribute area writing. 
*/ static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; int blkoffset, error, flags, resid, size, xfersize; ip = VTOI(vp); fs = ip->i_fs; dp = ip->i_din2; KASSERT(!(ip->i_flag & IN_SPACECOUNTED), ("inode %u: inode is dead", ip->i_number)); #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extwrite: mode"); #endif if (ioflag & IO_APPEND) uio->uio_offset = dp->di_extsize; KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0")); KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0")); if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize) return (EFBIG); resid = uio->uio_resid; osize = dp->di_extsize; flags = IO_EXT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ucred, flags, &bp); if (error != 0) break; /* * If the buffer is not valid we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. */ if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) vfs_bio_clrbuf(bp); if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; if (uio->uio_offset + xfersize > dp->di_extsize) dp->di_extsize = uio->uio_offset + xfersize; size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; } /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || xfersize + blkoffset == fs->fs_bsize || (ioflag & (IO_ASYNC | IO_DIRECT))) bawrite(bp); else bdwrite(bp); if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, SUSER_ALLOWJAIL)) { ip->i_mode &= ~(ISUID | ISGID); dp->di_mode = ip->i_mode; } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * Vnode operating to retrieve a named extended attribute. 
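 *
 * Each entry in the EA area, as parsed here and as built by
 * ffs_setextattr() below, has the following layout (sizes in bytes,
 * the record padded out to an 8 byte boundary):
 *
 *	+--------+-----------+------+---------+------+------+------+------+
 *	| length | namespace | pad2 | namelen | name | pad1 | data | pad2 |
 *	|   4    |     1     |  1   |    1    | nlen | 0..7 | ealen| 0..7 |
 *	+--------+-----------+------+---------+------+------+------+------+
 *
 * "length" is the total record length, pad1 aligns the start of the
 * data to 8 bytes, and the pad2 byte in the header records how much
 * trailing padding follows the data.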
* * Locate a particular EA (nspace:name) in the area (ptr:length), and return * the length of the EA, and possibly the pointer to the entry and to the data. */ static int ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac) { u_char *p, *pe, *pn, *p0; int eapad1, eapad2, ealength, ealen, nlen; uint32_t ul; pe = ptr + length; nlen = strlen(name); for (p = ptr; p < pe; p = pn) { p0 = p; bcopy(p, &ul, sizeof(ul)); pn = p + ul; /* make sure this entry is complete */ if (pn > pe) break; p += sizeof(uint32_t); if (*p != nspace) continue; p++; eapad2 = *p++; if (*p != nlen) continue; p++; if (bcmp(p, name, nlen)) continue; ealength = sizeof(uint32_t) + 3 + nlen; eapad1 = 8 - (ealength % 8); if (eapad1 == 8) eapad1 = 0; ealength += eapad1; ealen = ul - ealength - eapad2; p += nlen + eapad1; if (eap != NULL) *eap = p0; if (eac != NULL) *eac = p; return (ealen); } return(-1); } static int ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) { struct inode *ip; struct ufs2_dinode *dp; struct uio luio; struct iovec liovec; int easize, error; u_char *eae; ip = VTOI(vp); dp = ip->i_din2; easize = dp->di_extsize; eae = malloc(easize + extra, M_TEMP, M_WAITOK); liovec.iov_base = eae; liovec.iov_len = easize; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = easize; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_READ; luio.uio_td = td; error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); if (error) { free(eae, M_TEMP); return(error); } *p = eae; return (0); } static int ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) { struct inode *ip; struct ufs2_dinode *dp; int error; ip = VTOI(vp); if (ip->i_ea_area != NULL) return (EBUSY); dp = ip->i_din2; error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); if (error) return (error); ip->i_ea_len = dp->di_extsize; ip->i_ea_error = 0; return (0); } /* * Vnode extattr transaction commit/abort */ static int ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) { struct inode *ip; struct uio luio; struct iovec liovec; int error; struct ufs2_dinode *dp; ip = VTOI(vp); if (ip->i_ea_area == NULL) return (EINVAL); dp = ip->i_din2; error = ip->i_ea_error; if (commit && error == 0) { if (cred == NOCRED) cred = vp->v_mount->mnt_cred; liovec.iov_base = ip->i_ea_area; liovec.iov_len = ip->i_ea_len; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = ip->i_ea_len; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_WRITE; luio.uio_td = td; /* XXX: I'm not happy about truncating to zero size */ if (ip->i_ea_len < dp->di_extsize) error = ffs_truncate(vp, 0, IO_EXT, cred, td); error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); } free(ip->i_ea_area, M_TEMP); ip->i_ea_area = NULL; ip->i_ea_len = 0; ip->i_ea_error = 0; return (error); } /* * Vnode extattr strategy routine for fifos. * * We need to check for a read or write of the external attributes. * Otherwise we just fall through and do the usual thing. 
*/ static int ffsext_strategy(struct vop_strategy_args *ap) /* struct vop_strategy_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct buf *a_bp; }; */ { struct vnode *vp; daddr_t lbn; vp = ap->a_vp; lbn = ap->a_bp->b_lblkno; if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC && lbn < 0 && lbn >= -NXADDR) return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); if (vp->v_type == VFIFO) return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); panic("spec nodes went here"); } /* * Vnode extattr transaction commit/abort */ static int ffs_openextattr(struct vop_openextattr_args *ap) /* struct vop_openextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); } /* * Vnode extattr transaction commit/abort */ static int ffs_closeextattr(struct vop_closeextattr_args *ap) /* struct vop_closeextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_commit; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); } /* * Vnode operation to remove a named attribute. */ static int ffs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; uint32_t ealength, ul; int ealen, olen, eapad1, eapad2, error, i, easize; u_char *eae, *p; int stand_alone; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, IWRITE); if (error) { if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } if (ip->i_ea_area == NULL) { error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); stand_alone = 1; } else { stand_alone = 0; } ealength = eapad1 = ealen = eapad2 = 0; eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &p, NULL); if (olen == -1) { /* delete but nonexistent */ free(eae, M_TEMP); if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return(ENOATTR); } bcopy(p, &ul, sizeof ul); i = p - eae + ul; if (ul != ealength) { bcopy(p + ul, p + ealength, easize - i); easize += (ealength - ul); } if (easize > NXADDR * fs->fs_bsize) { free(eae, M_TEMP); if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); else if (ip->i_ea_error == 0) ip->i_ea_error = ENOSPC; return(ENOSPC); } p = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(p, M_TEMP); if (stand_alone) error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return(error); } /* * Vnode operation to retrieve a named extended attribute. 
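 *
 * Like the other EA operations, this one can run either inside an
 * extattr transaction that ffs_openextattr() has already opened, or
 * stand alone; in sketch form (condensed from the code below):
 *
 *	if (ip->i_ea_area == NULL) {
 *		error = ffs_open_ea(vp, cred, td);	-- read EA area in
 *		stand_alone = 1;
 *	}
 *	look the attribute up with ffs_findextattr() and copy it out;
 *	if (stand_alone)
 *		ffs_close_ea(vp, 0, cred, td);		-- 0: do not write back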
*/ static int ffs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; u_char *eae, *p; unsigned easize; int error, ealen, stand_alone; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, IREAD); if (error) return (error); if (ip->i_ea_area == NULL) { error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); stand_alone = 1; } else { stand_alone = 0; } eae = ip->i_ea_area; easize = ip->i_ea_len; ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, NULL, &p); if (ealen >= 0) { error = 0; if (ap->a_size != NULL) *ap->a_size = ealen; else if (ap->a_uio != NULL) error = uiomove(p, ealen, ap->a_uio); } else error = ENOATTR; if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return(error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ffs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; u_char *eae, *p, *pe, *pn; unsigned easize; uint32_t ul; int error, ealen, stand_alone; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, IREAD); if (error) return (error); if (ip->i_ea_area == NULL) { error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); stand_alone = 1; } else { stand_alone = 0; } eae = ip->i_ea_area; easize = ip->i_ea_len; error = 0; if (ap->a_size != NULL) *ap->a_size = 0; pe = eae + easize; for(p = eae; error == 0 && p < pe; p = pn) { bcopy(p, &ul, sizeof(ul)); pn = p + ul; if (pn > pe) break; p += sizeof(ul); if (*p++ != ap->a_attrnamespace) continue; p++; /* pad2 */ ealen = *p; if (ap->a_size != NULL) { *ap->a_size += ealen + 1; } else if (ap->a_uio != NULL) { error = uiomove(p, ealen + 1, ap->a_uio); } } if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return(error); } /* * Vnode operation to set a named attribute. */ static int ffs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; uint32_t ealength, ul; int ealen, olen, eapad1, eapad2, error, i, easize; u_char *eae, *p; int stand_alone; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); /* XXX Now unsupported API to delete EAs using NULL uio. 
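 *
 * (For reference, the record appended further down is sized as
 * sketched here, using the same names as the code below; the modulo
 * form is equivalent to the "8 - x % 8, or 0 if that is 8" idiom used
 * there:
 *
 *	ealength  = sizeof(uint32_t) + 3 + strlen(name);  -- fixed header
 *	eapad1    = (8 - ealength % 8) % 8;               -- align data start
 *	eapad2    = (8 - ealen % 8) % 8;                  -- align record end
 *	ealength += eapad1 + ealen + eapad2;              -- total record size
 *
 * so a 5 byte name with 10 bytes of data gives a 12 byte header,
 * eapad1 of 4, eapad2 of 6, and a 32 byte record.)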
*/ if (ap->a_uio == NULL) return (EOPNOTSUPP); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, IWRITE); if (error) { if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } if (ip->i_ea_area == NULL) { error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); stand_alone = 1; } else { stand_alone = 0; } ealen = ap->a_uio->uio_resid; ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); eapad1 = 8 - (ealength % 8); if (eapad1 == 8) eapad1 = 0; eapad2 = 8 - (ealen % 8); if (eapad2 == 8) eapad2 = 0; ealength += eapad1 + ealen + eapad2; eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &p, NULL); if (olen == -1) { /* new, append at end */ p = eae + easize; easize += ealength; } else { bcopy(p, &ul, sizeof ul); i = p - eae + ul; if (ul != ealength) { bcopy(p + ul, p + ealength, easize - i); easize += (ealength - ul); } } if (easize > NXADDR * fs->fs_bsize) { free(eae, M_TEMP); if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); else if (ip->i_ea_error == 0) ip->i_ea_error = ENOSPC; return(ENOSPC); } bcopy(&ealength, p, sizeof(ealength)); p += sizeof(ealength); *p++ = ap->a_attrnamespace; *p++ = eapad2; *p++ = strlen(ap->a_name); strcpy(p, ap->a_name); p += strlen(ap->a_name); bzero(p, eapad1); p += eapad1; error = uiomove(p, ealen, ap->a_uio); if (error) { free(eae, M_TEMP); if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); else if (ip->i_ea_error == 0) ip->i_ea_error = error; return(error); } p += ealen; bzero(p, eapad2); p = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(p, M_TEMP); if (stand_alone) error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return(error); } /* * Vnode pointer to File handle */ static int ffs_vptofh(struct vop_vptofh_args *ap) /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); }
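/*
 * Illustrative only: a minimal sketch of the stale-handle check that a
 * matching fhtovp routine is expected to apply to the ufid packed by
 * ffs_vptofh() above.  The helper name and its two-argument form are
 * hypothetical; a real implementation also has to translate ufid_ino
 * back into a locked vnode before it can compare generations.
 */
static int
ffs_ufid_stale_sketch(struct ufid *ufhp, struct inode *ip)
{
	/*
	 * A recycled inode is handed a fresh generation number, so an
	 * old handle that still names the same inode number must be
	 * rejected as stale rather than silently mapped to the new file.
	 */
	if (ufhp->ufid_len != sizeof(struct ufid) ||
	    ufhp->ufid_ino != ip->i_number ||
	    ufhp->ufid_gen != ip->i_gen)
		return (ESTALE);
	return (0);
}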