Index: head/sys/coda/coda_vnops.c =================================================================== --- head/sys/coda/coda_vnops.c (revision 75579) +++ head/sys/coda/coda_vnops.c (revision 75580) @@ -1,1957 +1,1956 @@ /* * * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ * $FreeBSD$ * */ /* * Mach Operating System * Copyright (c) 1990 Carnegie-Mellon University * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda file system at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * These flags select various performance enhancements. */ int coda_attr_cache = 1; /* Set to cache attributes in the kernel */ int coda_symlink_cache = 1; /* Set to cache symbolic link information */ int coda_access_cache = 1; /* Set to handle some access checks directly */ /* structure to keep track of vfs calls */ struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE]; #define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++) #define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++) /* What we are delaying for in printf */ int coda_printf_delay = 0; /* in microseconds */ int coda_vnop_print_entry = 0; static int coda_lockdebug = 0; /* Definition of the vfs operation vector */ /* * Some NetBSD details: * * coda_start is called at the end of the mount syscall. * coda_init is called at boot time. 
*/ #define ENTRY if(coda_vnop_print_entry) myprintf(("Entered %s\n",__FUNCTION__)) /* Definition of the vnode operation vector */ struct vnodeopv_entry_desc coda_vnodeop_entries[] = { { &vop_default_desc, coda_vop_error }, { &vop_lookup_desc, coda_lookup }, /* lookup */ { &vop_create_desc, coda_create }, /* create */ { &vop_mknod_desc, coda_vop_error }, /* mknod */ { &vop_open_desc, coda_open }, /* open */ { &vop_close_desc, coda_close }, /* close */ { &vop_access_desc, coda_access }, /* access */ { &vop_getattr_desc, coda_getattr }, /* getattr */ { &vop_setattr_desc, coda_setattr }, /* setattr */ { &vop_read_desc, coda_read }, /* read */ { &vop_write_desc, coda_write }, /* write */ { &vop_ioctl_desc, coda_ioctl }, /* ioctl */ { &vop_fsync_desc, coda_fsync }, /* fsync */ { &vop_remove_desc, coda_remove }, /* remove */ { &vop_link_desc, coda_link }, /* link */ { &vop_rename_desc, coda_rename }, /* rename */ { &vop_mkdir_desc, coda_mkdir }, /* mkdir */ { &vop_rmdir_desc, coda_rmdir }, /* rmdir */ { &vop_symlink_desc, coda_symlink }, /* symlink */ { &vop_readdir_desc, coda_readdir }, /* readdir */ { &vop_readlink_desc, coda_readlink }, /* readlink */ { &vop_inactive_desc, coda_inactive }, /* inactive */ { &vop_reclaim_desc, coda_reclaim }, /* reclaim */ { &vop_lock_desc, coda_lock }, /* lock */ { &vop_unlock_desc, coda_unlock }, /* unlock */ { &vop_bmap_desc, coda_bmap }, /* bmap */ { &vop_strategy_desc, coda_strategy }, /* strategy */ { &vop_print_desc, coda_vop_error }, /* print */ { &vop_islocked_desc, coda_islocked }, /* islocked */ { &vop_pathconf_desc, coda_vop_error }, /* pathconf */ { &vop_advlock_desc, coda_vop_nop }, /* advlock */ - { &vop_bwrite_desc, coda_vop_error }, /* bwrite */ { &vop_lease_desc, coda_vop_nop }, /* lease */ { &vop_poll_desc, (vop_t *) vop_stdpoll }, { &vop_getpages_desc, coda_fbsd_getpages }, /* pager intf.*/ { &vop_putpages_desc, coda_fbsd_putpages }, /* pager intf.*/ #if 0 we need to define these someday #define UFS_BLKATOFF(aa, bb, 
cc, dd) VFSTOUFS((aa)->v_mount)->um_blkatoff(aa, bb, cc, dd) #define UFS_VALLOC(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_valloc(aa, bb, cc, dd) #define UFS_VFREE(aa, bb, cc) VFSTOUFS((aa)->v_mount)->um_vfree(aa, bb, cc) #define UFS_TRUNCATE(aa, bb, cc, dd, ee) VFSTOUFS((aa)->v_mount)->um_truncate(aa, bb, cc, dd, ee) #define UFS_UPDATE(aa, bb) VFSTOUFS((aa)->v_mount)->um_update(aa, bb) missing { &vop_reallocblks_desc, (vop_t *) ufs_missingop }, { &vop_cachedlookup_desc, (vop_t *) ufs_lookup }, { &vop_whiteout_desc, (vop_t *) ufs_whiteout }, #endif { (struct vnodeop_desc*)NULL, (int(*)(void *))NULL } }; static struct vnodeopv_desc coda_vnodeop_opv_desc = { &coda_vnodeop_p, coda_vnodeop_entries }; VNODEOP_SET(coda_vnodeop_opv_desc); /* A generic panic: we were called with something we didn't define yet */ int coda_vop_error(void *anon) { struct vnodeop_desc **desc = (struct vnodeop_desc **)anon; myprintf(("coda_vop_error: Vnode operation %s called, but not defined.\n", (*desc)->vdesc_name)); /* panic("coda_vop_error"); */ return EIO; } /* A generic do-nothing. For lease_check, advlock */ int coda_vop_nop(void *anon) { struct vnodeop_desc **desc = (struct vnodeop_desc **)anon; if (codadebug) { myprintf(("Vnode operation %s called, but unsupported\n", (*desc)->vdesc_name)); } return (0); } int coda_vnodeopstats_init(void) { register int i; for(i=0;ia_vp); struct cnode *cp = VTOC(*vpp); int flag = ap->a_mode & (~O_EXCL); struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; struct vnode *vp; dev_t dev; ino_t inode; MARK_ENTRY(CODA_OPEN_STATS); /* Check for open of control file. 
*/ if (IS_CTL_VP(*vpp)) { /* XXX */ /* if (WRITEABLE(flag)) */ if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) { MARK_INT_FAIL(CODA_OPEN_STATS); return(EACCES); } MARK_INT_SAT(CODA_OPEN_STATS); return(0); } error = venus_open(vtomi((*vpp)), &cp->c_fid, flag, cred, p, &dev, &inode); if (error) return (error); if (!error) { CODADEBUG( CODA_OPEN,myprintf(("open: dev %#lx inode %lu result %d\n", (u_long)dev2udev(dev), (u_long)inode, error)); ) } /* Translate the pair for the cache file into an inode pointer. */ error = coda_grab_vnode(dev, inode, &vp); if (error) return (error); /* We get the vnode back locked. Needs unlocked */ VOP_UNLOCK(vp, 0, p); /* Keep a reference until the close comes in. */ vref(*vpp); /* Save the vnode pointer for the cache file. */ if (cp->c_ovp == NULL) { cp->c_ovp = vp; } else { if (cp->c_ovp != vp) panic("coda_open: cp->c_ovp != ITOV(ip)"); } cp->c_ocount++; /* Flush the attribute cached if writing the file. */ if (flag & FWRITE) { cp->c_owrite++; cp->c_flags &= ~C_VATTR; } /* Save the pair for the cache file to speed up subsequent page_read's. */ cp->c_device = dev; cp->c_inode = inode; /* Open the cache file. */ error = VOP_OPEN(vp, flag, cred, p); if (error) { printf("coda_open: VOP_OPEN on container failed %d\n", error); return (error); } /* grab (above) does this when it calls newvnode unless it's in the cache*/ if (vp->v_type == VREG) { error = vfs_object_create(vp, p, cred); if (error != 0) { printf("coda_open: vfs_object_create() returns %d\n", error); vput(vp); } } return(error); } /* * Close the cache file used for I/O and notify Venus. */ int coda_close(v) void *v; { /* true args */ struct vop_close_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; MARK_ENTRY(CODA_CLOSE_STATS); /* Check for close of control file. 
*/ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_CLOSE_STATS); return(0); } if (IS_UNMOUNTING(cp)) { if (cp->c_ovp) { #ifdef CODA_VERBOSE printf("coda_close: destroying container ref %d, ufs vp %p of vp %p/cp %p\n", vp->v_usecount, cp->c_ovp, vp, cp); #endif #ifdef hmm vgone(cp->c_ovp); #else VOP_CLOSE(cp->c_ovp, flag, cred, p); /* Do errors matter here? */ vrele(cp->c_ovp); #endif } else { #ifdef CODA_VERBOSE printf("coda_close: NO container vp %p/cp %p\n", vp, cp); #endif } return ENODEV; } else { VOP_CLOSE(cp->c_ovp, flag, cred, p); /* Do errors matter here? */ vrele(cp->c_ovp); } if (--cp->c_ocount == 0) cp->c_ovp = NULL; if (flag & FWRITE) /* file was opened for write */ --cp->c_owrite; error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, p); vrele(CTOV(cp)); CODADEBUG(CODA_CLOSE, myprintf(("close: result %d\n",error)); ) return(error); } int coda_read(v) void *v; { struct vop_read_args *ap = v; ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_procp)); } int coda_write(v) void *v; { struct vop_write_args *ap = v; ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_procp)); } int coda_rdwr(vp, uiop, rw, ioflag, cred, p) struct vnode *vp; struct uio *uiop; enum uio_rw rw; int ioflag; struct ucred *cred; struct proc *p; { /* upcall decl */ /* NOTE: container file operation!!! */ /* locals */ struct cnode *cp = VTOC(vp); struct vnode *cfvp = cp->c_ovp; int igot_internally = 0; int opened_internally = 0; int error = 0; MARK_ENTRY(CODA_RDWR_STATS); CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %d, %lld, %d)\n", rw, (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for rdwr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_RDWR_STATS); return(EINVAL); } /* * If file is not already open this must be a page * {read,write} request. Iget the cache file's inode * pointer if we still have its pair. 
* Otherwise, we must do an internal open to derive the * pair. */ if (cfvp == NULL) { /* * If we're dumping core, do the internal open. Otherwise * venus won't have the correct size of the core when * it's completely written. */ PROC_LOCK(p); if (cp->c_inode != 0 && !(p && (p->p_acflag & ACORE))) { PROC_UNLOCK(p); igot_internally = 1; error = coda_grab_vnode(cp->c_device, cp->c_inode, &cfvp); if (error) { MARK_INT_FAIL(CODA_RDWR_STATS); return(error); } /* * We get the vnode back locked in both Mach and * NetBSD. Needs unlocked */ VOP_UNLOCK(cfvp, 0, p); } else { PROC_UNLOCK(p); opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, p); printf("coda_rdwr: Internally Opening %p\n", vp); if (error) { printf("coda_rdwr: VOP_OPEN on container failed %d\n", error); return (error); } if (vp->v_type == VREG) { error = vfs_object_create(vp, p, cred); if (error != 0) { printf("coda_rdwr: vfs_object_create() returns %d\n", error); vput(vp); } } if (error) { MARK_INT_FAIL(CODA_RDWR_STATS); return(error); } cfvp = cp->c_ovp; } } /* Have UFS handle the call. */ CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = (%lx.%lx.%lx), refcnt = %d\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, CTOV(cp)->v_usecount)); ) if (rw == UIO_READ) { error = VOP_READ(cfvp, uiop, ioflag, cred); } else { error = VOP_WRITE(cfvp, uiop, ioflag, cred); /* ufs_write updates the vnode_pager_setsize for the vnode/object */ { struct vattr attr; if (VOP_GETATTR(cfvp, &attr, cred, p) == 0) { vnode_pager_setsize(vp, attr.va_size); } } } if (error) MARK_INT_FAIL(CODA_RDWR_STATS); else MARK_INT_SAT(CODA_RDWR_STATS); /* Do an internal close if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, p); } /* Invalidate cached attributes if writing. 
*/ if (rw == UIO_WRITE) cp->c_flags &= ~C_VATTR; return(error); } int coda_ioctl(v) void *v; { /* true args */ struct vop_ioctl_args *ap = v; struct vnode *vp = ap->a_vp; int com = ap->a_command; caddr_t data = ap->a_data; int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; struct vnode *tvp; struct nameidata ndp; struct PioctlData *iap = (struct PioctlData *)data; MARK_ENTRY(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));) /* Don't check for operation on a dying object, for ctlvp it shouldn't matter */ /* Must be control object to succeed. */ if (!IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: vp != ctlvp"));) return (EOPNOTSUPP); } /* Look up the pathname. */ /* Should we use the name cache here? It would get it from lookupname sooner or later anyway, right? */ NDINIT(&ndp, LOOKUP, (iap->follow ? FOLLOW : NOFOLLOW), UIO_USERSPACE, iap->path, p); error = namei(&ndp); tvp = ndp.ni_vp; if (error) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: lookup returns %d\n", error));) return(error); } /* * Make sure this is a coda style cnode, but it may be a * different vfsp */ if (tvp->v_op != coda_vnodeop_p) { vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: %s not a coda object\n", iap->path));) return(EINVAL); } if (iap->vi.in_size > VC_MAXDATASIZE) { NDFREE(&ndp, 0); return(EINVAL); } error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data, cred, p); if (error) MARK_INT_FAIL(CODA_IOCTL_STATS); else CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); ) vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); return(error); } /* * To reduce the cost of a user-level venus;we cache attributes in * the kernel. Each cnode has storage allocated for an attribute. 
If * c_vattr is valid, return a reference to it. Otherwise, get the * attributes from venus and store them in the cnode. There is some * question if this method is a security leak. But I think that in * order to make this call, the user must have done a lookup and * opened the file, and therefore should already have access. */ int coda_getattr(v) void *v; { /* true args */ struct vop_getattr_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; MARK_ENTRY(CODA_GETATTR_STATS); if (IS_UNMOUNTING(cp)) return ENODEV; /* Check for getattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_GETATTR_STATS); return(ENOENT); } /* Check to see if the attributes have already been cached */ if (VALID_VATTR(cp)) { CODADEBUG(CODA_GETATTR, { myprintf(("attr cache hit: (%lx.%lx.%lx)\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique));}); CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(&cp->c_vattr); ); *vap = cp->c_vattr; MARK_INT_SAT(CODA_GETATTR_STATS); return(0); } error = venus_getattr(vtomi(vp), &cp->c_fid, cred, p, vap); if (!error) { CODADEBUG(CODA_GETATTR, myprintf(("getattr miss (%lx.%lx.%lx): result %d\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, error)); ) CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(vap); ); { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } /* If not open for write, store attributes in cnode */ if ((cp->c_owrite == 0) && (coda_attr_cache)) { cp->c_vattr = *vap; cp->c_flags |= C_VATTR; } } return(error); } int coda_setattr(v) void *v; { /* true args */ struct vop_setattr_args *ap = v; register struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int 
error; MARK_ENTRY(CODA_SETATTR_STATS); /* Check for setattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_SETATTR_STATS); return(ENOENT); } if (codadebug & CODADBGMSK(CODA_SETATTR)) { print_vattr(vap); } error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, p); if (!error) cp->c_flags &= ~C_VATTR; { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (size != VNOVAL && convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } CODADEBUG(CODA_SETATTR, myprintf(("setattr %d\n", error)); ) return(error); } int coda_access(v) void *v; { /* true args */ struct vop_access_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int mode = ap->a_mode; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; MARK_ENTRY(CODA_ACCESS_STATS); /* Check for access of control object. Only read access is allowed on it. */ if (IS_CTL_VP(vp)) { /* bogus hack - all will be marked as successes */ MARK_INT_SAT(CODA_ACCESS_STATS); return(((mode & VREAD) && !(mode & (VWRITE | VEXEC))) ? 0 : EACCES); } /* * if the file is a directory, and we are checking exec (eg lookup) * access, and the file is in the namecache, then the user must have * lookup access to it. */ if (coda_access_cache) { if ((vp->v_type == VDIR) && (mode & VEXEC)) { if (coda_nc_lookup(cp, ".", 1, cred)) { MARK_INT_SAT(CODA_ACCESS_STATS); return(0); /* it was in the cache */ } } } error = venus_access(vtomi(vp), &cp->c_fid, mode, cred, p); return(error); } int coda_readlink(v) void *v; { /* true args */ struct vop_readlink_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_uio->uio_procp; /* locals */ int error; char *str; int len; MARK_ENTRY(CODA_READLINK_STATS); /* Check for readlink of control object. 
*/ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READLINK_STATS); return(ENOENT); } if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */ uiop->uio_rw = UIO_READ; error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop); if (error) MARK_INT_FAIL(CODA_READLINK_STATS); else MARK_INT_SAT(CODA_READLINK_STATS); return(error); } error = venus_readlink(vtomi(vp), &cp->c_fid, cred, p, &str, &len); if (!error) { uiop->uio_rw = UIO_READ; error = uiomove(str, len, uiop); if (coda_symlink_cache) { cp->c_symlink = str; cp->c_symlen = len; cp->c_flags |= C_SYMLINK; } else CODA_FREE(str, len); } CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));) return(error); } int coda_fsync(v) void *v; { /* true args */ struct vop_fsync_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ struct vnode *convp = cp->c_ovp; int error; MARK_ENTRY(CODA_FSYNC_STATS); /* Check for fsync on an unmounting object */ /* The NetBSD kernel, in it's infinite wisdom, can try to fsync * after an unmount has been initiated. This is a Bad Thing, * which we have to avoid. Not a legitimate failure for stats. */ if (IS_UNMOUNTING(cp)) { return(ENODEV); } /* Check for fsync of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_FSYNC_STATS); return(0); } if (convp) VOP_FSYNC(convp, cred, MNT_WAIT, p); /* * We see fsyncs with usecount == 1 then usecount == 0. * For now we ignore them. */ /* if (!vp->v_usecount) { printf("coda_fsync on vnode %p with %d usecount. c_flags = %x (%x)\n", vp, vp->v_usecount, cp->c_flags, cp->c_flags&C_PURGING); } */ /* * We can expect fsync on any vnode at all if venus is pruging it. * Venus can't very well answer the fsync request, now can it? * Hopefully, it won't have to, because hopefully, venus preserves * the (possibly untrue) invariant that it never purges an open * vnode. Hopefully. 
*/ if (cp->c_flags & C_PURGING) { return(0); } /* needs research */ return 0; error = venus_fsync(vtomi(vp), &cp->c_fid, cred, p); CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); ); return(error); } int coda_inactive(v) void *v; { /* XXX - at the moment, inactive doesn't look at cred, and doesn't have a proc pointer. Oops. */ /* true args */ struct vop_inactive_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct ucred *cred __attribute__((unused)) = NULL; struct proc *p __attribute__((unused)) = curproc; /* upcall decl */ /* locals */ /* We don't need to send inactive to venus - DCS */ MARK_ENTRY(CODA_INACTIVE_STATS); if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_INACTIVE_STATS); return 0; } CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %lx.%lx.%lx. vfsp %p\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, vp->v_mount));) /* If an array has been allocated to hold the symlink, deallocate it */ if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { if (cp->c_symlink == NULL) panic("coda_inactive: null symlink pointer in cnode"); CODA_FREE(cp->c_symlink, cp->c_symlen); cp->c_flags &= ~C_SYMLINK; cp->c_symlen = 0; } /* Remove it from the table so it can't be found. */ coda_unsave(cp); if ((struct coda_mntinfo *)(vp->v_mount->mnt_data) == NULL) { myprintf(("Help! 
vfsp->vfs_data was NULL, but vnode %p wasn't dying\n", vp)); panic("badness in coda_inactive\n"); } if (IS_UNMOUNTING(cp)) { #ifdef DEBUG printf("coda_inactive: IS_UNMOUNTING use %d: vp %p, cp %p\n", vp->v_usecount, vp, cp); if (cp->c_ovp != NULL) printf("coda_inactive: cp->ovp != NULL use %d: vp %p, cp %p\n", vp->v_usecount, vp, cp); #endif lockmgr(&cp->c_lock, LK_RELEASE, &vp->v_interlock, p); } else { #ifdef OLD_DIAGNOSTIC if (CTOV(cp)->v_usecount) { panic("coda_inactive: nonzero reference count"); } if (cp->c_ovp != NULL) { panic("coda_inactive: cp->ovp != NULL"); } #endif VOP_UNLOCK(vp, 0, p); vgone(vp); } MARK_INT_SAT(CODA_INACTIVE_STATS); return(0); } /* * Remote file system operations having to do with directory manipulation. */ /* * It appears that in NetBSD, lookup is supposed to return the vnode locked */ int coda_lookup(v) void *v; { /* true args */ struct vop_lookup_args *ap = v; struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct vnode **vpp = ap->a_vpp; /* * It looks as though ap->a_cnp->ni_cnd->cn_nameptr holds the rest * of the string to xlate, and that we must try to get at least * ap->a_cnp->ni_cnd->cn_namelen of those characters to macth. I * could be wrong. */ struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* locals */ struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; ViceFid VFid; int vtype; int error = 0; MARK_ENTRY(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup: %s in %lx.%lx.%lx\n", nm, dcp->c_fid.Volume, dcp->c_fid.Vnode, dcp->c_fid.Unique));); /* Check for lookup of control object. 
*/ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = coda_ctlvp; vref(*vpp); MARK_INT_SAT(CODA_LOOKUP_STATS); goto exit; } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("name too long: lookup, %lx.%lx.%lx(%s)\n", dcp->c_fid.Volume, dcp->c_fid.Vnode, dcp->c_fid.Unique, nm));); *vpp = (struct vnode *)0; error = EINVAL; goto exit; } /* First try to look the file up in the cfs name cache */ /* lock the parent vnode? */ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) { *vpp = CTOV(cp); vref(*vpp); CODADEBUG(CODA_LOOKUP, myprintf(("lookup result %d vpp %p\n",error,*vpp));) } else { /* The name wasn't cached, so we need to contact Venus */ error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, p, &VFid, &vtype); if (error) { MARK_INT_FAIL(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup error on %lx.%lx.%lx(%s)%d\n", dcp->c_fid.Volume, dcp->c_fid.Vnode, dcp->c_fid.Unique, nm, error));) *vpp = (struct vnode *)0; } else { MARK_INT_SAT(CODA_LOOKUP_STATS); CODADEBUG(CODA_LOOKUP, myprintf(("lookup: vol %lx vno %lx uni %lx type %o result %d\n", VFid.Volume, VFid.Vnode, VFid.Unique, vtype, error)); ) cp = make_coda_node(&VFid, dvp->v_mount, vtype); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache only if the top bit isn't set */ /* And don't enter a new vnode for an invalid one! */ if (!(vtype & CODA_NOCACHE)) coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); } } exit: /* * If we are creating, and this was the last name to be looked up, * and the error was ENOENT, then there really shouldn't be an * error and we can make the leaf NULL and return success. Since * this is supposed to work under Mach as well as NetBSD, we're * leaving this fn wrapped. We also must tell lookup/namei that * we need to save the last component of the name. (Create will * have to free the name buffer later...lucky us...) 
*/ if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && (cnp->cn_flags & ISLASTCN) && (error == ENOENT)) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; *ap->a_vpp = NULL; } /* * If we are removing, and we are at the last element, and we * found it, then we need to keep the name around so that the * removal will go ahead as planned. Unfortunately, this will * probably also lock the to-be-removed vnode, which may or may * not be a good idea. I'll have to look at the bits of * coda_remove to make sure. We'll only save the name if we did in * fact find the name, otherwise coda_remove won't have a chance * to free the pathname. */ if ((cnp->cn_nameiop == DELETE) && (cnp->cn_flags & ISLASTCN) && !error) { cnp->cn_flags |= SAVENAME; } /* * If the lookup went well, we need to (potentially?) unlock the * parent, and lock the child. We are only responsible for * checking to see if the parent is supposed to be unlocked before * we return. We must always lock the child (provided there is * one, and (the parent isn't locked or it isn't the same as the * parent.) Simple, huh? We can never leave the parent locked unless * we are ISLASTCN */ if (!error || (error == EJUSTRETURN)) { if (!(cnp->cn_flags & LOCKPARENT) || !(cnp->cn_flags & ISLASTCN)) { if ((error = VOP_UNLOCK(dvp, 0, p))) { return error; } /* * The parent is unlocked. As long as there is a child, * lock it without bothering to check anything else. */ if (*ap->a_vpp) { if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, p))) { printf("coda_lookup: "); panic("unlocked parent but couldn't lock child"); } } } else { /* The parent is locked, and may be the same as the child */ if (*ap->a_vpp && (*ap->a_vpp != dvp)) { /* Different, go ahead and lock it. */ if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, p))) { printf("coda_lookup: "); panic("unlocked parent but couldn't lock child"); } } } } else { /* If the lookup failed, we need to ensure that the leaf is NULL */ /* Don't change any locking? 
*/ *ap->a_vpp = NULL; } return(error); } /*ARGSUSED*/ int coda_create(v) void *v; { /* true args */ struct vop_create_args *ap = v; struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct vattr *va = ap->a_vap; int exclusive = 1; int mode = ap->a_vap->va_mode; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* locals */ int error; struct cnode *cp; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; ViceFid VFid; struct vattr attr; MARK_ENTRY(CODA_CREATE_STATS); /* All creates are exclusive XXX */ /* I'm assuming the 'mode' argument is the file mode bits XXX */ /* Check for create of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_CREATE_STATS); return(EACCES); } error = venus_create(vtomi(dvp), &dcp->c_fid, nm, len, exclusive, mode, va, cred, p, &VFid, &attr); if (!error) { /* If this is an exclusive create, panic if the file already exists. */ /* Venus should have detected the file and reported EEXIST. */ if ((exclusive == 1) && (coda_find(&VFid) != NULL)) panic("cnode existed for newly created file!"); cp = make_coda_node(&VFid, dvp->v_mount, attr.va_type); *vpp = CTOV(cp); /* Update va to reflect the new attributes. 
*/ (*va) = attr; /* Update the attribute cache and mark it as valid */ if (coda_attr_cache) { VTOC(*vpp)->c_vattr = attr; VTOC(*vpp)->c_flags |= C_VATTR; } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); CODADEBUG(CODA_CREATE, myprintf(("create: (%lx.%lx.%lx), result %d\n", VFid.Volume, VFid.Vnode, VFid.Unique, error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_CREATE, myprintf(("create error %d\n", error));) } if (!error) { if (cnp->cn_flags & LOCKLEAF) { if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, p))) { printf("coda_create: "); panic("unlocked parent but couldn't lock child"); } } #ifdef OLD_DIAGNOSTIC else { printf("coda_create: LOCKLEAF not set!\n"); } #endif } return(error); } int coda_remove(v) void *v; { /* true args */ struct vop_remove_args *ap = v; struct vnode *dvp = ap->a_dvp; struct cnode *cp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *tp; MARK_ENTRY(CODA_REMOVE_STATS); CODADEBUG(CODA_REMOVE, myprintf(("remove: %s in %lx.%lx.%lx\n", nm, cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique));); /* Remove the file's entry from the CODA Name Cache */ /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* I'm gonna go out on a limb here. If a file and a hardlink to it * exist, and one is removed, the link count on the other will be * off by 1. We could either invalidate the attrs if cached, or * fix them. I'll try to fix them. 
DCS 11/8/94 */ tp = coda_nc_lookup(VTOC(dvp), nm, len, cred); if (tp) { if (VALID_VATTR(tp)) { /* If attrs are cached */ if (tp->c_vattr.va_nlink > 1) { /* If it's a hard link */ tp->c_vattr.va_nlink--; } } coda_nc_zapfile(VTOC(dvp), nm, len); /* No need to flush it if it doesn't exist! */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* Check for remove of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_REMOVE_STATS); return(ENOENT); } error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, p); CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); ) return(error); } int coda_link(v) void *v; { /* true args */ struct vop_link_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vnode *tdvp = ap->a_tdvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; MARK_ENTRY(CODA_LINK_STATS); if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("nb_link: vp fid: (%lx.%lx.%lx)\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique)); myprintf(("nb_link: tdvp fid: (%lx.%lx.%lx)\n", tdcp->c_fid.Volume, tdcp->c_fid.Vnode, tdcp->c_fid.Unique)); } if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("link: vp fid: (%lx.%lx.%lx)\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique)); myprintf(("link: tdvp fid: (%lx.%lx.%lx)\n", tdcp->c_fid.Volume, tdcp->c_fid.Vnode, tdcp->c_fid.Unique)); } /* Check for link to/from control object. 
*/ if (IS_CTL_NAME(tdvp, nm, len) || IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_LINK_STATS); return(EACCES); } error = venus_link(vtomi(vp), &cp->c_fid, &tdcp->c_fid, nm, len, cred, p); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(tdvp)->c_flags &= ~C_VATTR; VTOC(vp)->c_flags &= ~C_VATTR; CODADEBUG(CODA_LINK, myprintf(("in link result %d\n",error)); ) return(error); } int coda_rename(v) void *v; { /* true args */ struct vop_rename_args *ap = v; struct vnode *odvp = ap->a_fdvp; struct cnode *odcp = VTOC(odvp); struct componentname *fcnp = ap->a_fcnp; struct vnode *ndvp = ap->a_tdvp; struct cnode *ndcp = VTOC(ndvp); struct componentname *tcnp = ap->a_tcnp; struct ucred *cred = fcnp->cn_cred; struct proc *p = fcnp->cn_proc; /* true args */ int error; const char *fnm = fcnp->cn_nameptr; int flen = fcnp->cn_namelen; const char *tnm = tcnp->cn_nameptr; int tlen = tcnp->cn_namelen; MARK_ENTRY(CODA_RENAME_STATS); /* Hmmm. The vnodes are already looked up. Perhaps they are locked? This could be Bad. XXX */ #ifdef OLD_DIAGNOSTIC if ((fcnp->cn_cred != tcnp->cn_cred) || (fcnp->cn_proc != tcnp->cn_proc)) { panic("coda_rename: component names don't agree"); } #endif /* Check for rename involving control object. */ if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) { MARK_INT_FAIL(CODA_RENAME_STATS); return(EACCES); } /* Problem with moving directories -- need to flush entry for .. 
*/ if (odvp != ndvp) { struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred); if (ovcp) { struct vnode *ovp = CTOV(ovcp); if ((ovp) && (ovp->v_type == VDIR)) /* If it's a directory */ coda_nc_zapfile(VTOC(ovp),"..", 2); } } /* Remove the entries for both source and target files */ coda_nc_zapfile(VTOC(odvp), fnm, flen); coda_nc_zapfile(VTOC(ndvp), tnm, tlen); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(odvp)->c_flags &= ~C_VATTR; VTOC(ndvp)->c_flags &= ~C_VATTR; if (flen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } if (tlen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, p); exit: CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));) /* XXX - do we need to call cache pureg on the moved vnode? */ cache_purge(ap->a_fvp); /* It seems to be incumbent on us to drop locks on all four vnodes */ /* From-vnodes are not locked, only ref'd. To-vnodes are locked. */ vrele(ap->a_fvp); vrele(odvp); if (ap->a_tvp) { if (ap->a_tvp == ndvp) { vrele(ap->a_tvp); } else { vput(ap->a_tvp); } } vput(ndvp); return(error); } int coda_mkdir(v) void *v; { /* true args */ struct vop_mkdir_args *ap = v; struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; register struct vattr *va = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; ViceFid VFid; struct vattr ova; MARK_ENTRY(CODA_MKDIR_STATS); /* Check for mkdir of target object. 
*/ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } if (len+1 > CODA_MAXNAMLEN) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, p, &VFid, &ova); if (!error) { if (coda_find(&VFid) != NULL) panic("cnode existed for newly created directory!"); cp = make_coda_node(&VFid, dvp->v_mount, va->va_type); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); /* as a side effect, enter "." and ".." for the directory */ coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp)); coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp)); if (coda_attr_cache) { VTOC(*vpp)->c_vattr = ova; /* update the attr cache */ VTOC(*vpp)->c_flags |= C_VATTR; /* Valid attributes in cnode */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; CODADEBUG( CODA_MKDIR, myprintf(("mkdir: (%lx.%lx.%lx) result %d\n", VFid.Volume, VFid.Vnode, VFid.Unique, error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_MKDIR, myprintf(("mkdir error %d\n",error));) } return(error); } int coda_rmdir(v) void *v; { /* true args */ struct vop_rmdir_args *ap = v; struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* true args */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; MARK_ENTRY(CODA_RMDIR_STATS); /* Check for rmdir of control object. 
*/ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_RMDIR_STATS); return(ENOENT); } /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* * As a side effect of the rmdir, remove any entries for children of * the directory, especially "." and "..". */ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL); /* Remove the file's entry from the CODA Name Cache */ coda_nc_zapfile(dcp, nm, len); /* Invalidate the parent's attr cache, the modification time has changed */ dcp->c_flags &= ~C_VATTR; error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, p); CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); ) return(error); } int coda_symlink(v) void *v; { /* true args */ struct vop_symlink_args *ap = v; struct vnode *tdvp = ap->a_dvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct vattr *tva = ap->a_vap; char *path = ap->a_target; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; struct vnode **vpp = ap->a_vpp; /* locals */ int error; /* * XXX I'm assuming the following things about coda_symlink's * arguments: * t(foo) is the new name/parent/etc being created. * lname is the contents of the new symlink. */ char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; int plen = strlen(path); /* * Here's the strategy for the moment: perform the symlink, then * do a lookup to grab the resulting vnode. I know this requires * two communications with Venus for a new sybolic link, but * that's the way the ball bounces. I don't yet want to change * the way the Mach symlink works. When Mach support is * deprecated, we should change symlink so that the common case * returns the resultant vnode in a vpp argument. */ MARK_ENTRY(CODA_SYMLINK_STATS); /* Check for symlink of control object. 
*/ if (IS_CTL_NAME(tdvp, nm, len)) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EACCES); } if (plen+1 > CODA_MAXPATHLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EINVAL); } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); error = EINVAL; goto exit; } error = venus_symlink(vtomi(tdvp), &tdcp->c_fid, path, plen, nm, len, tva, cred, p); /* Invalidate the parent's attr cache, the modification time has changed */ tdcp->c_flags &= ~C_VATTR; if (error == 0) error = VOP_LOOKUP(tdvp, vpp, cnp); exit: CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); ) return(error); } /* * Read directory entries. */ int coda_readdir(v) void *v; { /* true args */ struct vop_readdir_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; int *eofflag = ap->a_eofflag; u_long **cookies = ap->a_cookies; int *ncookies = ap->a_ncookies; struct proc *p = ap->a_uio->uio_procp; /* upcall decl */ /* locals */ int error = 0; MARK_ENTRY(CODA_READDIR_STATS); CODADEBUG(CODA_READDIR, myprintf(("coda_readdir(%p, %d, %lld, %d)\n", (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for readdir of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READDIR_STATS); return(ENOENT); } { /* If directory is not already open do an "internal open" on it. */ int opened_internally = 0; if (cp->c_ovp == NULL) { opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, FREAD, cred, p); printf("coda_readdir: Internally Opening %p\n", vp); if (error) { printf("coda_readdir: VOP_OPEN on container failed %d\n", error); return (error); } if (vp->v_type == VREG) { error = vfs_object_create(vp, p, cred); if (error != 0) { printf("coda_readdir: vfs_object_create() returns %d\n", error); vput(vp); } } if (error) return(error); } /* Have UFS handle the call. 
*/ CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = (%lx.%lx.%lx), refcnt = %d\n",cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, vp->v_usecount)); ) error = VOP_READDIR(cp->c_ovp, uiop, cred, eofflag, ncookies, cookies); if (error) MARK_INT_FAIL(CODA_READDIR_STATS); else MARK_INT_SAT(CODA_READDIR_STATS); /* Do an "internal close" if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, FREAD, cred, p); } } return(error); } /* * Convert from file system blocks to device blocks */ int coda_bmap(v) void *v; { /* XXX on the global proc */ /* true args */ struct vop_bmap_args *ap = v; struct vnode *vp __attribute__((unused)) = ap->a_vp; /* file's vnode */ daddr_t bn __attribute__((unused)) = ap->a_bn; /* fs block number */ struct vnode **vpp = ap->a_vpp; /* RETURN vp of device */ daddr_t *bnp __attribute__((unused)) = ap->a_bnp; /* RETURN device block number */ struct proc *p __attribute__((unused)) = curproc; /* upcall decl */ /* locals */ int ret = 0; struct cnode *cp; cp = VTOC(vp); if (cp->c_ovp) { return EINVAL; ret = VOP_BMAP(cp->c_ovp, bn, vpp, bnp, ap->a_runp, ap->a_runb); #if 0 printf("VOP_BMAP(cp->c_ovp %p, bn %p, vpp %p, bnp %p, ap->a_runp %p, ap->a_runb %p) = %d\n", cp->c_ovp, bn, vpp, bnp, ap->a_runp, ap->a_runb, ret); #endif return ret; } else { #if 0 printf("coda_bmap: no container\n"); #endif return(EOPNOTSUPP); } } /* * I don't think the following two things are used anywhere, so I've * commented them out * * struct buf *async_bufhead; * int async_daemon_count; */ int coda_strategy(v) void *v; { /* true args */ struct vop_strategy_args *ap = v; register struct buf *bp __attribute__((unused)) = ap->a_bp; struct proc *p __attribute__((unused)) = curproc; /* upcall decl */ /* locals */ printf("coda_strategy: called ???\n"); return(EOPNOTSUPP); } int coda_reclaim(v) void *v; { /* true args */ struct vop_reclaim_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* 
locals */ /* * Forced unmount/flush will let vnodes with non zero use be destroyed! */ ENTRY; if (IS_UNMOUNTING(cp)) { #ifdef DEBUG if (VTOC(vp)->c_ovp) { if (IS_UNMOUNTING(cp)) printf("coda_reclaim: c_ovp not void: vp %p, cp %p\n", vp, cp); } #endif } else { #ifdef OLD_DIAGNOSTIC if (vp->v_usecount != 0) print("coda_reclaim: pushing active %p\n", vp); if (VTOC(vp)->c_ovp) { panic("coda_reclaim: c_ovp not void"); } #endif } cache_purge(vp); lockdestroy(&(VTOC(vp)->c_lock)); coda_free(VTOC(vp)); VTOC(vp) = NULL; return (0); } int coda_lock(v) void *v; { /* true args */ struct vop_lock_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct proc *p = ap->a_p; /* upcall decl */ /* locals */ ENTRY; if (coda_lockdebug) { myprintf(("Attempting lock on %lx.%lx.%lx\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique)); } #ifndef DEBUG_LOCKS return (lockmgr(&cp->c_lock, ap->a_flags, &vp->v_interlock, p)); #else return (debuglockmgr(&cp->c_lock, ap->a_flags, &vp->v_interlock, p, "coda_lock", vp->filename, vp->line)); #endif } int coda_unlock(v) void *v; { /* true args */ struct vop_unlock_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct proc *p = ap->a_p; /* upcall decl */ /* locals */ ENTRY; if (coda_lockdebug) { myprintf(("Attempting unlock on %lx.%lx.%lx\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique)); } return (lockmgr(&cp->c_lock, ap->a_flags | LK_RELEASE, &vp->v_interlock, p)); } int coda_islocked(v) void *v; { /* true args */ struct vop_islocked_args *ap = v; struct cnode *cp = VTOC(ap->a_vp); ENTRY; return (lockstatus(&cp->c_lock, ap->a_p)); } /* How one looks up a vnode given a device/inode pair: */ int coda_grab_vnode(dev_t dev, ino_t ino, struct vnode **vpp) { /* This is like VFS_VGET() or igetinode()! 
*/ int error; struct mount *mp; if (!(mp = devtomp(dev))) { myprintf(("coda_grab_vnode: devtomp(%#lx) returns NULL\n", (u_long)dev2udev(dev))); return(ENXIO); } /* XXX - ensure that nonzero-return means failure */ error = VFS_VGET(mp,ino,vpp); if (error) { myprintf(("coda_grab_vnode: iget/vget(%lx, %lu) returns %p, err %d\n", (u_long)dev2udev(dev), (u_long)ino, (void *)*vpp, error)); return(ENOENT); } return(0); } void print_vattr( attr ) struct vattr *attr; { char *typestr; switch (attr->va_type) { case VNON: typestr = "VNON"; break; case VREG: typestr = "VREG"; break; case VDIR: typestr = "VDIR"; break; case VBLK: typestr = "VBLK"; break; case VCHR: typestr = "VCHR"; break; case VLNK: typestr = "VLNK"; break; case VSOCK: typestr = "VSCK"; break; case VFIFO: typestr = "VFFO"; break; case VBAD: typestr = "VBAD"; break; default: typestr = "????"; break; } myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n", typestr, (int)attr->va_mode, (int)attr->va_uid, (int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev)); myprintf((" fileid %d nlink %d size %d blocksize %d bytes %d\n", (int)attr->va_fileid, (int)attr->va_nlink, (int)attr->va_size, (int)attr->va_blocksize,(int)attr->va_bytes)); myprintf((" gen %ld flags %ld vaflags %d\n", attr->va_gen, attr->va_flags, attr->va_vaflags)); myprintf((" atime sec %d nsec %d\n", (int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec)); myprintf((" mtime sec %d nsec %d\n", (int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec)); myprintf((" ctime sec %d nsec %d\n", (int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec)); } /* How to print a ucred */ void print_cred(cred) struct ucred *cred; { int i; myprintf(("ref %d\tuid %d\n",cred->cr_ref,cred->cr_uid)); for (i=0; i < cred->cr_ngroups; i++) myprintf(("\tgroup %d: (%d)\n",i,cred->cr_groups[i])); myprintf(("\n")); } /* * Return a vnode for the given fid. * If no cnode exists for this fid create one and put it * in a table hashed by fid.Volume and fid.Vnode. 
If the cnode for * this fid is already in the table return it (ref count is * incremented by coda_find. The cnode will be flushed from the * table when coda_inactive calls coda_unsave. */ struct cnode * make_coda_node(fid, vfsp, type) ViceFid *fid; struct mount *vfsp; short type; { struct cnode *cp; int err; if ((cp = coda_find(fid)) == NULL) { struct vnode *vp; cp = coda_alloc(); lockinit(&cp->c_lock, PINOD, "cnode", 0, 0); cp->c_fid = *fid; err = getnewvnode(VT_CODA, vfsp, coda_vnodeop_p, &vp); if (err) { panic("coda: getnewvnode returned error %d\n", err); } vp->v_data = cp; vp->v_type = type; cp->c_vnode = vp; coda_save(cp); } else { vref(CTOV(cp)); } return cp; } Index: head/sys/fs/coda/coda_vnops.c =================================================================== --- head/sys/fs/coda/coda_vnops.c (revision 75579) +++ head/sys/fs/coda/coda_vnops.c (revision 75580) @@ -1,1957 +1,1956 @@ /* * * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. 
* * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ * $FreeBSD$ * */ /* * Mach Operating System * Copyright (c) 1990 Carnegie-Mellon University * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda file system at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * These flags select various performance enhancements. */ int coda_attr_cache = 1; /* Set to cache attributes in the kernel */ int coda_symlink_cache = 1; /* Set to cache symbolic link information */ int coda_access_cache = 1; /* Set to handle some access checks directly */ /* structure to keep track of vfs calls */ struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE]; #define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++) #define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++) /* What we are delaying for in printf */ int coda_printf_delay = 0; /* in microseconds */ int coda_vnop_print_entry = 0; static int coda_lockdebug = 0; /* Definition of the vfs operation vector */ /* * Some NetBSD details: * * coda_start is called at the end of the mount syscall. * coda_init is called at boot time. 
*/ #define ENTRY if(coda_vnop_print_entry) myprintf(("Entered %s\n",__FUNCTION__)) /* Definition of the vnode operation vector */ struct vnodeopv_entry_desc coda_vnodeop_entries[] = { { &vop_default_desc, coda_vop_error }, { &vop_lookup_desc, coda_lookup }, /* lookup */ { &vop_create_desc, coda_create }, /* create */ { &vop_mknod_desc, coda_vop_error }, /* mknod */ { &vop_open_desc, coda_open }, /* open */ { &vop_close_desc, coda_close }, /* close */ { &vop_access_desc, coda_access }, /* access */ { &vop_getattr_desc, coda_getattr }, /* getattr */ { &vop_setattr_desc, coda_setattr }, /* setattr */ { &vop_read_desc, coda_read }, /* read */ { &vop_write_desc, coda_write }, /* write */ { &vop_ioctl_desc, coda_ioctl }, /* ioctl */ { &vop_fsync_desc, coda_fsync }, /* fsync */ { &vop_remove_desc, coda_remove }, /* remove */ { &vop_link_desc, coda_link }, /* link */ { &vop_rename_desc, coda_rename }, /* rename */ { &vop_mkdir_desc, coda_mkdir }, /* mkdir */ { &vop_rmdir_desc, coda_rmdir }, /* rmdir */ { &vop_symlink_desc, coda_symlink }, /* symlink */ { &vop_readdir_desc, coda_readdir }, /* readdir */ { &vop_readlink_desc, coda_readlink }, /* readlink */ { &vop_inactive_desc, coda_inactive }, /* inactive */ { &vop_reclaim_desc, coda_reclaim }, /* reclaim */ { &vop_lock_desc, coda_lock }, /* lock */ { &vop_unlock_desc, coda_unlock }, /* unlock */ { &vop_bmap_desc, coda_bmap }, /* bmap */ { &vop_strategy_desc, coda_strategy }, /* strategy */ { &vop_print_desc, coda_vop_error }, /* print */ { &vop_islocked_desc, coda_islocked }, /* islocked */ { &vop_pathconf_desc, coda_vop_error }, /* pathconf */ { &vop_advlock_desc, coda_vop_nop }, /* advlock */ - { &vop_bwrite_desc, coda_vop_error }, /* bwrite */ { &vop_lease_desc, coda_vop_nop }, /* lease */ { &vop_poll_desc, (vop_t *) vop_stdpoll }, { &vop_getpages_desc, coda_fbsd_getpages }, /* pager intf.*/ { &vop_putpages_desc, coda_fbsd_putpages }, /* pager intf.*/ #if 0 we need to define these someday #define UFS_BLKATOFF(aa, bb, 
cc, dd) VFSTOUFS((aa)->v_mount)->um_blkatoff(aa, bb, cc, dd) #define UFS_VALLOC(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_valloc(aa, bb, cc, dd) #define UFS_VFREE(aa, bb, cc) VFSTOUFS((aa)->v_mount)->um_vfree(aa, bb, cc) #define UFS_TRUNCATE(aa, bb, cc, dd, ee) VFSTOUFS((aa)->v_mount)->um_truncate(aa, bb, cc, dd, ee) #define UFS_UPDATE(aa, bb) VFSTOUFS((aa)->v_mount)->um_update(aa, bb) missing { &vop_reallocblks_desc, (vop_t *) ufs_missingop }, { &vop_cachedlookup_desc, (vop_t *) ufs_lookup }, { &vop_whiteout_desc, (vop_t *) ufs_whiteout }, #endif { (struct vnodeop_desc*)NULL, (int(*)(void *))NULL } }; static struct vnodeopv_desc coda_vnodeop_opv_desc = { &coda_vnodeop_p, coda_vnodeop_entries }; VNODEOP_SET(coda_vnodeop_opv_desc); /* A generic panic: we were called with something we didn't define yet */ int coda_vop_error(void *anon) { struct vnodeop_desc **desc = (struct vnodeop_desc **)anon; myprintf(("coda_vop_error: Vnode operation %s called, but not defined.\n", (*desc)->vdesc_name)); /* panic("coda_vop_error"); */ return EIO; } /* A generic do-nothing. For lease_check, advlock */ int coda_vop_nop(void *anon) { struct vnodeop_desc **desc = (struct vnodeop_desc **)anon; if (codadebug) { myprintf(("Vnode operation %s called, but unsupported\n", (*desc)->vdesc_name)); } return (0); } int coda_vnodeopstats_init(void) { register int i; for(i=0;ia_vp); struct cnode *cp = VTOC(*vpp); int flag = ap->a_mode & (~O_EXCL); struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; struct vnode *vp; dev_t dev; ino_t inode; MARK_ENTRY(CODA_OPEN_STATS); /* Check for open of control file. 
*/ if (IS_CTL_VP(*vpp)) { /* XXX */ /* if (WRITEABLE(flag)) */ if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) { MARK_INT_FAIL(CODA_OPEN_STATS); return(EACCES); } MARK_INT_SAT(CODA_OPEN_STATS); return(0); } error = venus_open(vtomi((*vpp)), &cp->c_fid, flag, cred, p, &dev, &inode); if (error) return (error); if (!error) { CODADEBUG( CODA_OPEN,myprintf(("open: dev %#lx inode %lu result %d\n", (u_long)dev2udev(dev), (u_long)inode, error)); ) } /* Translate the pair for the cache file into an inode pointer. */ error = coda_grab_vnode(dev, inode, &vp); if (error) return (error); /* We get the vnode back locked. Needs unlocked */ VOP_UNLOCK(vp, 0, p); /* Keep a reference until the close comes in. */ vref(*vpp); /* Save the vnode pointer for the cache file. */ if (cp->c_ovp == NULL) { cp->c_ovp = vp; } else { if (cp->c_ovp != vp) panic("coda_open: cp->c_ovp != ITOV(ip)"); } cp->c_ocount++; /* Flush the attribute cached if writing the file. */ if (flag & FWRITE) { cp->c_owrite++; cp->c_flags &= ~C_VATTR; } /* Save the pair for the cache file to speed up subsequent page_read's. */ cp->c_device = dev; cp->c_inode = inode; /* Open the cache file. */ error = VOP_OPEN(vp, flag, cred, p); if (error) { printf("coda_open: VOP_OPEN on container failed %d\n", error); return (error); } /* grab (above) does this when it calls newvnode unless it's in the cache*/ if (vp->v_type == VREG) { error = vfs_object_create(vp, p, cred); if (error != 0) { printf("coda_open: vfs_object_create() returns %d\n", error); vput(vp); } } return(error); } /* * Close the cache file used for I/O and notify Venus. */ int coda_close(v) void *v; { /* true args */ struct vop_close_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; MARK_ENTRY(CODA_CLOSE_STATS); /* Check for close of control file. 
*/ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_CLOSE_STATS); return(0); } if (IS_UNMOUNTING(cp)) { if (cp->c_ovp) { #ifdef CODA_VERBOSE printf("coda_close: destroying container ref %d, ufs vp %p of vp %p/cp %p\n", vp->v_usecount, cp->c_ovp, vp, cp); #endif #ifdef hmm vgone(cp->c_ovp); #else VOP_CLOSE(cp->c_ovp, flag, cred, p); /* Do errors matter here? */ vrele(cp->c_ovp); #endif } else { #ifdef CODA_VERBOSE printf("coda_close: NO container vp %p/cp %p\n", vp, cp); #endif } return ENODEV; } else { VOP_CLOSE(cp->c_ovp, flag, cred, p); /* Do errors matter here? */ vrele(cp->c_ovp); } if (--cp->c_ocount == 0) cp->c_ovp = NULL; if (flag & FWRITE) /* file was opened for write */ --cp->c_owrite; error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, p); vrele(CTOV(cp)); CODADEBUG(CODA_CLOSE, myprintf(("close: result %d\n",error)); ) return(error); } int coda_read(v) void *v; { struct vop_read_args *ap = v; ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_procp)); } int coda_write(v) void *v; { struct vop_write_args *ap = v; ENTRY; return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE, ap->a_ioflag, ap->a_cred, ap->a_uio->uio_procp)); } int coda_rdwr(vp, uiop, rw, ioflag, cred, p) struct vnode *vp; struct uio *uiop; enum uio_rw rw; int ioflag; struct ucred *cred; struct proc *p; { /* upcall decl */ /* NOTE: container file operation!!! */ /* locals */ struct cnode *cp = VTOC(vp); struct vnode *cfvp = cp->c_ovp; int igot_internally = 0; int opened_internally = 0; int error = 0; MARK_ENTRY(CODA_RDWR_STATS); CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %d, %lld, %d)\n", rw, (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for rdwr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_RDWR_STATS); return(EINVAL); } /* * If file is not already open this must be a page * {read,write} request. Iget the cache file's inode * pointer if we still have its pair. 
* Otherwise, we must do an internal open to derive the * pair. */ if (cfvp == NULL) { /* * If we're dumping core, do the internal open. Otherwise * venus won't have the correct size of the core when * it's completely written. */ PROC_LOCK(p); if (cp->c_inode != 0 && !(p && (p->p_acflag & ACORE))) { PROC_UNLOCK(p); igot_internally = 1; error = coda_grab_vnode(cp->c_device, cp->c_inode, &cfvp); if (error) { MARK_INT_FAIL(CODA_RDWR_STATS); return(error); } /* * We get the vnode back locked in both Mach and * NetBSD. Needs unlocked */ VOP_UNLOCK(cfvp, 0, p); } else { PROC_UNLOCK(p); opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, p); printf("coda_rdwr: Internally Opening %p\n", vp); if (error) { printf("coda_rdwr: VOP_OPEN on container failed %d\n", error); return (error); } if (vp->v_type == VREG) { error = vfs_object_create(vp, p, cred); if (error != 0) { printf("coda_rdwr: vfs_object_create() returns %d\n", error); vput(vp); } } if (error) { MARK_INT_FAIL(CODA_RDWR_STATS); return(error); } cfvp = cp->c_ovp; } } /* Have UFS handle the call. */ CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = (%lx.%lx.%lx), refcnt = %d\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, CTOV(cp)->v_usecount)); ) if (rw == UIO_READ) { error = VOP_READ(cfvp, uiop, ioflag, cred); } else { error = VOP_WRITE(cfvp, uiop, ioflag, cred); /* ufs_write updates the vnode_pager_setsize for the vnode/object */ { struct vattr attr; if (VOP_GETATTR(cfvp, &attr, cred, p) == 0) { vnode_pager_setsize(vp, attr.va_size); } } } if (error) MARK_INT_FAIL(CODA_RDWR_STATS); else MARK_INT_SAT(CODA_RDWR_STATS); /* Do an internal close if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, p); } /* Invalidate cached attributes if writing. 
*/ if (rw == UIO_WRITE) cp->c_flags &= ~C_VATTR; return(error); } int coda_ioctl(v) void *v; { /* true args */ struct vop_ioctl_args *ap = v; struct vnode *vp = ap->a_vp; int com = ap->a_command; caddr_t data = ap->a_data; int flag = ap->a_fflag; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; struct vnode *tvp; struct nameidata ndp; struct PioctlData *iap = (struct PioctlData *)data; MARK_ENTRY(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));) /* Don't check for operation on a dying object, for ctlvp it shouldn't matter */ /* Must be control object to succeed. */ if (!IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: vp != ctlvp"));) return (EOPNOTSUPP); } /* Look up the pathname. */ /* Should we use the name cache here? It would get it from lookupname sooner or later anyway, right? */ NDINIT(&ndp, LOOKUP, (iap->follow ? FOLLOW : NOFOLLOW), UIO_USERSPACE, iap->path, p); error = namei(&ndp); tvp = ndp.ni_vp; if (error) { MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: lookup returns %d\n", error));) return(error); } /* * Make sure this is a coda style cnode, but it may be a * different vfsp */ if (tvp->v_op != coda_vnodeop_p) { vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); MARK_INT_FAIL(CODA_IOCTL_STATS); CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: %s not a coda object\n", iap->path));) return(EINVAL); } if (iap->vi.in_size > VC_MAXDATASIZE) { NDFREE(&ndp, 0); return(EINVAL); } error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data, cred, p); if (error) MARK_INT_FAIL(CODA_IOCTL_STATS); else CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); ) vrele(tvp); NDFREE(&ndp, NDF_ONLY_PNBUF); return(error); } /* * To reduce the cost of a user-level venus;we cache attributes in * the kernel. Each cnode has storage allocated for an attribute. 
If * c_vattr is valid, return a reference to it. Otherwise, get the * attributes from venus and store them in the cnode. There is some * question if this method is a security leak. But I think that in * order to make this call, the user must have done a lookup and * opened the file, and therefore should already have access. */ int coda_getattr(v) void *v; { /* true args */ struct vop_getattr_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; MARK_ENTRY(CODA_GETATTR_STATS); if (IS_UNMOUNTING(cp)) return ENODEV; /* Check for getattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_GETATTR_STATS); return(ENOENT); } /* Check to see if the attributes have already been cached */ if (VALID_VATTR(cp)) { CODADEBUG(CODA_GETATTR, { myprintf(("attr cache hit: (%lx.%lx.%lx)\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique));}); CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(&cp->c_vattr); ); *vap = cp->c_vattr; MARK_INT_SAT(CODA_GETATTR_STATS); return(0); } error = venus_getattr(vtomi(vp), &cp->c_fid, cred, p, vap); if (!error) { CODADEBUG(CODA_GETATTR, myprintf(("getattr miss (%lx.%lx.%lx): result %d\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, error)); ) CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR)) print_vattr(vap); ); { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } /* If not open for write, store attributes in cnode */ if ((cp->c_owrite == 0) && (coda_attr_cache)) { cp->c_vattr = *vap; cp->c_flags |= C_VATTR; } } return(error); } int coda_setattr(v) void *v; { /* true args */ struct vop_setattr_args *ap = v; register struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int 
error; MARK_ENTRY(CODA_SETATTR_STATS); /* Check for setattr of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_SETATTR_STATS); return(ENOENT); } if (codadebug & CODADBGMSK(CODA_SETATTR)) { print_vattr(vap); } error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, p); if (!error) cp->c_flags &= ~C_VATTR; { int size = vap->va_size; struct vnode *convp = cp->c_ovp; if (size != VNOVAL && convp != (struct vnode *)0) { vnode_pager_setsize(convp, size); } } CODADEBUG(CODA_SETATTR, myprintf(("setattr %d\n", error)); ) return(error); } int coda_access(v) void *v; { /* true args */ struct vop_access_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); int mode = ap->a_mode; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ int error; MARK_ENTRY(CODA_ACCESS_STATS); /* Check for access of control object. Only read access is allowed on it. */ if (IS_CTL_VP(vp)) { /* bogus hack - all will be marked as successes */ MARK_INT_SAT(CODA_ACCESS_STATS); return(((mode & VREAD) && !(mode & (VWRITE | VEXEC))) ? 0 : EACCES); } /* * if the file is a directory, and we are checking exec (eg lookup) * access, and the file is in the namecache, then the user must have * lookup access to it. */ if (coda_access_cache) { if ((vp->v_type == VDIR) && (mode & VEXEC)) { if (coda_nc_lookup(cp, ".", 1, cred)) { MARK_INT_SAT(CODA_ACCESS_STATS); return(0); /* it was in the cache */ } } } error = venus_access(vtomi(vp), &cp->c_fid, mode, cred, p); return(error); } int coda_readlink(v) void *v; { /* true args */ struct vop_readlink_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_uio->uio_procp; /* locals */ int error; char *str; int len; MARK_ENTRY(CODA_READLINK_STATS); /* Check for readlink of control object. 
*/ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READLINK_STATS); return(ENOENT); } if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */ uiop->uio_rw = UIO_READ; error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop); if (error) MARK_INT_FAIL(CODA_READLINK_STATS); else MARK_INT_SAT(CODA_READLINK_STATS); return(error); } error = venus_readlink(vtomi(vp), &cp->c_fid, cred, p, &str, &len); if (!error) { uiop->uio_rw = UIO_READ; error = uiomove(str, len, uiop); if (coda_symlink_cache) { cp->c_symlink = str; cp->c_symlen = len; cp->c_flags |= C_SYMLINK; } else CODA_FREE(str, len); } CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));) return(error); } int coda_fsync(v) void *v; { /* true args */ struct vop_fsync_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; /* locals */ struct vnode *convp = cp->c_ovp; int error; MARK_ENTRY(CODA_FSYNC_STATS); /* Check for fsync on an unmounting object */ /* The NetBSD kernel, in it's infinite wisdom, can try to fsync * after an unmount has been initiated. This is a Bad Thing, * which we have to avoid. Not a legitimate failure for stats. */ if (IS_UNMOUNTING(cp)) { return(ENODEV); } /* Check for fsync of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_FSYNC_STATS); return(0); } if (convp) VOP_FSYNC(convp, cred, MNT_WAIT, p); /* * We see fsyncs with usecount == 1 then usecount == 0. * For now we ignore them. */ /* if (!vp->v_usecount) { printf("coda_fsync on vnode %p with %d usecount. c_flags = %x (%x)\n", vp, vp->v_usecount, cp->c_flags, cp->c_flags&C_PURGING); } */ /* * We can expect fsync on any vnode at all if venus is pruging it. * Venus can't very well answer the fsync request, now can it? * Hopefully, it won't have to, because hopefully, venus preserves * the (possibly untrue) invariant that it never purges an open * vnode. Hopefully. 
*/ if (cp->c_flags & C_PURGING) { return(0); } /* needs research */ return 0; error = venus_fsync(vtomi(vp), &cp->c_fid, cred, p); CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); ); return(error); } int coda_inactive(v) void *v; { /* XXX - at the moment, inactive doesn't look at cred, and doesn't have a proc pointer. Oops. */ /* true args */ struct vop_inactive_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct ucred *cred __attribute__((unused)) = NULL; struct proc *p __attribute__((unused)) = curproc; /* upcall decl */ /* locals */ /* We don't need to send inactive to venus - DCS */ MARK_ENTRY(CODA_INACTIVE_STATS); if (IS_CTL_VP(vp)) { MARK_INT_SAT(CODA_INACTIVE_STATS); return 0; } CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %lx.%lx.%lx. vfsp %p\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, vp->v_mount));) /* If an array has been allocated to hold the symlink, deallocate it */ if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { if (cp->c_symlink == NULL) panic("coda_inactive: null symlink pointer in cnode"); CODA_FREE(cp->c_symlink, cp->c_symlen); cp->c_flags &= ~C_SYMLINK; cp->c_symlen = 0; } /* Remove it from the table so it can't be found. */ coda_unsave(cp); if ((struct coda_mntinfo *)(vp->v_mount->mnt_data) == NULL) { myprintf(("Help! 
/*
 * coda_lookup: VOP_LOOKUP entry point.
 *
 * Resolves the component cnp->cn_nameptr (cn_namelen bytes) in directory
 * ap->a_dvp and stores the resulting vnode in *ap->a_vpp.
 *
 * Resolution order:
 *   1. the control-object name yields coda_ctlvp with an added reference;
 *   2. names with len+1 > CODA_MAXNAMLEN fail with EINVAL;
 *   3. the Coda name cache (coda_nc_lookup);
 *   4. a venus_lookup() upcall, whose result is turned into a cnode with
 *      make_coda_node() and entered in the name cache unless the returned
 *      type has CODA_NOCACHE set.
 *
 * After resolution the namei flags are adjusted (SAVENAME, EJUSTRETURN for
 * a missing last component on CREATE/RENAME) and the parent/child locks
 * are arranged as described in the comments below.
 *
 * Returns 0, EJUSTRETURN, EINVAL, the venus_lookup() error, or the error
 * from VOP_UNLOCK() on the parent.  On any other failure *ap->a_vpp is NULL.
 */
int
coda_lookup(v)
    void *v;
{
/* true args */
    struct vop_lookup_args *ap = v;
    struct vnode *dvp = ap->a_dvp;
    struct cnode *dcp = VTOC(dvp);
    struct vnode **vpp = ap->a_vpp;
    /*
     * It looks as though ap->a_cnp->ni_cnd->cn_nameptr holds the rest
     * of the string to xlate, and that we must try to get at least
     * ap->a_cnp->ni_cnd->cn_namelen of those characters to match.  I
     * could be wrong.
     */
    struct componentname  *cnp = ap->a_cnp;
    struct ucred *cred = cnp->cn_cred;
    struct proc *p = cnp->cn_proc;
/* locals */
    struct cnode *cp;
    const char *nm = cnp->cn_nameptr;
    int len = cnp->cn_namelen;
    ViceFid VFid;
    int vtype;
    int error = 0;

    MARK_ENTRY(CODA_LOOKUP_STATS);

    CODADEBUG(CODA_LOOKUP, myprintf(("lookup: %s in %lx.%lx.%lx\n",
                                   nm, dcp->c_fid.Volume,
                                   dcp->c_fid.Vnode, dcp->c_fid.Unique)););

    /* Check for lookup of control object. */
    if (IS_CTL_NAME(dvp, nm, len)) {
        *vpp = coda_ctlvp;
        vref(*vpp);
        MARK_INT_SAT(CODA_LOOKUP_STATS);
        goto exit;
    }

    /* The name plus its terminator must fit in CODA_MAXNAMLEN. */
    if (len+1 > CODA_MAXNAMLEN) {
        MARK_INT_FAIL(CODA_LOOKUP_STATS);
        CODADEBUG(CODA_LOOKUP, myprintf(("name too long: lookup, %lx.%lx.%lx(%s)\n",
                                    dcp->c_fid.Volume, dcp->c_fid.Vnode,
                                    dcp->c_fid.Unique, nm)););
        *vpp = (struct vnode *)0;
        error = EINVAL;
        goto exit;
    }
    /* First try to look the file up in the cfs name cache */
    /* lock the parent vnode? */
    cp = coda_nc_lookup(dcp, nm, len, cred);
    if (cp) {
        /* Cache hit: hand out the cached vnode with a new reference. */
        *vpp = CTOV(cp);
        vref(*vpp);
        CODADEBUG(CODA_LOOKUP,
                 myprintf(("lookup result %d vpp %p\n",error,*vpp));)
    } else {

        /* The name wasn't cached, so we need to contact Venus */
        error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, p, &VFid, &vtype);

        if (error) {
            MARK_INT_FAIL(CODA_LOOKUP_STATS);
            CODADEBUG(CODA_LOOKUP, myprintf(("lookup error on %lx.%lx.%lx(%s)%d\n",
                                        dcp->c_fid.Volume, dcp->c_fid.Vnode, dcp->c_fid.Unique, nm, error));)
            *vpp = (struct vnode *)0;
        } else {
            MARK_INT_SAT(CODA_LOOKUP_STATS);
            CODADEBUG(CODA_LOOKUP,
                     myprintf(("lookup: vol %lx vno %lx uni %lx type %o result %d\n",
                            VFid.Volume, VFid.Vnode, VFid.Unique, vtype,
                            error)); )

            /*
             * NOTE(review): vtype is passed on unmasked, so a set
             * CODA_NOCACHE bit reaches make_coda_node() as part of the
             * type -- confirm that is intended.
             */
            cp = make_coda_node(&VFid, dvp->v_mount, vtype);
            *vpp = CTOV(cp);

            /* enter the new vnode in the Name Cache only if the top bit isn't set */
            /* And don't enter a new vnode for an invalid one! */
            if (!(vtype & CODA_NOCACHE))
                coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
        }
    }

 exit:
    /*
     * If we are creating, and this was the last name to be looked up,
     * and the error was ENOENT, then there really shouldn't be an
     * error and we can make the leaf NULL and return success.  Since
     * this is supposed to work under Mach as well as NetBSD, we're
     * leaving this fn wrapped.  We also must tell lookup/namei that
     * we need to save the last component of the name.  (Create will
     * have to free the name buffer later...lucky us...)
     */
    if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME))
        && (cnp->cn_flags & ISLASTCN)
        && (error == ENOENT))
    {
        error = EJUSTRETURN;
        cnp->cn_flags |= SAVENAME;
        *ap->a_vpp = NULL;
    }

    /*
     * If we are removing, and we are at the last element, and we
     * found it, then we need to keep the name around so that the
     * removal will go ahead as planned.  Unfortunately, this will
     * probably also lock the to-be-removed vnode, which may or may
     * not be a good idea.  I'll have to look at the bits of
     * coda_remove to make sure.  We'll only save the name if we did in
     * fact find the name, otherwise coda_remove won't have a chance
     * to free the pathname.
     */
    if ((cnp->cn_nameiop == DELETE)
        && (cnp->cn_flags & ISLASTCN)
        && !error)
    {
        cnp->cn_flags |= SAVENAME;
    }

    /*
     * If the lookup went well, we need to (potentially?) unlock the
     * parent, and lock the child.  We are only responsible for
     * checking to see if the parent is supposed to be unlocked before
     * we return.  We must always lock the child (provided there is
     * one, and (the parent isn't locked or it isn't the same as the
     * parent.)  Simple, huh?  We can never leave the parent locked unless
     * we are ISLASTCN
     */
    if (!error || (error == EJUSTRETURN)) {
        if (!(cnp->cn_flags & LOCKPARENT) || !(cnp->cn_flags & ISLASTCN)) {
            /*
             * NOTE(review): on this early return *ap->a_vpp keeps the
             * reference taken above and is not reset to NULL -- confirm
             * that callers cope with that.
             */
            if ((error = VOP_UNLOCK(dvp, 0, p))) {
                return error;
            }
            /*
             * The parent is unlocked.  As long as there is a child,
             * lock it without bothering to check anything else.
             */
            if (*ap->a_vpp) {
                if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, p))) {
                    printf("coda_lookup: ");
                    panic("unlocked parent but couldn't lock child");
                }
            }
        } else {
            /* The parent is locked, and may be the same as the child */
            if (*ap->a_vpp && (*ap->a_vpp != dvp)) {
                /* Different, go ahead and lock it. */
                if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, p))) {
                    printf("coda_lookup: ");
                    panic("unlocked parent but couldn't lock child");
                }
            }
        }
    } else {
        /* If the lookup failed, we need to ensure that the leaf is NULL */
        /* Don't change any locking? */
        *ap->a_vpp = NULL;
    }
    return(error);
}
*/ (*va) = attr; /* Update the attribute cache and mark it as valid */ if (coda_attr_cache) { VTOC(*vpp)->c_vattr = attr; VTOC(*vpp)->c_flags |= C_VATTR; } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); CODADEBUG(CODA_CREATE, myprintf(("create: (%lx.%lx.%lx), result %d\n", VFid.Volume, VFid.Vnode, VFid.Unique, error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_CREATE, myprintf(("create error %d\n", error));) } if (!error) { if (cnp->cn_flags & LOCKLEAF) { if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, p))) { printf("coda_create: "); panic("unlocked parent but couldn't lock child"); } } #ifdef OLD_DIAGNOSTIC else { printf("coda_create: LOCKLEAF not set!\n"); } #endif } return(error); } int coda_remove(v) void *v; { /* true args */ struct vop_remove_args *ap = v; struct vnode *dvp = ap->a_dvp; struct cnode *cp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *tp; MARK_ENTRY(CODA_REMOVE_STATS); CODADEBUG(CODA_REMOVE, myprintf(("remove: %s in %lx.%lx.%lx\n", nm, cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique));); /* Remove the file's entry from the CODA Name Cache */ /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* I'm gonna go out on a limb here. If a file and a hardlink to it * exist, and one is removed, the link count on the other will be * off by 1. We could either invalidate the attrs if cached, or * fix them. I'll try to fix them. 
DCS 11/8/94 */ tp = coda_nc_lookup(VTOC(dvp), nm, len, cred); if (tp) { if (VALID_VATTR(tp)) { /* If attrs are cached */ if (tp->c_vattr.va_nlink > 1) { /* If it's a hard link */ tp->c_vattr.va_nlink--; } } coda_nc_zapfile(VTOC(dvp), nm, len); /* No need to flush it if it doesn't exist! */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; /* Check for remove of control object. */ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_REMOVE_STATS); return(ENOENT); } error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, p); CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); ) return(error); } int coda_link(v) void *v; { /* true args */ struct vop_link_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct vnode *tdvp = ap->a_tdvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; MARK_ENTRY(CODA_LINK_STATS); if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("nb_link: vp fid: (%lx.%lx.%lx)\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique)); myprintf(("nb_link: tdvp fid: (%lx.%lx.%lx)\n", tdcp->c_fid.Volume, tdcp->c_fid.Vnode, tdcp->c_fid.Unique)); } if (codadebug & CODADBGMSK(CODA_LINK)) { myprintf(("link: vp fid: (%lx.%lx.%lx)\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique)); myprintf(("link: tdvp fid: (%lx.%lx.%lx)\n", tdcp->c_fid.Volume, tdcp->c_fid.Vnode, tdcp->c_fid.Unique)); } /* Check for link to/from control object. 
*/ if (IS_CTL_NAME(tdvp, nm, len) || IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_LINK_STATS); return(EACCES); } error = venus_link(vtomi(vp), &cp->c_fid, &tdcp->c_fid, nm, len, cred, p); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(tdvp)->c_flags &= ~C_VATTR; VTOC(vp)->c_flags &= ~C_VATTR; CODADEBUG(CODA_LINK, myprintf(("in link result %d\n",error)); ) return(error); } int coda_rename(v) void *v; { /* true args */ struct vop_rename_args *ap = v; struct vnode *odvp = ap->a_fdvp; struct cnode *odcp = VTOC(odvp); struct componentname *fcnp = ap->a_fcnp; struct vnode *ndvp = ap->a_tdvp; struct cnode *ndcp = VTOC(ndvp); struct componentname *tcnp = ap->a_tcnp; struct ucred *cred = fcnp->cn_cred; struct proc *p = fcnp->cn_proc; /* true args */ int error; const char *fnm = fcnp->cn_nameptr; int flen = fcnp->cn_namelen; const char *tnm = tcnp->cn_nameptr; int tlen = tcnp->cn_namelen; MARK_ENTRY(CODA_RENAME_STATS); /* Hmmm. The vnodes are already looked up. Perhaps they are locked? This could be Bad. XXX */ #ifdef OLD_DIAGNOSTIC if ((fcnp->cn_cred != tcnp->cn_cred) || (fcnp->cn_proc != tcnp->cn_proc)) { panic("coda_rename: component names don't agree"); } #endif /* Check for rename involving control object. */ if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) { MARK_INT_FAIL(CODA_RENAME_STATS); return(EACCES); } /* Problem with moving directories -- need to flush entry for .. 
*/ if (odvp != ndvp) { struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred); if (ovcp) { struct vnode *ovp = CTOV(ovcp); if ((ovp) && (ovp->v_type == VDIR)) /* If it's a directory */ coda_nc_zapfile(VTOC(ovp),"..", 2); } } /* Remove the entries for both source and target files */ coda_nc_zapfile(VTOC(odvp), fnm, flen); coda_nc_zapfile(VTOC(ndvp), tnm, tlen); /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(odvp)->c_flags &= ~C_VATTR; VTOC(ndvp)->c_flags &= ~C_VATTR; if (flen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } if (tlen+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_RENAME_STATS); error = EINVAL; goto exit; } error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, p); exit: CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));) /* XXX - do we need to call cache pureg on the moved vnode? */ cache_purge(ap->a_fvp); /* It seems to be incumbent on us to drop locks on all four vnodes */ /* From-vnodes are not locked, only ref'd. To-vnodes are locked. */ vrele(ap->a_fvp); vrele(odvp); if (ap->a_tvp) { if (ap->a_tvp == ndvp) { vrele(ap->a_tvp); } else { vput(ap->a_tvp); } } vput(ndvp); return(error); } int coda_mkdir(v) void *v; { /* true args */ struct vop_mkdir_args *ap = v; struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; register struct vattr *va = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* locals */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; ViceFid VFid; struct vattr ova; MARK_ENTRY(CODA_MKDIR_STATS); /* Check for mkdir of target object. 
*/ if (IS_CTL_NAME(dvp, nm, len)) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } if (len+1 > CODA_MAXNAMLEN) { *vpp = (struct vnode *)0; MARK_INT_FAIL(CODA_MKDIR_STATS); return(EACCES); } error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, p, &VFid, &ova); if (!error) { if (coda_find(&VFid) != NULL) panic("cnode existed for newly created directory!"); cp = make_coda_node(&VFid, dvp->v_mount, va->va_type); *vpp = CTOV(cp); /* enter the new vnode in the Name Cache */ coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp)); /* as a side effect, enter "." and ".." for the directory */ coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp)); coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp)); if (coda_attr_cache) { VTOC(*vpp)->c_vattr = ova; /* update the attr cache */ VTOC(*vpp)->c_flags |= C_VATTR; /* Valid attributes in cnode */ } /* Invalidate the parent's attr cache, the modification time has changed */ VTOC(dvp)->c_flags &= ~C_VATTR; CODADEBUG( CODA_MKDIR, myprintf(("mkdir: (%lx.%lx.%lx) result %d\n", VFid.Volume, VFid.Vnode, VFid.Unique, error)); ) } else { *vpp = (struct vnode *)0; CODADEBUG(CODA_MKDIR, myprintf(("mkdir error %d\n",error));) } return(error); } int coda_rmdir(v) void *v; { /* true args */ struct vop_rmdir_args *ap = v; struct vnode *dvp = ap->a_dvp; struct cnode *dcp = VTOC(dvp); struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; /* true args */ int error; const char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; struct cnode *cp; MARK_ENTRY(CODA_RMDIR_STATS); /* Check for rmdir of control object. 
*/ if (IS_CTL_NAME(dvp, nm, len)) { MARK_INT_FAIL(CODA_RMDIR_STATS); return(ENOENT); } /* We're being conservative here, it might be that this person * doesn't really have sufficient access to delete the file * but we feel zapping the entry won't really hurt anyone -- dcs */ /* * As a side effect of the rmdir, remove any entries for children of * the directory, especially "." and "..". */ cp = coda_nc_lookup(dcp, nm, len, cred); if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL); /* Remove the file's entry from the CODA Name Cache */ coda_nc_zapfile(dcp, nm, len); /* Invalidate the parent's attr cache, the modification time has changed */ dcp->c_flags &= ~C_VATTR; error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, p); CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); ) return(error); } int coda_symlink(v) void *v; { /* true args */ struct vop_symlink_args *ap = v; struct vnode *tdvp = ap->a_dvp; struct cnode *tdcp = VTOC(tdvp); struct componentname *cnp = ap->a_cnp; struct vattr *tva = ap->a_vap; char *path = ap->a_target; struct ucred *cred = cnp->cn_cred; struct proc *p = cnp->cn_proc; struct vnode **vpp = ap->a_vpp; /* locals */ int error; /* * XXX I'm assuming the following things about coda_symlink's * arguments: * t(foo) is the new name/parent/etc being created. * lname is the contents of the new symlink. */ char *nm = cnp->cn_nameptr; int len = cnp->cn_namelen; int plen = strlen(path); /* * Here's the strategy for the moment: perform the symlink, then * do a lookup to grab the resulting vnode. I know this requires * two communications with Venus for a new sybolic link, but * that's the way the ball bounces. I don't yet want to change * the way the Mach symlink works. When Mach support is * deprecated, we should change symlink so that the common case * returns the resultant vnode in a vpp argument. */ MARK_ENTRY(CODA_SYMLINK_STATS); /* Check for symlink of control object. 
*/ if (IS_CTL_NAME(tdvp, nm, len)) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EACCES); } if (plen+1 > CODA_MAXPATHLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); return(EINVAL); } if (len+1 > CODA_MAXNAMLEN) { MARK_INT_FAIL(CODA_SYMLINK_STATS); error = EINVAL; goto exit; } error = venus_symlink(vtomi(tdvp), &tdcp->c_fid, path, plen, nm, len, tva, cred, p); /* Invalidate the parent's attr cache, the modification time has changed */ tdcp->c_flags &= ~C_VATTR; if (error == 0) error = VOP_LOOKUP(tdvp, vpp, cnp); exit: CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); ) return(error); } /* * Read directory entries. */ int coda_readdir(v) void *v; { /* true args */ struct vop_readdir_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); register struct uio *uiop = ap->a_uio; struct ucred *cred = ap->a_cred; int *eofflag = ap->a_eofflag; u_long **cookies = ap->a_cookies; int *ncookies = ap->a_ncookies; struct proc *p = ap->a_uio->uio_procp; /* upcall decl */ /* locals */ int error = 0; MARK_ENTRY(CODA_READDIR_STATS); CODADEBUG(CODA_READDIR, myprintf(("coda_readdir(%p, %d, %lld, %d)\n", (void *)uiop->uio_iov->iov_base, uiop->uio_resid, (long long)uiop->uio_offset, uiop->uio_segflg)); ) /* Check for readdir of control object. */ if (IS_CTL_VP(vp)) { MARK_INT_FAIL(CODA_READDIR_STATS); return(ENOENT); } { /* If directory is not already open do an "internal open" on it. */ int opened_internally = 0; if (cp->c_ovp == NULL) { opened_internally = 1; MARK_INT_GEN(CODA_OPEN_STATS); error = VOP_OPEN(vp, FREAD, cred, p); printf("coda_readdir: Internally Opening %p\n", vp); if (error) { printf("coda_readdir: VOP_OPEN on container failed %d\n", error); return (error); } if (vp->v_type == VREG) { error = vfs_object_create(vp, p, cred); if (error != 0) { printf("coda_readdir: vfs_object_create() returns %d\n", error); vput(vp); } } if (error) return(error); } /* Have UFS handle the call. 
*/ CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = (%lx.%lx.%lx), refcnt = %d\n",cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, vp->v_usecount)); ) error = VOP_READDIR(cp->c_ovp, uiop, cred, eofflag, ncookies, cookies); if (error) MARK_INT_FAIL(CODA_READDIR_STATS); else MARK_INT_SAT(CODA_READDIR_STATS); /* Do an "internal close" if necessary. */ if (opened_internally) { MARK_INT_GEN(CODA_CLOSE_STATS); (void)VOP_CLOSE(vp, FREAD, cred, p); } } return(error); } /* * Convert from file system blocks to device blocks */ int coda_bmap(v) void *v; { /* XXX on the global proc */ /* true args */ struct vop_bmap_args *ap = v; struct vnode *vp __attribute__((unused)) = ap->a_vp; /* file's vnode */ daddr_t bn __attribute__((unused)) = ap->a_bn; /* fs block number */ struct vnode **vpp = ap->a_vpp; /* RETURN vp of device */ daddr_t *bnp __attribute__((unused)) = ap->a_bnp; /* RETURN device block number */ struct proc *p __attribute__((unused)) = curproc; /* upcall decl */ /* locals */ int ret = 0; struct cnode *cp; cp = VTOC(vp); if (cp->c_ovp) { return EINVAL; ret = VOP_BMAP(cp->c_ovp, bn, vpp, bnp, ap->a_runp, ap->a_runb); #if 0 printf("VOP_BMAP(cp->c_ovp %p, bn %p, vpp %p, bnp %p, ap->a_runp %p, ap->a_runb %p) = %d\n", cp->c_ovp, bn, vpp, bnp, ap->a_runp, ap->a_runb, ret); #endif return ret; } else { #if 0 printf("coda_bmap: no container\n"); #endif return(EOPNOTSUPP); } } /* * I don't think the following two things are used anywhere, so I've * commented them out * * struct buf *async_bufhead; * int async_daemon_count; */ int coda_strategy(v) void *v; { /* true args */ struct vop_strategy_args *ap = v; register struct buf *bp __attribute__((unused)) = ap->a_bp; struct proc *p __attribute__((unused)) = curproc; /* upcall decl */ /* locals */ printf("coda_strategy: called ???\n"); return(EOPNOTSUPP); } int coda_reclaim(v) void *v; { /* true args */ struct vop_reclaim_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); /* upcall decl */ /* 
locals */ /* * Forced unmount/flush will let vnodes with non zero use be destroyed! */ ENTRY; if (IS_UNMOUNTING(cp)) { #ifdef DEBUG if (VTOC(vp)->c_ovp) { if (IS_UNMOUNTING(cp)) printf("coda_reclaim: c_ovp not void: vp %p, cp %p\n", vp, cp); } #endif } else { #ifdef OLD_DIAGNOSTIC if (vp->v_usecount != 0) print("coda_reclaim: pushing active %p\n", vp); if (VTOC(vp)->c_ovp) { panic("coda_reclaim: c_ovp not void"); } #endif } cache_purge(vp); lockdestroy(&(VTOC(vp)->c_lock)); coda_free(VTOC(vp)); VTOC(vp) = NULL; return (0); } int coda_lock(v) void *v; { /* true args */ struct vop_lock_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct proc *p = ap->a_p; /* upcall decl */ /* locals */ ENTRY; if (coda_lockdebug) { myprintf(("Attempting lock on %lx.%lx.%lx\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique)); } #ifndef DEBUG_LOCKS return (lockmgr(&cp->c_lock, ap->a_flags, &vp->v_interlock, p)); #else return (debuglockmgr(&cp->c_lock, ap->a_flags, &vp->v_interlock, p, "coda_lock", vp->filename, vp->line)); #endif } int coda_unlock(v) void *v; { /* true args */ struct vop_unlock_args *ap = v; struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct proc *p = ap->a_p; /* upcall decl */ /* locals */ ENTRY; if (coda_lockdebug) { myprintf(("Attempting unlock on %lx.%lx.%lx\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique)); } return (lockmgr(&cp->c_lock, ap->a_flags | LK_RELEASE, &vp->v_interlock, p)); } int coda_islocked(v) void *v; { /* true args */ struct vop_islocked_args *ap = v; struct cnode *cp = VTOC(ap->a_vp); ENTRY; return (lockstatus(&cp->c_lock, ap->a_p)); } /* How one looks up a vnode given a device/inode pair: */ int coda_grab_vnode(dev_t dev, ino_t ino, struct vnode **vpp) { /* This is like VFS_VGET() or igetinode()! 
*/ int error; struct mount *mp; if (!(mp = devtomp(dev))) { myprintf(("coda_grab_vnode: devtomp(%#lx) returns NULL\n", (u_long)dev2udev(dev))); return(ENXIO); } /* XXX - ensure that nonzero-return means failure */ error = VFS_VGET(mp,ino,vpp); if (error) { myprintf(("coda_grab_vnode: iget/vget(%lx, %lu) returns %p, err %d\n", (u_long)dev2udev(dev), (u_long)ino, (void *)*vpp, error)); return(ENOENT); } return(0); } void print_vattr( attr ) struct vattr *attr; { char *typestr; switch (attr->va_type) { case VNON: typestr = "VNON"; break; case VREG: typestr = "VREG"; break; case VDIR: typestr = "VDIR"; break; case VBLK: typestr = "VBLK"; break; case VCHR: typestr = "VCHR"; break; case VLNK: typestr = "VLNK"; break; case VSOCK: typestr = "VSCK"; break; case VFIFO: typestr = "VFFO"; break; case VBAD: typestr = "VBAD"; break; default: typestr = "????"; break; } myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n", typestr, (int)attr->va_mode, (int)attr->va_uid, (int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev)); myprintf((" fileid %d nlink %d size %d blocksize %d bytes %d\n", (int)attr->va_fileid, (int)attr->va_nlink, (int)attr->va_size, (int)attr->va_blocksize,(int)attr->va_bytes)); myprintf((" gen %ld flags %ld vaflags %d\n", attr->va_gen, attr->va_flags, attr->va_vaflags)); myprintf((" atime sec %d nsec %d\n", (int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec)); myprintf((" mtime sec %d nsec %d\n", (int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec)); myprintf((" ctime sec %d nsec %d\n", (int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec)); } /* How to print a ucred */ void print_cred(cred) struct ucred *cred; { int i; myprintf(("ref %d\tuid %d\n",cred->cr_ref,cred->cr_uid)); for (i=0; i < cred->cr_ngroups; i++) myprintf(("\tgroup %d: (%d)\n",i,cred->cr_groups[i])); myprintf(("\n")); } /* * Return a vnode for the given fid. * If no cnode exists for this fid create one and put it * in a table hashed by fid.Volume and fid.Vnode. 
If the cnode for * this fid is already in the table return it (ref count is * incremented by coda_find. The cnode will be flushed from the * table when coda_inactive calls coda_unsave. */ struct cnode * make_coda_node(fid, vfsp, type) ViceFid *fid; struct mount *vfsp; short type; { struct cnode *cp; int err; if ((cp = coda_find(fid)) == NULL) { struct vnode *vp; cp = coda_alloc(); lockinit(&cp->c_lock, PINOD, "cnode", 0, 0); cp->c_fid = *fid; err = getnewvnode(VT_CODA, vfsp, coda_vnodeop_p, &vp); if (err) { panic("coda: getnewvnode returned error %d\n", err); } vp->v_data = cp; vp->v_type = type; cp->c_vnode = vp; coda_save(cp); } else { vref(CTOV(cp)); } return cp; } Index: head/sys/fs/hpfs/hpfs_vnops.c =================================================================== --- head/sys/fs/hpfs/hpfs_vnops.c (revision 75579) +++ head/sys/fs/hpfs/hpfs_vnops.c (revision 75580) @@ -1,1426 +1,1425 @@ /*- * Copyright (c) 1998, 1999 Semen Ustimenko (semenu@FreeBSD.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if !defined(__FreeBSD__) #include #endif #include #include #include #include #if defined(__FreeBSD__) #include #endif #include #if !defined(__FreeBSD__) #include #include #endif #include /* for pathconf(2) constants */ #include #include #include #include static int hpfs_de_uiomove __P((struct hpfsmount *, struct hpfsdirent *, struct uio *)); static int hpfs_ioctl __P((struct vop_ioctl_args *ap)); static int hpfs_bypass __P((struct vop_generic_args *ap)); static int hpfs_read __P((struct vop_read_args *)); static int hpfs_write __P((struct vop_write_args *ap)); static int hpfs_getattr __P((struct vop_getattr_args *ap)); static int hpfs_setattr __P((struct vop_setattr_args *ap)); static int hpfs_inactive __P((struct vop_inactive_args *ap)); static int hpfs_print __P((struct vop_print_args *ap)); static int hpfs_reclaim __P((struct vop_reclaim_args *ap)); static int hpfs_strategy __P((struct vop_strategy_args *ap)); static int hpfs_access __P((struct vop_access_args *ap)); static int hpfs_open __P((struct vop_open_args *ap)); static int hpfs_close __P((struct vop_close_args *ap)); static int hpfs_readdir __P((struct vop_readdir_args *ap)); static int hpfs_lookup __P((struct vop_lookup_args *ap)); static int hpfs_create __P((struct vop_create_args *)); static int hpfs_remove __P((struct 
vop_remove_args *)); static int hpfs_bmap __P((struct vop_bmap_args *ap)); #if defined(__FreeBSD__) static int hpfs_getpages __P((struct vop_getpages_args *ap)); static int hpfs_putpages __P((struct vop_putpages_args *)); static int hpfs_fsync __P((struct vop_fsync_args *ap)); #else static int hpfs_abortop __P((struct vop_abortop_args *)); #endif static int hpfs_pathconf __P((struct vop_pathconf_args *ap)); #if defined(__FreeBSD__) int hpfs_getpages(ap) struct vop_getpages_args *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } int hpfs_putpages(ap) struct vop_putpages_args *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } static int hpfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; int s; struct buf *bp, *nbp; /* * Flush all dirty buffers associated with a vnode. */ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("hpfs_fsync: not dirty"); bremfree(bp); splx(s); (void) bwrite(bp); goto loop; } while (vp->v_numoutput) { vp->v_flag |= VBWAIT; (void) tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "hpfsn", 0); } #ifdef DIAGNOSTIC if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vprint("hpfs_fsync: dirty", vp); goto loop; } #endif splx(s); /* * Write out the on-disc version of the vnode. 
*/ return hpfs_update(VTOHP(vp)); } #endif static int hpfs_ioctl ( struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap) { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); int error; printf("hpfs_ioctl(0x%x, 0x%lx, 0x%p, 0x%x): ", hp->h_no, ap->a_command, ap->a_data, ap->a_fflag); switch (ap->a_command) { case HPFSIOCGEANUM: { u_long eanum; u_long passed; struct ea *eap; eanum = 0; if (hp->h_fn.fn_ealen > 0) { eap = (struct ea *)&(hp->h_fn.fn_int); passed = 0; while (passed < hp->h_fn.fn_ealen) { printf("EAname: %s\n", EA_NAME(eap)); eanum++; passed += sizeof(struct ea) + eap->ea_namelen + 1 + eap->ea_vallen; eap = (struct ea *)((caddr_t)hp->h_fn.fn_int + passed); } error = 0; } else { error = ENOENT; } printf("%lu eas\n", eanum); *(u_long *)ap->a_data = eanum; break; } case HPFSIOCGEASZ: { u_long eanum; u_long passed; struct ea *eap; printf("EA%ld\n", *(u_long *)ap->a_data); eanum = 0; if (hp->h_fn.fn_ealen > 0) { eap = (struct ea *)&(hp->h_fn.fn_int); passed = 0; error = ENOENT; while (passed < hp->h_fn.fn_ealen) { printf("EAname: %s\n", EA_NAME(eap)); if (eanum == *(u_long *)ap->a_data) { *(u_long *)ap->a_data = eap->ea_namelen + 1 + eap->ea_vallen; error = 0; break; } eanum++; passed += sizeof(struct ea) + eap->ea_namelen + 1 + eap->ea_vallen; eap = (struct ea *)((caddr_t)hp->h_fn.fn_int + passed); } } else { error = ENOENT; } break; } case HPFSIOCRDEA: { u_long eanum; u_long passed; struct hpfs_rdea *rdeap; struct ea *eap; rdeap = (struct hpfs_rdea *)ap->a_data; printf("EA%ld\n", rdeap->ea_no); eanum = 0; if (hp->h_fn.fn_ealen > 0) { eap = (struct ea *)&(hp->h_fn.fn_int); passed = 0; error = ENOENT; while (passed < hp->h_fn.fn_ealen) { printf("EAname: %s\n", EA_NAME(eap)); if (eanum == rdeap->ea_no) { rdeap->ea_sz = eap->ea_namelen + 1 + eap->ea_vallen; copyout(EA_NAME(eap),rdeap->ea_data, rdeap->ea_sz); error = 0; break; } eanum++; passed 
+= sizeof(struct ea) + eap->ea_namelen + 1 + eap->ea_vallen; eap = (struct ea *)((caddr_t)hp->h_fn.fn_int + passed); } } else { error = ENOENT; } break; } default: error = EOPNOTSUPP; break; } return (error); } /* * Map file offset to disk offset. */ int hpfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { register struct hpfsnode *hp = VTOHP(ap->a_vp); int error; if (ap->a_vpp != NULL) *ap->a_vpp = hp->h_devvp; #if defined(__FreeBSD__) if (ap->a_runb != NULL) *ap->a_runb = 0; #endif if (ap->a_bnp == NULL) return (0); dprintf(("hpfs_bmap(0x%x, 0x%x): ",hp->h_no, ap->a_bn)); error = hpfs_hpbmap (hp, ap->a_bn, ap->a_bnp, ap->a_runp); return (error); } static int hpfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); struct uio *uio = ap->a_uio; struct buf *bp; u_int xfersz, toread; u_int off; daddr_t lbn, bn; int resid; int runl; int error = 0; resid = min (uio->uio_resid, hp->h_fn.fn_size - uio->uio_offset); dprintf(("hpfs_read(0x%x, off: %d resid: %d, segflg: %d): [resid: 0x%x]\n",hp->h_no,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg, resid)); while (resid) { lbn = uio->uio_offset >> DEV_BSHIFT; off = uio->uio_offset & (DEV_BSIZE - 1); dprintf(("hpfs_read: resid: 0x%x lbn: 0x%x off: 0x%x\n", uio->uio_resid, lbn, off)); error = hpfs_hpbmap(hp, lbn, &bn, &runl); if (error) return (error); toread = min(off + resid, min(DFLTPHYS, (runl+1)*DEV_BSIZE)); xfersz = (toread + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); dprintf(("hpfs_read: bn: 0x%x (0x%x) toread: 0x%x (0x%x)\n", bn, runl, toread, xfersz)); if (toread == 0) break; error = bread(hp->h_devvp, bn, xfersz, NOCRED, &bp); if (error) { brelse(bp); break; } error = uiomove(bp->b_data + off, toread - off, uio); if(error) { brelse(bp); break; } brelse(bp); resid -= 
toread; } dprintf(("hpfs_read: successful\n")); return (error); } static int hpfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); struct uio *uio = ap->a_uio; struct buf *bp; u_int xfersz, towrite; u_int off; daddr_t lbn, bn; int runl; int error = 0; dprintf(("hpfs_write(0x%x, off: %d resid: %d, segflg: %d):\n",hp->h_no,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); if (ap->a_ioflag & IO_APPEND) { dprintf(("hpfs_write: APPEND mode\n")); uio->uio_offset = hp->h_fn.fn_size; } if (uio->uio_offset + uio->uio_resid > hp->h_fn.fn_size) { error = hpfs_extend (hp, uio->uio_offset + uio->uio_resid); if (error) { printf("hpfs_write: hpfs_extend FAILED %d\n", error); return (error); } } while (uio->uio_resid) { lbn = uio->uio_offset >> DEV_BSHIFT; off = uio->uio_offset & (DEV_BSIZE - 1); dprintf(("hpfs_write: resid: 0x%x lbn: 0x%x off: 0x%x\n", uio->uio_resid, lbn, off)); error = hpfs_hpbmap(hp, lbn, &bn, &runl); if (error) return (error); towrite = min(off + uio->uio_resid, min(DFLTPHYS, (runl+1)*DEV_BSIZE)); xfersz = (towrite + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); dprintf(("hpfs_write: bn: 0x%x (0x%x) towrite: 0x%x (0x%x)\n", bn, runl, towrite, xfersz)); if ((off == 0) && (towrite == xfersz)) { bp = getblk(hp->h_devvp, bn, xfersz, 0, 0); clrbuf(bp); } else { error = bread(hp->h_devvp, bn, xfersz, NOCRED, &bp); if (error) { brelse(bp); return (error); } } error = uiomove(bp->b_data + off, towrite - off, uio); if(error) { brelse(bp); return (error); } if (ap->a_ioflag & IO_SYNC) bwrite(bp); else bawrite(bp); } dprintf(("hpfs_write: successful\n")); return (0); } static int hpfs_bypass(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { dprintf(("hpfs_bypass: %s\n", ap->a_desc->vdesc_name)); return (0); } /* * XXXXX do we need hpfsnode locking inside? 
*/ static int hpfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); register struct vattr *vap = ap->a_vap; int error; dprintf(("hpfs_getattr(0x%x):\n", hp->h_no)); #if defined(__FreeBSD__) vap->va_fsid = dev2udev(hp->h_dev); #else /* defined(__NetBSD__) */ vap->va_fsid = ip->i_dev; #endif vap->va_fileid = hp->h_no; vap->va_mode = hp->h_mode; vap->va_nlink = 1; vap->va_uid = hp->h_uid; vap->va_gid = hp->h_gid; vap->va_rdev = 0; /* XXX UNODEV ? */ vap->va_size = hp->h_fn.fn_size; vap->va_bytes = ((hp->h_fn.fn_size + DEV_BSIZE-1) & ~(DEV_BSIZE-1)) + DEV_BSIZE; if (!(hp->h_flag & H_PARVALID)) { error = hpfs_validateparent(hp); if (error) return (error); } vap->va_atime = hpfstimetounix(hp->h_atime); vap->va_mtime = hpfstimetounix(hp->h_mtime); vap->va_ctime = hpfstimetounix(hp->h_ctime); vap->va_flags = 0; vap->va_gen = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * XXXXX do we need hpfsnode locking inside? */ static int hpfs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct hpfsnode *hp = VTOHP(vp); struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; int error; dprintf(("hpfs_setattr(0x%x):\n", hp->h_no)); /* * Check for unsettable attributes. 
*/ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { dprintf(("hpfs_setattr: changing nonsettable attr\n")); return (EINVAL); } /* Can't change flags XXX Could be implemented */ if (vap->va_flags != VNOVAL) { printf("hpfs_setattr: FLAGS CANNOT BE SET\n"); return (EINVAL); } /* Can't change uid/gid XXX Could be implemented */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { printf("hpfs_setattr: UID/GID CANNOT BE SET\n"); return (EINVAL); } /* Can't change mode XXX Could be implemented */ if (vap->va_mode != (mode_t)VNOVAL) { printf("hpfs_setattr: MODE CANNOT BE SET\n"); return (EINVAL); } /* Update times */ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != hp->h_uid && (error = suser_xxx(cred, p, PRISON_ROOT)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, p)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) hp->h_atime = vap->va_atime.tv_sec; if (vap->va_mtime.tv_sec != VNOVAL) hp->h_mtime = vap->va_mtime.tv_sec; hp->h_flag |= H_PARCHANGE; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: printf("hpfs_setattr: WRONG v_type\n"); return (EINVAL); } if (vap->va_size < hp->h_fn.fn_size) { #if defined(__FreeBSD__) error = vtruncbuf(vp, cred, p, vap->va_size, DEV_BSIZE); if (error) return (error); #else /* defined(__NetBSD__) */ #error Need alternation for vtruncbuf() #endif error = hpfs_truncate(hp, vap->va_size); if (error) return (error); } else if (vap->va_size > hp->h_fn.fn_size) { #if defined(__FreeBSD__) vnode_pager_setsize(vp, vap->va_size); #endif error = hpfs_extend(hp, vap->va_size); if (error) return 
(error); } } return (0); } /* * Last reference to an node. If necessary, write or delete it. */ int hpfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); int error; dprintf(("hpfs_inactive(0x%x): \n", hp->h_no)); if (hp->h_flag & H_CHANGE) { dprintf(("hpfs_inactive: node changed, update\n")); error = hpfs_update (hp); if (error) return (error); } if (hp->h_flag & H_PARCHANGE) { dprintf(("hpfs_inactive: parent node changed, update\n")); error = hpfs_updateparent (hp); if (error) return (error); } if (prtactive && vp->v_usecount != 0) vprint("hpfs_inactive: pushing active", vp); if (hp->h_flag & H_INVAL) { VOP__UNLOCK(vp,0,ap->a_p); #if defined(__FreeBSD__) vrecycle(vp, NULL, ap->a_p); #else /* defined(__NetBSD__) */ vgone(vp); #endif return (0); } VOP__UNLOCK(vp,0,ap->a_p); return (0); } /* * Reclaim an inode so that it can be used for other purposes. */ int hpfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); dprintf(("hpfs_reclaim(0x%x0): \n", hp->h_no)); hpfs_hphashrem(hp); /* Purge old data structures associated with the inode. */ cache_purge(vp); if (hp->h_devvp) { vrele(hp->h_devvp); hp->h_devvp = NULL; } lockdestroy(&hp->h_lock); mtx_destroy(&hp->h_interlock); vp->v_data = NULL; FREE(hp, M_HPFSNO); return (0); } static int hpfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); printf("tag VT_HPFS, ino 0x%x",hp->h_no); lockmgr_printinfo(&hp->h_lock); printf("\n"); return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the VOP_BMAP operation may not * deadlock on memory. See hpfs_bmap() for details. 
XXXXXXX (not impl) */ int hpfs_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = ap->a_vp; struct vnode *nvp; int error; dprintf(("hpfs_strategy(): \n")); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("hpfs_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, &nvp, &bp->b_blkno, NULL, NULL); if (error) { printf("hpfs_strategy: VOP_BMAP FAILED %d\n", error); bp->b_error = error; bp->b_ioflags |= BIO_ERROR; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { biodone(bp); return (0); } bp->b_dev = nvp->v_rdev; VOP_STRATEGY(nvp, bp); return (0); } /* * XXXXX do we need hpfsnode locking inside? */ int hpfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct hpfsnode *hp = VTOHP(vp); mode_t mode = ap->a_mode; dprintf(("hpfs_access(0x%x):\n", hp->h_no)); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch ((int)vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; } } return (vaccess(vp->v_type, hp->h_mode, hp->h_uid, hp->h_gid, ap->a_mode, ap->a_cred, NULL)); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ static int hpfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if HPFS_DEBUG register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); printf("hpfs_open(0x%x):\n",hp->h_no); #endif /* * Files marked append-only must be opened for appending. */ return (0); } /* * Close called. * * Update the times on the inode. 
*/ /* ARGSUSED */ static int hpfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if HPFS_DEBUG register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); printf("hpfs_close: %d\n",hp->h_no); #endif return (0); } static int hpfs_de_uiomove ( struct hpfsmount *hpmp, struct hpfsdirent *dep, struct uio *uio) { struct dirent cde; int i, error; dprintf(("[no: 0x%x, size: %d, name: %2d:%.*s, flag: 0x%x] ", dep->de_fnode, dep->de_size, dep->de_namelen, dep->de_namelen, dep->de_name, dep->de_flag)); /*strncpy(cde.d_name, dep->de_name, dep->de_namelen);*/ for (i=0; ide_namelen; i++) cde.d_name[i] = hpfs_d2u(hpmp, dep->de_name[i]); cde.d_name[dep->de_namelen] = '\0'; cde.d_namlen = dep->de_namelen; cde.d_fileno = dep->de_fnode; cde.d_type = (dep->de_flag & DE_DIR) ? DT_DIR : DT_REG; cde.d_reclen = sizeof(struct dirent); error = uiomove((char *)&cde, sizeof(struct dirent), uio); if (error) return (error); dprintf(("[0x%x] ", uio->uio_resid)); return (error); } static struct dirent hpfs_de_dot = { 0, sizeof(struct dirent), DT_DIR, 1, "." }; static struct dirent hpfs_de_dotdot = { 0, sizeof(struct dirent), DT_DIR, 2, ".." }; int hpfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_ncookies; u_int **cookies; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct hpfsnode *hp = VTOHP(vp); struct hpfsmount *hpmp = hp->h_hpmp; struct uio *uio = ap->a_uio; int ncookies = 0, i, num, cnum; int error = 0; off_t off; struct buf *bp; struct dirblk *dp; struct hpfsdirent *dep; lsn_t olsn; lsn_t lsn; int level; dprintf(("hpfs_readdir(0x%x, 0x%x, 0x%x): ",hp->h_no,(u_int32_t)uio->uio_offset,uio->uio_resid)); off = uio->uio_offset; if( uio->uio_offset < sizeof(struct dirent) ) { dprintf((". 
faked, ")); hpfs_de_dot.d_fileno = hp->h_no; error = uiomove((char *)&hpfs_de_dot,sizeof(struct dirent),uio); if(error) { return (error); } ncookies ++; } if( uio->uio_offset < 2 * sizeof(struct dirent) ) { dprintf((".. faked, ")); hpfs_de_dotdot.d_fileno = hp->h_fn.fn_parent; error = uiomove((char *)&hpfs_de_dotdot, sizeof(struct dirent), uio); if(error) { return (error); } ncookies ++; } num = uio->uio_offset / sizeof(struct dirent) - 2; cnum = 0; lsn = ((alleaf_t *)hp->h_fn.fn_abd)->al_lsn; olsn = 0; level = 1; dive: dprintf(("[dive 0x%x] ", lsn)); error = bread(hp->h_devvp, lsn, D_BSIZE, NOCRED, &bp); if (error) { brelse(bp); return (error); } dp = (struct dirblk *) bp->b_data; if (dp->d_magic != D_MAGIC) { printf("hpfs_readdir: MAGIC DOESN'T MATCH\n"); brelse(bp); return (EINVAL); } dep = D_DIRENT(dp); if (olsn) { dprintf(("[restore 0x%x] ", olsn)); while(!(dep->de_flag & DE_END) ) { if((dep->de_flag & DE_DOWN) && (olsn == DE_DOWNLSN(dep))) break; dep = (hpfsdirent_t *)((caddr_t)dep + dep->de_reclen); } if((dep->de_flag & DE_DOWN) && (olsn == DE_DOWNLSN(dep))) { if (dep->de_flag & DE_END) goto blockdone; if (!(dep->de_flag & DE_SPECIAL)) { if (num <= cnum) { if (uio->uio_resid < sizeof(struct dirent)) { brelse(bp); dprintf(("[resid] ")); goto readdone; } error = hpfs_de_uiomove(hpmp, dep, uio); if (error) { brelse (bp); return (error); } ncookies++; if (uio->uio_resid < sizeof(struct dirent)) { brelse(bp); dprintf(("[resid] ")); goto readdone; } } cnum++; } dep = (hpfsdirent_t *)((caddr_t)dep + dep->de_reclen); } else { printf("hpfs_readdir: ERROR! 
oLSN not found\n"); brelse(bp); return (EINVAL); } } olsn = 0; while(!(dep->de_flag & DE_END)) { if(dep->de_flag & DE_DOWN) { lsn = DE_DOWNLSN(dep); brelse(bp); level++; goto dive; } if (!(dep->de_flag & DE_SPECIAL)) { if (num <= cnum) { if (uio->uio_resid < sizeof(struct dirent)) { brelse(bp); dprintf(("[resid] ")); goto readdone; } error = hpfs_de_uiomove(hpmp, dep, uio); if (error) { brelse (bp); return (error); } ncookies++; if (uio->uio_resid < sizeof(struct dirent)) { brelse(bp); dprintf(("[resid] ")); goto readdone; } } cnum++; } dep = (hpfsdirent_t *)((caddr_t)dep + dep->de_reclen); } if(dep->de_flag & DE_DOWN) { dprintf(("[enddive] ")); lsn = DE_DOWNLSN(dep); brelse(bp); level++; goto dive; } blockdone: dprintf(("[EOB] ")); olsn = lsn; lsn = dp->d_parent; brelse(bp); level--; dprintf(("[level %d] ", level)); if (level > 0) goto dive; /* undive really */ if (ap->a_eofflag) { dprintf(("[EOF] ")); *ap->a_eofflag = 1; } readdone: dprintf(("[readdone]\n")); if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dp; #if defined(__FreeBSD__) u_long *cookies; u_long *cookiep; #else /* defined(__NetBSD__) */ off_t *cookies; off_t *cookiep; #endif dprintf(("%d cookies, ",ncookies)); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("hpfs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) ((caddr_t)uio->uio_iov->iov_base - (uio->uio_offset - off)); #if defined(__FreeBSD__) MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); #else /* defined(__NetBSD__) */ MALLOC(cookies, off_t *, ncookies * sizeof(off_t), M_TEMP, M_WAITOK); #endif for (dp = dpStart, cookiep = cookies, i=0; i < ncookies; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen), i++) { off += dp->d_reclen; *cookiep++ = (u_int) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } return (0); } int hpfs_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ 
*ap; { register struct vnode *dvp = ap->a_dvp; register struct hpfsnode *dhp = VTOHP(dvp); struct hpfsmount *hpmp = dhp->h_hpmp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int error; int nameiop = cnp->cn_nameiop; int flags = cnp->cn_flags; int lockparent = flags & LOCKPARENT; #if HPFS_DEBUG int wantparent = flags & (LOCKPARENT|WANTPARENT); #endif dprintf(("hpfs_lookup(0x%x, %s, %ld, %d, %d): \n", dhp->h_no, cnp->cn_nameptr, cnp->cn_namelen, lockparent, wantparent)); if (nameiop != CREATE && nameiop != DELETE && nameiop != LOOKUP) { printf("hpfs_lookup: LOOKUP, DELETE and CREATE are only supported\n"); return (EOPNOTSUPP); } error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_proc); if(error) return (error); if( (cnp->cn_namelen == 1) && !strncmp(cnp->cn_nameptr,".",1) ) { dprintf(("hpfs_lookup(0x%x,...): . faked\n",dhp->h_no)); VREF(dvp); *ap->a_vpp = dvp; return (0); } else if( (cnp->cn_namelen == 2) && !strncmp(cnp->cn_nameptr,"..",2) && (flags & ISDOTDOT) ) { dprintf(("hpfs_lookup(0x%x,...): .. 
faked (0x%x)\n", dhp->h_no, dhp->h_fn.fn_parent)); VOP__UNLOCK(dvp,0,cnp->cn_proc); error = VFS_VGET(hpmp->hpm_mp, dhp->h_fn.fn_parent, ap->a_vpp); if(error) { VOP__LOCK(dvp, 0, cnp->cn_proc); return(error); } if( lockparent && (flags & ISLASTCN) && (error = VOP__LOCK(dvp, 0, cnp->cn_proc)) ) { vput( *(ap->a_vpp) ); return (error); } return (error); } else { struct buf *bp; struct hpfsdirent *dep; struct hpfsnode *hp; error = hpfs_genlookupbyname(dhp, cnp->cn_nameptr, cnp->cn_namelen, &bp, &dep); if (error) { if ((error == ENOENT) && (flags & ISLASTCN) && (nameiop == CREATE || nameiop == RENAME)) { if(!lockparent) VOP__UNLOCK(dvp, 0, cnp->cn_proc); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (error); } dprintf(("hpfs_lookup: fnode: 0x%x, CPID: 0x%x\n", dep->de_fnode, dep->de_cpid)); if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_proc); if (error) { brelse(bp); return (error); } } if (dhp->h_no == dep->de_fnode) { brelse(bp); VREF(dvp); *ap->a_vpp = dvp; return (0); } error = VFS_VGET(hpmp->hpm_mp, dep->de_fnode, ap->a_vpp); if (error) { printf("hpfs_lookup: VFS_VGET FAILED %d\n", error); brelse(bp); return(error); } hp = VTOHP(*ap->a_vpp); hp->h_mtime = dep->de_mtime; hp->h_ctime = dep->de_ctime; hp->h_atime = dep->de_atime; bcopy(dep->de_name, hp->h_name, dep->de_namelen); hp->h_name[dep->de_namelen] = '\0'; hp->h_namelen = dep->de_namelen; hp->h_flag |= H_PARVALID; brelse(bp); if(!lockparent || !(flags & ISLASTCN)) VOP__UNLOCK(dvp, 0, cnp->cn_proc); if ((flags & MAKEENTRY) && (!(flags & ISLASTCN) || (nameiop != DELETE && nameiop != CREATE))) cache_enter(dvp, *ap->a_vpp, cnp); } return (error); } int hpfs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { int error; dprintf(("hpfs_remove(0x%x, %s, %ld): \n", VTOHP(ap->a_vp)->h_no, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen)); if (ap->a_vp->v_type == VDIR) return (EPERM); error = 
hpfs_removefnode (ap->a_dvp, ap->a_vp, ap->a_cnp); return (error); } int hpfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; dprintf(("hpfs_create(0x%x, %s, %ld): \n", VTOHP(ap->a_dvp)->h_no, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen)); if (!(ap->a_cnp->cn_flags & HASBUF)) panic ("hpfs_create: no name\n"); error = hpfs_makefnode (ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap); return (error); } /* * Return POSIX pathconf information applicable to NTFS filesystem */ int hpfs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = HPFS_MAXFILENAME; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); #if defined(__NetBSD__) case _PC_SYNC_IO: *ap->a_retval = 1; return (0); case _PC_FILESIZEBITS: *ap->a_retval = 32; return (0); #endif default: return (EINVAL); } /* NOTREACHED */ } /* * Global vfs data structures */ vop_t **hpfs_vnodeop_p; #if defined(__FreeBSD__) struct vnodeopv_entry_desc hpfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *)hpfs_bypass }, { &vop_getattr_desc, (vop_t *)hpfs_getattr }, { &vop_setattr_desc, (vop_t *)hpfs_setattr }, { &vop_inactive_desc, (vop_t *)hpfs_inactive }, { &vop_reclaim_desc, (vop_t *)hpfs_reclaim }, { &vop_print_desc, (vop_t *)hpfs_print }, { &vop_create_desc, (vop_t *)hpfs_create }, { &vop_remove_desc, (vop_t *)hpfs_remove }, { &vop_islocked_desc, (vop_t *)vop_stdislocked }, { &vop_unlock_desc, (vop_t *)vop_stdunlock }, { &vop_lock_desc, (vop_t *)vop_stdlock }, { &vop_cachedlookup_desc, (vop_t *)hpfs_lookup }, { &vop_lookup_desc, (vop_t *)vfs_cache_lookup }, { &vop_access_desc, (vop_t *)hpfs_access }, { &vop_close_desc, (vop_t *)hpfs_close 
}, { &vop_open_desc, (vop_t *)hpfs_open }, { &vop_readdir_desc, (vop_t *)hpfs_readdir }, { &vop_fsync_desc, (vop_t *)hpfs_fsync }, { &vop_bmap_desc, (vop_t *)hpfs_bmap }, { &vop_getpages_desc, (vop_t *) hpfs_getpages }, { &vop_putpages_desc, (vop_t *) hpfs_putpages }, { &vop_strategy_desc, (vop_t *)hpfs_strategy }, - { &vop_bwrite_desc, (vop_t *)vop_stdbwrite }, { &vop_read_desc, (vop_t *)hpfs_read }, { &vop_write_desc, (vop_t *)hpfs_write }, { &vop_ioctl_desc, (vop_t *)hpfs_ioctl }, { &vop_pathconf_desc, (vop_t *)hpfs_pathconf }, { NULL, NULL } }; static struct vnodeopv_desc hpfs_vnodeop_opv_desc = { &hpfs_vnodeop_p, hpfs_vnodeop_entries }; VNODEOP_SET(hpfs_vnodeop_opv_desc); #else /* defined(__NetBSD__) */ struct vnodeopv_entry_desc ntfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) hpfs_bypass }, { &vop_lookup_desc, (vop_t *) hpfs_lookup }, /* lookup */ { &vop_create_desc, genfs_eopnotsupp }, /* create */ { &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */ { &vop_open_desc, (vop_t *) hpfs_open }, /* open */ { &vop_close_desc,(vop_t *) hpfs_close }, /* close */ { &vop_access_desc, (vop_t *) hpfs_access }, /* access */ { &vop_getattr_desc, (vop_t *) hpfs_getattr }, /* getattr */ { &vop_setattr_desc, genfs_eopnotsupp }, /* setattr */ { &vop_read_desc, (vop_t *) hpfs_read }, /* read */ { &vop_write_desc, (vop_t *) hpfs_write }, /* write */ { &vop_lease_desc, genfs_lease_check }, /* lease */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */ { &vop_poll_desc, genfs_poll }, /* poll */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_fsync_desc, genfs_fsync }, /* fsync */ { &vop_seek_desc, genfs_seek }, /* seek */ { &vop_remove_desc, genfs_eopnotsupp }, /* remove */ { &vop_link_desc, genfs_eopnotsupp }, /* link */ { &vop_rename_desc, genfs_eopnotsupp }, /* rename */ { &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */ { &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */ { &vop_symlink_desc, genfs_eopnotsupp }, /* 
symlink */ { &vop_readdir_desc, (vop_t *) hpfs_readdir }, /* readdir */ { &vop_readlink_desc, genfs_eopnotsupp }, /* readlink */ { &vop_abortop_desc, genfs_abortop }, /* abortop */ { &vop_inactive_desc, (vop_t *) hpfs_inactive }, /* inactive */ { &vop_reclaim_desc, (vop_t *) hpfs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, (vop_t *) hpfs_bmap }, /* bmap */ { &vop_strategy_desc, (vop_t *) hpfs_strategy }, /* strategy */ { &vop_print_desc, (vop_t *) hpfs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, hpfs_pathconf }, /* pathconf */ { &vop_advlock_desc, genfs_nullop }, /* advlock */ { &vop_blkatoff_desc, genfs_eopnotsupp }, /* blkatoff */ { &vop_valloc_desc, genfs_eopnotsupp }, /* valloc */ { &vop_reallocblks_desc, genfs_eopnotsupp }, /* reallocblks */ { &vop_vfree_desc, genfs_eopnotsupp }, /* vfree */ { &vop_truncate_desc, genfs_eopnotsupp }, /* truncate */ { &vop_update_desc, genfs_eopnotsupp }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { (struct vnodeop_desc *)NULL, (int (*) __P((void *)))NULL } }; struct vnodeopv_desc ntfs_vnodeop_opv_desc = { &ntfs_vnodeop_p, ntfs_vnodeop_entries }; #endif Index: head/sys/fs/ntfs/ntfs_vnops.c =================================================================== --- head/sys/fs/ntfs/ntfs_vnops.c (revision 75579) +++ head/sys/fs/ntfs/ntfs_vnops.c (revision 75580) @@ -1,943 +1,942 @@ /* $NetBSD: ntfs_vnops.c,v 1.23 1999/10/31 19:45:27 jdolecek Exp $ */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__NetBSD__) #include #endif #include #include #include #if defined(__FreeBSD__) #include #endif #include #include /*#define NTFS_DEBUG 1*/ #include #include #include #if defined(__NetBSD__) #include #include #endif #include /* for pathconf(2) constants */ static int ntfs_read __P((struct vop_read_args *)); static int ntfs_write __P((struct vop_write_args *ap)); static int ntfs_getattr __P((struct vop_getattr_args *ap)); static int ntfs_inactive __P((struct vop_inactive_args *ap)); static int ntfs_print __P((struct vop_print_args *ap)); static int ntfs_reclaim __P((struct vop_reclaim_args *ap)); static int ntfs_strategy __P((struct vop_strategy_args *ap)); static int ntfs_access __P((struct vop_access_args *ap)); static int ntfs_open __P((struct vop_open_args *ap)); static int ntfs_close __P((struct vop_close_args *ap)); static int ntfs_readdir __P((struct vop_readdir_args *ap)); static int ntfs_lookup __P((struct vop_lookup_args *ap)); static int ntfs_bmap __P((struct vop_bmap_args *ap)); #if defined(__FreeBSD__) static int ntfs_getpages __P((struct vop_getpages_args *ap)); static int ntfs_putpages __P((struct vop_putpages_args *)); static int ntfs_fsync __P((struct vop_fsync_args *ap)); #else static int ntfs_bypass __P((struct vop_generic_args *ap)); #endif static int ntfs_pathconf __P((void *)); int ntfs_prtactive = 1; /* 1 => print out reclaim of active vnodes */ #if defined(__FreeBSD__) int ntfs_getpages(ap) struct vop_getpages_args *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } int ntfs_putpages(ap) struct vop_putpages_args *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } #endif /* * This is a noop, simply returning what one has been given. 
*/ int ntfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { dprintf(("ntfs_bmap: vn: %p, blk: %d\n", ap->a_vp,(u_int32_t)ap->a_bn)); if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; #if !defined(__NetBSD__) if (ap->a_runb != NULL) *ap->a_runb = 0; #endif return (0); } static int ntfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; u_int64_t toread; int error; dprintf(("ntfs_read: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); dprintf(("ntfs_read: filesize: %d",(u_int32_t)fp->f_size)); /* don't allow reading after end of file */ if (uio->uio_offset > fp->f_size) toread = 0; else toread = min( uio->uio_resid, fp->f_size - uio->uio_offset ); dprintf((", toread: %d\n",(u_int32_t)toread)); if (toread == 0) return (0); error = ntfs_readattr(ntmp, ip, fp->f_attrtype, fp->f_attrname, uio->uio_offset, toread, NULL, uio); if (error) { printf("ntfs_read: ntfs_readattr failed: %d\n",error); return (error); } return (0); } #if !defined(__FreeBSD__) static int ntfs_bypass(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { int error = ENOTTY; dprintf(("ntfs_bypass: %s\n", ap->a_desc->vdesc_name)); return (error); } #endif static int ntfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); register struct vattr *vap = ap->a_vap; dprintf(("ntfs_getattr: %d, flags: 
%d\n",ip->i_number,ip->i_flag)); #if defined(__FreeBSD__) vap->va_fsid = dev2udev(ip->i_dev); #else /* NetBSD */ vap->va_fsid = ip->i_dev; #endif vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mp->ntm_mode; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_mp->ntm_uid; vap->va_gid = ip->i_mp->ntm_gid; vap->va_rdev = 0; /* XXX UNODEV ? */ vap->va_size = fp->f_size; vap->va_bytes = fp->f_allocated; vap->va_atime = ntfs_nttimetounix(fp->f_times.t_access); vap->va_mtime = ntfs_nttimetounix(fp->f_times.t_write); vap->va_ctime = ntfs_nttimetounix(fp->f_times.t_create); vap->va_flags = ip->i_flag; vap->va_gen = 0; vap->va_blocksize = ip->i_mp->ntm_spc * ip->i_mp->ntm_bps; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Last reference to an ntnode. If necessary, write or delete it. */ int ntfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; #ifdef NTFS_DEBUG register struct ntnode *ip = VTONT(vp); #endif dprintf(("ntfs_inactive: vnode: %p, ntnode: %d\n", vp, ip->i_number)); if (ntfs_prtactive && vp->v_usecount != 0) vprint("ntfs_inactive: pushing active", vp); VOP__UNLOCK(vp, 0, ap->a_p); /* XXX since we don't support any filesystem changes * right now, nothing more needs to be done */ return (0); } /* * Reclaim an fnode/ntnode so that it can be used for other purposes. */ int ntfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); int error; dprintf(("ntfs_reclaim: vnode: %p, ntnode: %d\n", vp, ip->i_number)); if (ntfs_prtactive && vp->v_usecount != 0) vprint("ntfs_reclaim: pushing active", vp); if ((error = ntfs_ntget(ip)) != 0) return (error); /* Purge old data structures associated with the inode. 
*/ cache_purge(vp); if (ip->i_devvp) { vrele(ip->i_devvp); ip->i_devvp = NULL; } ntfs_frele(fp); ntfs_ntput(ip); vp->v_data = NULL; return (0); } static int ntfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int ntfs_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = bp->b_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct ntfsmount *ntmp = ip->i_mp; int error; #ifdef __FreeBSD__ dprintf(("ntfs_strategy: offset: %d, blkno: %d, lblkno: %d\n", (u_int32_t)bp->b_offset,(u_int32_t)bp->b_blkno, (u_int32_t)bp->b_lblkno)); #else dprintf(("ntfs_strategy: blkno: %d, lblkno: %d\n", (u_int32_t)bp->b_blkno, (u_int32_t)bp->b_lblkno)); #endif dprintf(("strategy: bcount: %d flags: 0x%lx\n", (u_int32_t)bp->b_bcount,bp->b_flags)); if (bp->b_iocmd == BIO_READ) { u_int32_t toread; if (ntfs_cntob(bp->b_blkno) >= fp->f_size) { clrbuf(bp); error = 0; } else { toread = min(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: toread: %d, fsize: %d\n", toread,(u_int32_t)fp->f_size)); error = ntfs_readattr(ntmp, ip, fp->f_attrtype, fp->f_attrname, ntfs_cntob(bp->b_blkno), toread, bp->b_data, NULL); if (error) { printf("ntfs_strategy: ntfs_readattr failed\n"); bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } bzero(bp->b_data + toread, bp->b_bcount - toread); } } else { size_t tmp; u_int32_t towrite; if (ntfs_cntob(bp->b_blkno) + bp->b_bcount >= fp->f_size) { printf("ntfs_strategy: CAN'T EXTEND FILE\n"); bp->b_error = error = EFBIG; bp->b_ioflags |= BIO_ERROR; } else { towrite = min(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: towrite: %d, fsize: %d\n", towrite,(u_int32_t)fp->f_size)); error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype, fp->f_attrname, 
ntfs_cntob(bp->b_blkno),towrite, bp->b_data, &tmp, NULL); if (error) { printf("ntfs_strategy: ntfs_writeattr fail\n"); bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } } } bufdone(bp); return (error); } static int ntfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; u_int64_t towrite; size_t written; int error; dprintf(("ntfs_write: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); dprintf(("ntfs_write: filesize: %d",(u_int32_t)fp->f_size)); if (uio->uio_resid + uio->uio_offset > fp->f_size) { printf("ntfs_write: CAN'T WRITE BEYOND END OF FILE\n"); return (EFBIG); } towrite = min(uio->uio_resid, fp->f_size - uio->uio_offset); dprintf((", towrite: %d\n",(u_int32_t)towrite)); error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype, fp->f_attrname, uio->uio_offset, towrite, NULL, &written, uio); #ifdef NTFS_DEBUG if (error) printf("ntfs_write: ntfs_writeattr failed: %d\n", error); #endif return (error); } int ntfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct ntnode *ip = VTONT(vp); mode_t mode = ap->a_mode; #ifdef QUOTA int error; #endif dprintf(("ntfs_access: %d\n",ip->i_number)); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. 
*/ if (mode & VWRITE) { switch ((int)vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if (error = getinoquota(ip)) return (error); #endif break; } } return (vaccess(vp->v_type, ip->i_mp->ntm_mode, ip->i_mp->ntm_uid, ip->i_mp->ntm_gid, ap->a_mode, ap->a_cred, NULL)); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ static int ntfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_open: %d\n",ip->i_number); #endif /* * Files marked append-only must be opened for appending. */ return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int ntfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_close: %d\n",ip->i_number); #endif return (0); } int ntfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_ncookies; u_int **cookies; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; int i, error = 0; u_int32_t faked = 0, num; int ncookies = 0; struct dirent cde; off_t off; dprintf(("ntfs_readdir %d off: %d resid: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid)); off = uio->uio_offset; /* Simulate . in every dir except ROOT */ if( ip->i_number != NTFS_ROOTINO ) { struct dirent dot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 1, "." 
}; if( uio->uio_offset < sizeof(struct dirent) ) { dot.d_fileno = ip->i_number; error = uiomove((char *)&dot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } } /* Simulate .. in every dir including ROOT */ if( uio->uio_offset < 2 * sizeof(struct dirent) ) { struct dirent dotdot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 2, ".." }; error = uiomove((char *)&dotdot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } faked = (ip->i_number == NTFS_ROOTINO) ? 1 : 2; num = uio->uio_offset / sizeof(struct dirent) - faked; while( uio->uio_resid >= sizeof(struct dirent) ) { struct attr_indexentry *iep; error = ntfs_ntreaddir(ntmp, fp, num, &iep); if(error) return (error); if( NULL == iep ) break; for(; !(iep->ie_flag & NTFS_IEFLAG_LAST) && (uio->uio_resid >= sizeof(struct dirent)); iep = NTFS_NEXTREC(iep, struct attr_indexentry *)) { if(!ntfs_isnamepermitted(ntmp,iep)) continue; for(i=0; iie_fnamelen; i++) { cde.d_name[i] = ntfs_u28(iep->ie_fname[i]); } cde.d_name[i] = '\0'; dprintf(("ntfs_readdir: elem: %d, fname:[%s] type: %d, flag: %d, ", num, cde.d_name, iep->ie_fnametype, iep->ie_flag)); cde.d_namlen = iep->ie_fnamelen; cde.d_fileno = iep->ie_number; cde.d_type = (iep->ie_fflag & NTFS_FFLAG_DIR) ? DT_DIR : DT_REG; cde.d_reclen = sizeof(struct dirent); dprintf(("%s\n", (cde.d_type == DT_DIR) ? 
"dir":"reg")); error = uiomove((char *)&cde, sizeof(struct dirent), uio); if(error) return (error); ncookies++; num++; } } dprintf(("ntfs_readdir: %d entries (%d bytes) read\n", ncookies,(u_int)(uio->uio_offset - off))); dprintf(("ntfs_readdir: off: %d resid: %d\n", (u_int32_t)uio->uio_offset,uio->uio_resid)); if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dp; #if defined(__FreeBSD__) u_long *cookies; u_long *cookiep; #else /* defined(__NetBSD__) */ off_t *cookies; off_t *cookiep; #endif ddprintf(("ntfs_readdir: %d cookies\n",ncookies)); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ntfs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) ((caddr_t)uio->uio_iov->iov_base - (uio->uio_offset - off)); #if defined(__FreeBSD__) MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); #else /* defined(__NetBSD__) */ MALLOC(cookies, off_t *, ncookies * sizeof(off_t), M_TEMP, M_WAITOK); #endif for (dp = dpStart, cookiep = cookies, i=0; i < ncookies; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen), i++) { off += dp->d_reclen; *cookiep++ = (u_int) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } /* if (ap->a_eofflag) *ap->a_eofflag = VTONT(ap->a_vp)->i_size <= uio->uio_offset; */ return (error); } int ntfs_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct ntnode *dip = VTONT(dvp); struct ntfsmount *ntmp = dip->i_mp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int error; int lockparent = cnp->cn_flags & LOCKPARENT; #if NTFS_DEBUG int wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); #endif dprintf(("ntfs_lookup: \"%.*s\" (%ld bytes) in %d, lp: %d, wp: %d \n", (int)cnp->cn_namelen, cnp->cn_nameptr, cnp->cn_namelen, dip->i_number, lockparent, wantparent)); error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_proc); 
if(error) return (error); if ((cnp->cn_flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); #ifdef __NetBSD__ /* * We now have a segment name to search for, and a directory * to search. * * Before tediously performing a linear scan of the directory, * check the name cache to see if the directory/name pair * we are looking for is known already. */ if ((error = cache_lookup(ap->a_dvp, ap->a_vpp, cnp)) >= 0) return (error); #endif if(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { dprintf(("ntfs_lookup: faking . directory in %d\n", dip->i_number)); VREF(dvp); *ap->a_vpp = dvp; error = 0; } else if (cnp->cn_flags & ISDOTDOT) { struct ntvattr *vap; dprintf(("ntfs_lookup: faking .. directory in %d\n", dip->i_number)); error = ntfs_ntvattrget(ntmp, dip, NTFS_A_NAME, NULL, 0, &vap); if(error) return (error); VOP__UNLOCK(dvp,0,cnp->cn_proc); cnp->cn_flags |= PDIRUNLOCK; dprintf(("ntfs_lookup: parentdir: %d\n", vap->va_a_name->n_pnumber)); error = VFS_VGET(ntmp->ntm_mountp, vap->va_a_name->n_pnumber,ap->a_vpp); ntfs_ntvattrrele(vap); if (error) { if (VN_LOCK(dvp,LK_EXCLUSIVE|LK_RETRY,cnp->cn_proc)==0) cnp->cn_flags &= ~PDIRUNLOCK; return (error); } if (lockparent && (cnp->cn_flags & ISLASTCN)) { error = VN_LOCK(dvp, LK_EXCLUSIVE, cnp->cn_proc); if (error) { vput( *(ap->a_vpp) ); return (error); } cnp->cn_flags &= ~PDIRUNLOCK; } } else { error = ntfs_ntlookupfile(ntmp, dvp, cnp, ap->a_vpp); if (error) { dprintf(("ntfs_ntlookupfile: returned %d\n", error)); return (error); } dprintf(("ntfs_lookup: found ino: %d\n", VTONT(*ap->a_vpp)->i_number)); if(!lockparent || !(cnp->cn_flags & ISLASTCN)) VOP__UNLOCK(dvp, 0, cnp->cn_proc); } if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *ap->a_vpp, cnp); return (error); } #if defined(__FreeBSD__) /* * Flush the blocks of a file to disk. * * This function is worthless for vnodes that represent directories. 
Maybe we * could just do a sync if they try an fsync on a directory file. */ static int ntfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { return (0); } #endif /* * Return POSIX pathconf information applicable to NTFS filesystem */ int ntfs_pathconf(v) void *v; { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = NTFS_MAXFILENAME; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); #if defined(__NetBSD__) case _PC_SYNC_IO: *ap->a_retval = 1; return (0); case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); #endif default: return (EINVAL); } /* NOTREACHED */ } /* * Global vfs data structures */ vop_t **ntfs_vnodeop_p; #if defined(__FreeBSD__) static struct vnodeopv_entry_desc ntfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *)vop_defaultop }, { &vop_getattr_desc, (vop_t *)ntfs_getattr }, { &vop_inactive_desc, (vop_t *)ntfs_inactive }, { &vop_reclaim_desc, (vop_t *)ntfs_reclaim }, { &vop_print_desc, (vop_t *)ntfs_print }, { &vop_pathconf_desc, ntfs_pathconf }, { &vop_islocked_desc, (vop_t *)vop_stdislocked }, { &vop_unlock_desc, (vop_t *)vop_stdunlock }, { &vop_lock_desc, (vop_t *)vop_stdlock }, { &vop_cachedlookup_desc, (vop_t *)ntfs_lookup }, { &vop_lookup_desc, (vop_t *)vfs_cache_lookup }, { &vop_access_desc, (vop_t *)ntfs_access }, { &vop_close_desc, (vop_t *)ntfs_close }, { &vop_open_desc, (vop_t *)ntfs_open }, { &vop_readdir_desc, (vop_t *)ntfs_readdir }, { &vop_fsync_desc, (vop_t *)ntfs_fsync }, { &vop_bmap_desc, (vop_t *)ntfs_bmap }, { &vop_getpages_desc, (vop_t *) ntfs_getpages }, { &vop_putpages_desc, (vop_t *) ntfs_putpages }, { &vop_strategy_desc, (vop_t *)ntfs_strategy }, - { &vop_bwrite_desc, 
(vop_t *)vop_stdbwrite }, { &vop_read_desc, (vop_t *)ntfs_read }, { &vop_write_desc, (vop_t *)ntfs_write }, { NULL, NULL } }; static struct vnodeopv_desc ntfs_vnodeop_opv_desc = { &ntfs_vnodeop_p, ntfs_vnodeop_entries }; VNODEOP_SET(ntfs_vnodeop_opv_desc); #else /* !FreeBSD */ struct vnodeopv_entry_desc ntfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) ntfs_bypass }, { &vop_lookup_desc, (vop_t *) ntfs_lookup }, /* lookup */ { &vop_create_desc, genfs_eopnotsupp }, /* create */ { &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */ { &vop_open_desc, (vop_t *) ntfs_open }, /* open */ { &vop_close_desc,(vop_t *) ntfs_close }, /* close */ { &vop_access_desc, (vop_t *) ntfs_access }, /* access */ { &vop_getattr_desc, (vop_t *) ntfs_getattr }, /* getattr */ { &vop_setattr_desc, genfs_eopnotsupp }, /* setattr */ { &vop_read_desc, (vop_t *) ntfs_read }, /* read */ { &vop_write_desc, (vop_t *) ntfs_write }, /* write */ { &vop_lease_desc, genfs_lease_check }, /* lease */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */ { &vop_poll_desc, genfs_poll }, /* poll */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_fsync_desc, genfs_fsync }, /* fsync */ { &vop_seek_desc, genfs_seek }, /* seek */ { &vop_remove_desc, genfs_eopnotsupp }, /* remove */ { &vop_link_desc, genfs_eopnotsupp }, /* link */ { &vop_rename_desc, genfs_eopnotsupp }, /* rename */ { &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */ { &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */ { &vop_symlink_desc, genfs_eopnotsupp }, /* symlink */ { &vop_readdir_desc, (vop_t *) ntfs_readdir }, /* readdir */ { &vop_readlink_desc, genfs_eopnotsupp }, /* readlink */ { &vop_abortop_desc, genfs_abortop }, /* abortop */ { &vop_inactive_desc, (vop_t *) ntfs_inactive }, /* inactive */ { &vop_reclaim_desc, (vop_t *) ntfs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, (vop_t *) ntfs_bmap }, /* 
bmap */ { &vop_strategy_desc, (vop_t *) ntfs_strategy }, /* strategy */ { &vop_print_desc, (vop_t *) ntfs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, ntfs_pathconf }, /* pathconf */ { &vop_advlock_desc, genfs_nullop }, /* advlock */ { &vop_blkatoff_desc, genfs_eopnotsupp }, /* blkatoff */ { &vop_valloc_desc, genfs_eopnotsupp }, /* valloc */ { &vop_reallocblks_desc, genfs_eopnotsupp }, /* reallocblks */ { &vop_vfree_desc, genfs_eopnotsupp }, /* vfree */ { &vop_truncate_desc, genfs_eopnotsupp }, /* truncate */ { &vop_update_desc, genfs_eopnotsupp }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { (struct vnodeop_desc *)NULL, (int (*) __P((void *)))NULL } }; struct vnodeopv_desc ntfs_vnodeop_opv_desc = { &ntfs_vnodeop_p, ntfs_vnodeop_entries }; #endif Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 75579) +++ head/sys/kern/vfs_bio.c (revision 75580) @@ -1,3245 +1,3252 @@ /* * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * * $FreeBSD$ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ +struct buf_ops buf_ops_bio = { + "buf_ops_bio", + bwrite +}; + struct buf *buf; /* buffer header pool */ struct swqueue bswlist; struct mtx buftimelock; /* Interlock on setting prio and timo */ static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static void vfs_backgroundwritedone(struct buf *bp); static int flushbufqueues(void); static int bd_request; static void buf_daemon __P((void)); /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. 
*/ vm_page_t bogus_page; int vmiodirenable = FALSE; int runningbufspace; static vm_offset_t bogus_offset; static int bufspace, maxbufspace, bufmallocspace, maxbufmallocspace, lobufspace, hibufspace; static int bufreusecnt, bufdefragcnt, buffreekvacnt; static int needsbuffer; static int lorunningspace, hirunningspace, runningbufreq; static int numdirtybuffers, lodirtybuffers, hidirtybuffers; static int numfreebuffers, lofreebuffers, hifreebuffers; static int getnewbufcalls; static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, ""); 
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, ""); static int bufhashmask; static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } }; char *buf_wmesg = BUF_WMESG; extern int vm_swap_size; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ /* * Buffer hash table code. Note that the logical block scans linearly, which * gives us some L1 cache locality. */ static __inline struct bufhashhdr * bufhash(struct vnode *vnp, daddr_t bn) { return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]); } /* * numdirtywakeup: * * If someone is blocked due to there being too many dirty buffers, * and numdirtybuffers is now reasonable, wake them up. */ static __inline void numdirtywakeup(int level) { if (numdirtybuffers <= level) { if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; wakeup(&needsbuffer); } } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } } /* * runningbufwakeup() - in-progress I/O accounting. 
* */ static __inline void runningbufwakeup(struct buf *bp) { if (bp->b_runningbufspace) { runningbufspace -= bp->b_runningbufspace; bp->b_runningbufspace = 0; if (runningbufreq && runningbufspace <= lorunningspace) { runningbufreq = 0; wakeup(&runningbufreq); } } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { ++numfreebuffers; if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * Reads will adjust runningbufspace, but will not block based on it. * The read load has a side effect of reducing the allowed write load. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ static __inline void waitrunningbufspace(void) { while (runningbufspace > hirunningspace) { ++runningbufreq; tsleep(&runningbufreq, PVM, "wdrain", 0); } } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. 
*/ static __inline__ void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } static __inline__ void bd_wakeup(int dirtybuflevel) { if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { bd_request = 1; wakeup(&bd_request); } } /* * bd_speedup - speedup the buffer cache flushing code */ static __inline__ void bd_speedup(void) { bd_wakeup(1); } /* * Initialize buffer headers and related structures. */ caddr_t bufhashinit(caddr_t vaddr) { /* first, make a null hash table */ for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1) ; bufhashtbl = (void *)vaddr; vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask; --bufhashmask; return(vaddr); } void bufinit(void) { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); mtx_init(&buftimelock, "buftime lock", MTX_DEF); for (i = 0; i <= bufhashmask; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by buf_daemon. hibufspace is the nominal maximum * used by most other processes. The differential is required to * ensure that buf_daemon is able to run when other processes might * be blocked waiting for buffer space. 
* * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. */ maxbufspace = nbuf * BKVASIZE; hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); lobufspace = hibufspace - MAXBSIZE; lorunningspace = 512 * 1024; hirunningspace = 1024 * 1024; /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers cannot * eat up all available buffer space. This occurs when our minimum cannot * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming * BKVASIZE'd (8K) buffers. */ while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, * and give special processes (e.g. like buf_daemon) access to an * emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; /* * Maximum number of async ops initiated per buf_daemon loop. This is * somewhat of a hack at the moment, we really need to limit ourselves * based on the number of bytes of I/O in-transit that were initiated * from buf_daemon. */ bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); cnt.v_wire_count++; } /* * bfreekva() - free the kva allocation for a buffer. 
* * Must be called at splbio() or higher as this is the only locking for * buffer_map. * * Since this call frees up buffer space, we call bufspacewakeup(). */ static void bfreekva(struct buf * bp) { if (bp->b_kvasize) { ++buffreekvacnt; bufspace -= bp->b_kvasize; vm_map_delete(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize ); bp->b_kvasize = 0; bufspacewakeup(); } } /* * bremfree: * * Remove the buffer from the appropriate free list. */ void bremfree(struct buf * bp) { int s = splbio(); int old_qindex = bp->b_qindex; if (bp->b_qindex != QUEUE_NONE) { KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp)); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { if (BUF_REFCNT(bp) <= 1) panic("bremfree: removing a buffer not on a queue"); } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, and it was on the EMPTY, LRU, or AGE queues, * the buffer was free and we must decrement numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { switch(old_qindex) { case QUEUE_DIRTY: case QUEUE_CLEAN: case QUEUE_EMPTY: case QUEUE_EMPTYKVA: --numfreebuffers; break; default: break; } } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). 
*/
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
	struct buf *bp;

	/* The buffer is handed back through *bpp whether or not I/O is needed. */
	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		/* Count the block input against curproc unless it is the idle process. */
		if (curproc != PCPU_GET(idleproc))
			curproc->p_stats->p_ru.ru_inblock++;
		KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		/* Attach the read credential (with a reference) if none is set yet. */
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);
		/* Synchronous read: the return value is whatever bufwait() reports. */
		return (bufwait(bp));
	}
	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.  We must clear BIO_ERROR and B_INVAL prior
 * to initiating I/O . If B_CACHE is set, the buffer is valid
 * and we do not have to do anything.
 *
 * rablkno and rabsize are walked in parallel for cnt entries; each pair
 * gives the block number and size of one read-ahead candidate.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != PCPU_GET(idleproc))
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);
		/* Remember that the primary read was started so we wait for it below. */
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		/* Skip read-ahead blocks that inmem() reports as already available. */
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc != PCPU_GET(idleproc))
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_ASYNC;
			rabp->b_flags &= ~B_INVAL;
			rabp->b_ioflags &= ~BIO_ERROR;
			rabp->b_iocmd = BIO_READ;
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			vfs_busy_pages(rabp, 0);
			BUF_KERNPROC(rabp);
			VOP_STRATEGY(vp, rabp);
		} else {
			/* Already cached: no read-ahead I/O needed, release it. */
			brelse(rabp);
		}
	}

	/* Only the primary buffer is waited on; read-ahead buffers are async. */
	if (readwait) {
		rv = bufwait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async).  Do not bother writing anything if the buffer
 * is invalid.
 *
 * Note that we set B_CACHE here, indicating that buffer is
 * fully valid and thus cacheable.  This is true even of NFS
 * now so we set it generally.  This could be set either here
 * or in biodone() since the I/O is synchronous.  We put it
 * here.
 */
/* Tunable (debug.dobkgrdwrite): non-zero allows bwrite() to do background writes. */
int dobkgrdwrite = 1;
SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "");

int
bwrite(struct buf * bp)
{
	int oldflags, s;
	struct buf *newbp;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	/*
	 * Snapshot the flags: bp may be replaced by a copy below, and the
	 * sync/async decision at the end is based on the caller's flags.
	 */
	oldflags = bp->b_flags;

	if (BUF_REFCNT(bp) == 0)
		panic("bwrite: buffer is not busy???");
	s = splbio();
	/*
	 * If a background write is already in progress, delay
	 * writing this block if it is asynchronous. Otherwise
	 * wait for the background write to complete.
	 */
	if (bp->b_xflags & BX_BKGRDINPROG) {
		if (bp->b_flags & B_ASYNC) {
			splx(s);
			bdwrite(bp);
			return (0);
		}
		bp->b_xflags |= BX_BKGRDWAIT;
		tsleep(&bp->b_xflags, PRIBIO, "biord", 0);
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("bwrite: still writing");
	}

	/* Mark the buffer clean */
	bundirty(bp);

	/*
	 * If this buffer is marked for background writing and we
	 * do not have to wait for it, make a copy and write the
	 * copy so as to leave this buffer ready for further use.
	 *
	 * This optimization eats a lot of memory.  If we have a page
	 * or buffer shortfall we can't do it.
	 */
	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
	    (bp->b_flags & B_ASYNC) &&
	    !vm_page_count_severe() &&
	    !buf_dirty_count_severe()) {
		if (bp->b_iodone != NULL) {
			printf("bp->b_iodone = %p\n", bp->b_iodone);
			panic("bwrite: need chained iodone");
		}

		/* get a new block */
		newbp = geteblk(bp->b_bufsize);

		/* set it to be identical to the old block */
		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
		bgetvp(bp->b_vp, newbp);
		newbp->b_lblkno = bp->b_lblkno;
		newbp->b_blkno = bp->b_blkno;
		newbp->b_offset = bp->b_offset;
		newbp->b_iodone = vfs_backgroundwritedone;
		newbp->b_flags |= B_ASYNC;
		newbp->b_flags &= ~B_INVAL;

		/* move over the dependencies */
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_movedeps(bp, newbp);

		/*
		 * Initiate write on the copy, release the original to
		 * the B_LOCKED queue so that it cannot go away until
		 * the background write completes. If not locked it could go
		 * away and then be reconstituted while it was being written.
		 * If the reconstituted buffer were written, we could end up
		 * with two background copies being written at the same time.
		 */
		bp->b_xflags |= BX_BKGRDINPROG;
		bp->b_flags |= B_LOCKED;
		bqrelse(bp);
		/* From here on bp refers to the copy being written. */
		bp = newbp;
	}

	bp->b_flags &= ~B_DONE;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_flags |= B_WRITEINPROG | B_CACHE;
	bp->b_iocmd = BIO_WRITE;

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);

	/*
	 * Normal bwrites pipeline writes
	 */
	bp->b_runningbufspace = bp->b_bufsize;
	runningbufspace += bp->b_runningbufspace;

	if (curproc != PCPU_GET(idleproc))
		curproc->p_stats->p_ru.ru_oublock++;
	splx(s);
	if (oldflags & B_ASYNC)
		BUF_KERNPROC(bp);
	BUF_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		/* Synchronous write: wait, release the buffer, report the result. */
		int rtval = bufwait(bp);
		brelse(bp);
		return (rtval);
	} else {
		/*
		 * don't allow the async write to saturate the I/O
		 * system.  There is no chance of deadlock here because
		 * we are blocking on I/O that is already in-progress.
		 */
		waitrunningbufspace();
	}

	return (0);
}

/*
 * Complete a background write started from bwrite.
*/
static void
vfs_backgroundwritedone(bp)
	struct buf *bp;
{
	struct buf *origbp;

	/*
	 * Find the original buffer that we are writing.
	 * bp is the copy; the original is looked up by (vnode, logical block).
	 */
	if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
		panic("backgroundwritedone: lost buffer");
	/*
	 * Process dependencies then return any unfinished ones.
	 */
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_complete(bp);
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_movedeps(bp, origbp);
	/*
	 * Clear the BX_BKGRDINPROG flag in the original buffer
	 * and awaken it if it is waiting for the write to complete.
	 * If BX_BKGRDINPROG is not set in the original buffer it must
	 * have been released and re-instantiated - which is not legal.
	 */
	KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2"));
	origbp->b_xflags &= ~BX_BKGRDINPROG;
	if (origbp->b_xflags & BX_BKGRDWAIT) {
		origbp->b_xflags &= ~BX_BKGRDWAIT;
		/* Wake the sleeper in bwrite(), which sleeps on &b_xflags. */
		wakeup(&origbp->b_xflags);
	}
	/*
	 * Clear the B_LOCKED flag and remove it from the locked
	 * queue if it currently resides there.
	 */
	origbp->b_flags &= ~B_LOCKED;
	/* A non-blocking lock attempt; on failure the original is left alone. */
	if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
		bremfree(origbp);
		bqrelse(origbp);
	}
	/*
	 * This buffer is marked B_NOCACHE, so when it is released
	 * by biodone, it will be tossed. We mark it with BIO_READ
	 * to avoid biodone doing a second vwakeup.
	 */
	bp->b_flags |= B_NOCACHE;
	bp->b_iocmd = BIO_READ;
	bp->b_flags &= ~(B_CACHE | B_DONE);
	/* Clear the completion hook before handing the copy to bufdone(). */
	bp->b_iodone = 0;
	bufdone(bp);
}

/*
 * Delayed write. (Buffer is marked dirty).  Do not bother writing
 * anything if the buffer is marked invalid.
 *
 * Note that since the buffer must be completely valid, we can safely
 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 * biodone() in order to prevent getblk from writing the buffer
 * out synchronously.
 */
void
bdwrite(struct buf * bp)
{
	if (BUF_REFCNT(bp) == 0)
		panic("bdwrite: buffer is not busy");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	bdirty(bp);

	/*
	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
	 * true even of NFS now.
	 */
	bp->b_flags |= B_CACHE;

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other datastructure
	 * that the filesystem needs is still in memory now, it is a good
	 * thing to do this.  Note also, that if the pageout daemon is
	 * requesting a sync -- there might not be enough memory to do
	 * the bmap then...  So, this is important to do.
	 */
	/* b_blkno still equal to b_lblkno is treated as "not yet mapped". */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that it thinks that the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);

	/*
	 * Wakeup the buffer flushing daemon if we have a lot of dirty
	 * buffers (midpoint between our recovery point and our stall
	 * point).
	 */
	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);

	/*
	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
	 * due to the softdep code.
	 */
}

/*
 *	bdirty:
 *
 *	Turn buffer into delayed write request.  We must clear BIO_READ and
 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
 *	itself to properly update it in the dirty/clean lists.  We mark it
 *	B_DONE to ensure that any asynchronization of the buffer properly
 *	clears B_DONE ( else a panic will occur later ).
 *
 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
 *	should only be called if the buffer is known-good.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	Must be called at splbio().
 *	The buffer must be on QUEUE_NONE.
*/ void bdirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); ++numdirtybuffers; bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bundirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp, bp->b_vp); --numdirtybuffers; numdirtywakeup(lodirtybuffers); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) BUF_WRITE(bp); } /* * bowrite: * * Ordered write. Start output on a buffer, and flag it so that the * device will write it in the order it was queued. The buffer is * released when the output completes. bwrite() ( or the VOP routine * anyway ) is responsible for handling B_INVAL buffers. */ int bowrite(struct buf * bp) { bp->b_ioflags |= BIO_ORDERED; bp->b_flags |= B_ASYNC; return (BUF_WRITE(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. 
By blocking prior to the locking
 *	of any vnodes we attempt to avoid the situation where a locked vnode
 *	prevents the various system daemons from flushing related buffers.
 */

void
bwillwrite(void)
{
	/* Unlocked fast-path check; re-checked at splbio() before sleeping. */
	if (numdirtybuffers >= hidirtybuffers) {
		int s;

		s = splbio();
		while (numdirtybuffers >= hidirtybuffers) {
			bd_wakeup(1);
			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
		}
		splx(s);
	}
}

/*
 * Return true if we have too many dirty buffers.
 * "Too many" means numdirtybuffers has reached hidirtybuffers.
 */
int
buf_dirty_count_severe(void)
{
	return(numdirtybuffers >= hidirtybuffers);
}

/*
 *	brelse:
 *
 *	Release a busy buffer and, if requested, free its resources.  The
 *	buffer will be stashed in the appropriate bufqueue[] allowing it
 *	to be accessed later as a cache entity or reused for other purposes.
 */
void
brelse(struct buf * bp)
{
	int s;

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	s = splbio();

	/* Errors on B_LOCKED buffers are discarded up front. */
	if (bp->b_flags & B_LOCKED)
		bp->b_ioflags &= ~BIO_ERROR;

	if (bp->b_iocmd == BIO_WRITE &&
	    (bp->b_ioflags & BIO_ERROR) &&
	    !(bp->b_flags & B_INVAL)) {
		/*
		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
		 * pages from being scrapped.  If B_INVAL is set then
		 * this case is not run and the next case is run to
		 * destroy the buffer.  B_INVAL can occur if the buffer
		 * is outside the range supported by the underlying device.
		 */
		bp->b_ioflags &= ~BIO_ERROR;
		bdirty(bp);
	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
	    (bp->b_ioflags & BIO_ERROR) ||
	    bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) {
		/*
		 * Either a failed I/O or we were asked to free or not
		 * cache the buffer.
		 */
		bp->b_flags |= B_INVAL;
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_deallocate(bp);
		if (bp->b_flags & B_DELWRI) {
			--numdirtybuffers;
			numdirtywakeup(lodirtybuffers);
		}
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		/* Non-VMIO buffers give up their storage and vnode here. */
		if ((bp->b_flags & B_VMIO) == 0) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			if (bp->b_vp)
				brelvp(bp);
		}
	}

	/*
	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
	 * is called with B_DELWRI set, the underlying pages may wind up
	 * getting freed causing a previous write (bdwrite()) to get 'lost'
	 * because pages associated with a B_DELWRI bp are marked clean.
	 *
	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
	 * if B_DELWRI is set.
	 *
	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
	 * on pages to return pages to the VM page queues.
	 */
	if (bp->b_flags & B_DELWRI)
		bp->b_flags &= ~B_RELBUF;
	else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
		bp->b_flags |= B_RELBUF;

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
	 * constituted, not even NFS buffers now.  Two flags affect this.  If
	 * B_INVAL, the struct buf is invalidated but the VM object is kept
	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
	 *
	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
	 * buffer is also B_INVAL because it hits the re-dirtying code above.
	 *
	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
	 * the commit state and we cannot afford to lose the buffer. If the
	 * buffer has a background write in progress, we need to keep it
	 * around to prevent it from being reconstituted and starting a second
	 * background write.
	 */
	if ((bp->b_flags & B_VMIO)
	    && !(bp->b_vp->v_tag == VT_NFS &&
		 !vn_isdisk(bp->b_vp, NULL) &&
		 (bp->b_flags & B_DELWRI))
	    ) {

		int i, j, resid;
		vm_page_t m;
		off_t foff;
		vm_pindex_t poff;
		vm_object_t obj;
		struct vnode *vp;

		vp = bp->b_vp;

		/*
		 * Get the base offset and length of the buffer.  Note that
		 * for block sizes that are less than PAGE_SIZE, the b_data
		 * base of the buffer does not represent exactly b_offset and
		 * neither b_offset nor b_size are necessarily page aligned.
		 * Instead, the starting position of b_offset is:
		 *
		 *	b_data + (b_offset & PAGE_MASK)
		 *
		 * block sizes less than DEV_BSIZE (usually 512) are not
		 * supported due to the page granularity bits (m->valid,
		 * m->dirty, etc...).
		 *
		 * See man buf(9) for more information
		 */
		resid = bp->b_bufsize;
		foff = bp->b_offset;

		/* Walk the pages; resid/foff track the bytes still to cover. */
		for (i = 0; i < bp->b_npages; i++) {
			int had_bogus = 0;

			m = bp->b_pages[i];
			vm_page_flag_clear(m, PG_ZERO);

			/*
			 * If we hit a bogus page, fixup *all* the bogus pages
			 * now.
			 */
			if (m == bogus_page) {
				VOP_GETVOBJECT(vp, &obj);
				poff = OFF_TO_IDX(bp->b_offset);
				had_bogus = 1;

				for (j = i; j < bp->b_npages; j++) {
					vm_page_t mtmp;
					mtmp = bp->b_pages[j];
					if (mtmp == bogus_page) {
						mtmp = vm_page_lookup(obj, poff + j);
						if (!mtmp) {
							panic("brelse: page missing\n");
						}
						bp->b_pages[j] = mtmp;
					}
				}

				/* Re-enter the corrected page list into the buffer mapping. */
				if ((bp->b_flags & B_INVAL) == 0) {
					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
				}
				m = bp->b_pages[i];
			}
			if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) {
				/* Invalidate only the part of this page the buffer covers. */
				int poffset = foff & PAGE_MASK;
				int presid = resid > (PAGE_SIZE - poffset) ?
					(PAGE_SIZE - poffset) : resid;

				KASSERT(presid >= 0, ("brelse: extra page"));
				vm_page_set_invalid(m, poffset, presid);
				if (had_bogus)
					printf("avoided corruption bug in bogus_page/brelse code\n");
			}
			resid -= PAGE_SIZE - (foff & PAGE_MASK);
			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
		}

		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);

	} else if (bp->b_flags & B_VMIO) {

		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);

	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");
	if (BUF_REFCNT(bp) > 1) {
		/* do not release to free list */
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}

	/* enqueue */

	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_flags |= B_INVAL;
		bp->b_xflags &= ~BX_BKGRDWRITE;
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("losing buffer 1");
		/* Keep buffers that still own KVA apart from those that do not. */
		if (bp->b_kvasize) {
			bp->b_qindex = QUEUE_EMPTYKVA;
		} else {
			bp->b_qindex = QUEUE_EMPTY;
		}
		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) {
		bp->b_flags |= B_INVAL;
		bp->b_xflags &= ~BX_BKGRDWRITE;
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("losing buffer 2");
		bp->b_qindex = QUEUE_CLEAN;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;

	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);

	/* remaining buffers */
	} else {
		/* B_DELWRI selects the queue; B_AGE selects head instead of tail. */
		switch(bp->b_flags & (B_DELWRI|B_AGE)) {
		case B_DELWRI | B_AGE:
		    bp->b_qindex = QUEUE_DIRTY;
		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
		    break;
		case B_DELWRI:
		    bp->b_qindex = QUEUE_DIRTY;
		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
		    break;
		case B_AGE:
		    bp->b_qindex = QUEUE_CLEAN;
		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
		    break;
		default:
		    bp->b_qindex = QUEUE_CLEAN;
		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
		    break;
		}
	}

	/*
	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
	 * on the correct queue.
	 */
	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
		bp->b_flags &= ~B_DELWRI;
		--numdirtybuffers;
		numdirtywakeup(lodirtybuffers);
	}

	/*
	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
	 * if B_INVAL is set ).
	 */

	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
		bufcountwakeup();

	/*
	 * Something we can maybe free or reuse
	 */
	if (bp->b_bufsize || bp->b_kvasize)
		bufspacewakeup();

	/* unlock */
	BUF_UNLOCK(bp);
	/* Per-release flags do not survive onto the queue. */
	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	bp->b_ioflags &= ~BIO_ORDERED;
	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
		panic("brelse: not dirty");
	splx(s);
}

/*
 * Release a buffer back to the appropriate queue but do not try to free
 * it.  The buffer is expected to be used again soon.
 *
 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 * biodone() to requeue an async I/O on completion.  It is also used when
 * known good buffers need to be requeued but we think we may need the data
 * again soon.
*/
void
bqrelse(struct buf * bp)
{
	int s;

	s = splbio();

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	if (bp->b_qindex != QUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");
	if (BUF_REFCNT(bp) > 1) {
		/* do not release to free list */
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}
	if (bp->b_flags & B_LOCKED) {
		/* Locked buffers drop any pending error and go to QUEUE_LOCKED. */
		bp->b_ioflags &= ~BIO_ERROR;
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
		/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_DELWRI) {
		bp->b_qindex = QUEUE_DIRTY;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
	} else if (vm_page_count_severe()) {
		/*
		 * We are too low on memory, we have to try to free the
		 * buffer (most importantly: the wired pages making up its
		 * backing store) *now*.
		 */
		splx(s);
		brelse(bp);
		return;
	} else {
		bp->b_qindex = QUEUE_CLEAN;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
	}

	/* Wake buffer-count waiters for unlocked buffers that are invalid or clean. */
	if ((bp->b_flags & B_LOCKED) == 0 &&
	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
		bufcountwakeup();
	}

	/*
	 * Something we can maybe free or reuse.
	 */
	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
		bufspacewakeup();

	/* unlock */
	BUF_UNLOCK(bp);
	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	bp->b_ioflags &= ~BIO_ORDERED;
	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
		panic("bqrelse: not dirty");
	splx(s);
}

/*
 * vfs_vmio_release:
 *
 *	Detach a VMIO buffer from its pages: each page is unwired and, where
 *	possible, freed or cached; the buffer mapping is removed, b_bufsize
 *	and b_npages are zeroed, B_VMIO is cleared and the vnode association
 *	is dropped.
 */
static void
vfs_vmio_release(bp)
	struct buf *bp;
{
	int i, s;
	vm_page_t m;

	s = splvm();
	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		bp->b_pages[i] = NULL;
		/*
		 * In order to keep page LRU ordering consistent, put
		 * everything on the inactive queue.
		 */
		vm_page_unwire(m, 0);
		/*
		 * We don't mess with busy pages, it is
		 * the responsibility of the process that
		 * busied the pages to deal with them.
		 */
		if ((m->flags & PG_BUSY) || (m->busy != 0))
			continue;

		if (m->wire_count == 0) {
			vm_page_flag_clear(m, PG_ZERO);
			/*
			 * Might as well free the page if we can and it has
			 * no valid data.
			 */
			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
				vm_page_busy(m);
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_free(m);
			} else if (vm_page_count_severe()) {
				vm_page_try_to_cache(m);
			}
		}
	}
	splx(s);
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	if (bp->b_bufsize) {
		bufspacewakeup();
		bp->b_bufsize = 0;
	}
	bp->b_npages = 0;
	bp->b_flags &= ~B_VMIO;
	if (bp->b_vp)
		brelvp(bp);
}

/*
 * Check to see if a block is currently memory resident.
 *
 * Returns the first valid (non-B_INVAL) buffer on the hash chain that
 * matches (vp, blkno), or NULL when the chain is exhausted.  No spl is
 * taken here; incore() is the wrapper that raises splbio() around it.
 */
struct buf *
gbincore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	bh = bufhash(vp, blkno);

	/* Search hash chain */
	LIST_FOREACH(bp, bh, b_hash) {
		/* hit */
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & B_INVAL) == 0) {
			break;
		}
	}
	return (bp);
}

/*
 *	vfs_bio_awrite:
 *
 *	Implement clustered async writes for clearing out B_DELWRI buffers.
 *	This is much better than the old way of writing only one buffer at
 *	a time.  Note that we may not be presented with the buffers in the
 *	correct order, so we search for the cluster in both directions.
 *
 *	Returns the value of cluster_wbuild() when a cluster is written,
 *	otherwise b_bufsize of the single buffer written.
 */
int
vfs_bio_awrite(struct buf * bp)
{
	int i;
	int j;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int s;
	int ncl;
	struct buf *bpa;
	int nwritten;
	int size;
	int maxcl;

	s = splbio();
	/*
	 * right now we support clustered writing only to regular files.  If
	 * we find a clusterable block we could be in the middle of a cluster
	 * rather than at the beginning.
	 */
	if ((vp->v_type == VREG) &&
	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {

		size = vp->v_mount->mnt_stat.f_iosize;
		maxcl = MAXPHYS / size;

		/* Scan forward for unlocked, clusterable, physically contiguous buffers. */
		for (i = 1; i < maxcl; i++) {
			if ((bpa = gbincore(vp, lblkno + i)) &&
			    BUF_REFCNT(bpa) == 0 &&
			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno !=
				     bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
					break;
			} else {
				break;
			}
		}
		/* Scan backward the same way, bounded by maxcl and by block zero. */
		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
			if ((bpa = gbincore(vp, lblkno - j)) &&
			    BUF_REFCNT(bpa) == 0 &&
			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno !=
				     bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
					break;
			} else {
				break;
			}
		}
		/* j stopped one past the last usable backward block. */
		--j;
		ncl = i + j;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
			splx(s);
			return nwritten;
		}
	}

	BUF_LOCK(bp, LK_EXCLUSIVE);
	bremfree(bp);
	bp->b_flags |= B_ASYNC;

	splx(s);
	/*
	 * default (old) behavior, writing out only one block
	 *
	 * XXX returns b_bufsize instead of b_bcount for nwritten?
	 */
	nwritten = bp->b_bufsize;
	(void) BUF_WRITE(bp);

	return nwritten;
}

/*
 *	getnewbuf:
 *
 *	Find and initialize a new buffer header, freeing up existing buffers
 *	in the bufqueues as necessary.  The new buffer is returned locked.
 *
 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
 *	buffer away, the caller must set B_INVAL prior to calling brelse().
 *
 *	We block if:
 *		We have insufficient buffer headers
 *		We have insufficient buffer space
 *		buffer_map is too fragmented ( space reservation fails )
 *		If we have to flush dirty buffers ( but we try to avoid this )
 *
 *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 *	Instead we ask the buf daemon to do it for us.
We attempt to
 *	avoid piecemeal wakeups of the pageout daemon.
 */

static struct buf *
getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
	struct buf *bp;
	struct buf *nbp;
	int defrag = 0;
	int nqindex;
	/* Persists across calls: set once bufspace reaches hibufspace (see below). */
	static int flushingbufs;

	/*
	 * We can't afford to block since we might be holding a vnode lock,
	 * which may prevent system daemons from running.  We deal with
	 * low-memory situations by proactively returning memory and running
	 * async I/O rather than sync I/O.
	 */

	++getnewbufcalls;
	/* Pre-decrement so the first pass through restart: nets to zero. */
	--getnewbufrestarts;
restart:
	++getnewbufrestarts;

	/*
	 * Setup for scan.  If we do not have enough free buffers,
	 * we setup a degenerate case that immediately fails.  Note
	 * that if we are specially marked process, we are allowed to
	 * dip into our reserves.
	 *
	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
	 *
	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
	 * However, there are a number of cases (defragging, reusing, ...)
	 * where we cannot backup.
	 */
	nqindex = QUEUE_EMPTYKVA;
	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);

	if (nbp == NULL) {
		/*
		 * If no EMPTYKVA buffers and we are either
		 * defragging or reusing, locate a CLEAN buffer
		 * to free or reuse.  If bufspace usage is low
		 * skip this step so we can allocate a new buffer.
		 */
		if (defrag || bufspace >= lobufspace) {
			nqindex = QUEUE_CLEAN;
			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
		}

		/*
		 * If we could not find or were not allowed to reuse a
		 * CLEAN buffer, check to see if it is ok to use an EMPTY
		 * buffer.  We can only use an EMPTY buffer if allocating
		 * its KVA would not otherwise run us out of buffer space.
		 */
		if (nbp == NULL && defrag == 0 &&
		    bufspace + maxsize < hibufspace) {
			nqindex = QUEUE_EMPTY;
			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
		}
	}

	/*
	 * Run scan, possibly freeing data and/or kva mappings on the fly
	 * depending.
	 */

	while ((bp = nbp) != NULL) {
		int qindex = nqindex;

		/*
		 * Calculate next bp ( we can only use it if we do not block
		 * or do other fancy things ).
		 */
		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
			switch(qindex) {
			case QUEUE_EMPTY:
				nqindex = QUEUE_EMPTYKVA;
				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
					break;
				/* fall through */
			case QUEUE_EMPTYKVA:
				nqindex = QUEUE_CLEAN;
				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
					break;
				/* fall through */
			case QUEUE_CLEAN:
				/*
				 * nbp is NULL.
				 */
				break;
			}
		}

		/*
		 * Sanity Checks
		 */
		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));

		/*
		 * Note: we no longer distinguish between VMIO and non-VMIO
		 * buffers.
		 */

		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));

		/*
		 * If we are defragging then we need a buffer with
		 * b_kvasize != 0.  XXX this situation should no longer
		 * occur, if defrag is non-zero the buffer's b_kvasize
		 * should also be non-zero at this point.  XXX
		 */
		if (defrag && bp->b_kvasize == 0) {
			printf("Warning: defrag empty buffer %p\n", bp);
			continue;
		}

		/*
		 * Start freeing the bp.  This is somewhat involved.  nbp
		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
		 */

		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
			panic("getnewbuf: locked buf");
		bremfree(bp);

		if (qindex == QUEUE_CLEAN) {
			if (bp->b_flags & B_VMIO) {
				bp->b_flags &= ~B_ASYNC;
				vfs_vmio_release(bp);
			}
			if (bp->b_vp)
				brelvp(bp);
		}

		/*
		 * NOTE:  nbp is now entirely invalid.  We can only restart
		 * the scan from this point on.
		 *
		 * Get the rest of the buffer freed up.  b_kva* is still
		 * valid after this operation.
		 */

		if (bp->b_rcred != NOCRED) {
			crfree(bp->b_rcred);
			bp->b_rcred = NOCRED;
		}
		if (bp->b_wcred != NOCRED) {
			crfree(bp->b_wcred);
			bp->b_wcred = NOCRED;
		}
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_deallocate(bp);
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("losing buffer 3");
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);

		if (bp->b_bufsize)
			allocbuf(bp, 0);

		/* Reset the header to a pristine state before reuse. */
		bp->b_flags = 0;
		bp->b_ioflags = 0;
		bp->b_xflags = 0;
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_blkno = bp->b_lblkno = 0;
		bp->b_offset = NOOFFSET;
		bp->b_iodone = 0;
		bp->b_error = 0;
		bp->b_resid = 0;
		bp->b_bcount = 0;
		bp->b_npages = 0;
		bp->b_dirtyoff = bp->b_dirtyend = 0;
		/*
		 * NOTE(review): the leading '+' on the next two statements
		 * looks like unified-diff "added line" residue rather than
		 * intended C; it is left exactly as found -- confirm and
		 * strip when this text is applied as source.
		 */
+		bp->b_magic = B_MAGIC_BIO;
+		bp->b_op = &buf_ops_bio;

		LIST_INIT(&bp->b_dep);

		/*
		 * If we are defragging then free the buffer.
		 */
		if (defrag) {
			bp->b_flags |= B_INVAL;
			bfreekva(bp);
			brelse(bp);
			defrag = 0;
			goto restart;
		}

		/*
		 * If we are overcommitted then recover the buffer and its
		 * KVM space.  This occurs in rare situations when multiple
		 * processes are blocked in getnewbuf() or allocbuf().
		 */
		if (bufspace >= hibufspace)
			flushingbufs = 1;
		if (flushingbufs && bp->b_kvasize != 0) {
			bp->b_flags |= B_INVAL;
			bfreekva(bp);
			brelse(bp);
			goto restart;
		}
		if (bufspace < lobufspace)
			flushingbufs = 0;
		break;
	}

	/*
	 * If we exhausted our list, sleep as appropriate.  We may have to
	 * wakeup various daemons and write out some dirty buffers.
	 *
	 * Generally we are sleeping due to insufficient buffer space.
	 */

	if (bp == NULL) {
		int flags;
		char *waitmsg;

		if (defrag) {
			flags = VFS_BIO_NEED_BUFSPACE;
			waitmsg = "nbufkv";
		} else if (bufspace >= hibufspace) {
			waitmsg = "nbufbs";
			flags = VFS_BIO_NEED_BUFSPACE;
		} else {
			waitmsg = "newbuf";
			flags = VFS_BIO_NEED_ANY;
		}

		bd_speedup();	/* heeeelp */

		needsbuffer |= flags;
		/*
		 * Sleep until the requested need bits are cleared; a non-zero
		 * tsleep() result aborts with NULL.  After a successful wait
		 * bp is still NULL, so NULL is returned below as well.
		 */
		while (needsbuffer & flags) {
			if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
			    waitmsg, slptimeo))
				return (NULL);
		}
	} else {
		/*
		 * We finally have a valid bp.  We aren't quite out of the
		 * woods, we still have to reserve kva space.  In order
		 * to keep fragmentation sane we only allocate kva in
		 * BKVASIZE chunks.
		 */
		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;

		if (maxsize != bp->b_kvasize) {
			vm_offset_t addr = 0;

			bfreekva(bp);

			if (vm_map_findspace(buffer_map,
				vm_map_min(buffer_map), maxsize, &addr)) {
				/*
				 * Uh oh.  Buffer map is too fragmented.  We
				 * must defragment the map.
				 */
				++bufdefragcnt;
				defrag = 1;
				bp->b_flags |= B_INVAL;
				brelse(bp);
				goto restart;
			}
			if (addr) {
				vm_map_insert(buffer_map, NULL, 0,
					addr, addr + maxsize,
					VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);

				bp->b_kvabase = (caddr_t) addr;
				bp->b_kvasize = maxsize;
				bufspace += bp->b_kvasize;
				++bufreusecnt;
			}
		}
		bp->b_data = bp->b_kvabase;
	}
	return(bp);
}

/*
 *	buf_daemon:
 *
 *	buffer flushing daemon.  Buffers are normally flushed by the
 *	update daemon but if it cannot keep up this process starts to
 *	take the load in an attempt to prevent getnewbuf() from blocking.
 */

static struct proc *bufdaemonproc;

/* Kernel process descriptor: name, entry point, and where to store the proc. */
static struct kproc_desc buf_kp = {
	"bufdaemon",
	buf_daemon,
	&bufdaemonproc
};
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)

static void
buf_daemon()
{
	int s;

	mtx_lock(&Giant);

	/*
	 * This process needs to be suspended prior to shutdown sync.
	 */
	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
	    SHUTDOWN_PRI_LAST);

	/*
	 * This process is allowed to take the buffer cache to the limit
	 */
	curproc->p_flag |= P_BUFEXHAUST;
	/* splbio() is raised once here and held for the life of the loop. */
	s = splbio();

	for (;;) {
		kthread_suspend_check(bufdaemonproc);

		bd_request = 0;

		/*
		 * Do the flush.  Limit the amount of in-transit I/O we
		 * allow to build up, otherwise we would completely saturate
		 * the I/O system.  Wakeup any waiting processes before we
		 * normally would so they can run in parallel with our drain.
		 */
		while (numdirtybuffers > lodirtybuffers) {
			if (flushbufqueues() == 0)
				break;
			waitrunningbufspace();
			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
		}

		/*
		 * Only clear bd_request if we have reached our low water
		 * mark.  The buf_daemon normally waits 5 seconds and
		 * then incrementally flushes any dirty buffers that have
		 * built up, within reason.
		 *
		 * If we were unable to hit our low water mark and couldn't
		 * find any flushable buffers, we sleep half a second.
		 * Otherwise we loop immediately.
		 *
		 * NOTE(review): the sleep below uses hz (one second), not
		 * the 5 seconds mentioned above -- confirm which is intended.
		 */
		if (numdirtybuffers <= lodirtybuffers) {
			/*
			 * We reached our low water mark, reset the
			 * request and sleep until we are needed again.
			 * The sleep is just so the suspend code works.
			 */
			bd_request = 0;
			tsleep(&bd_request, PVM, "psleep", hz);
		} else {
			/*
			 * We couldn't find any flushable dirty buffers but
			 * still have too many dirty buffers, we
			 * have to sleep and try again.  (rare)
			 */
			tsleep(&bd_request, PVM, "qsleep", hz / 2);
		}
	}
}

/*
 *	flushbufqueues:
 *
 *	Try to flush a buffer in the dirty queue.  We must be careful to
 *	free up B_INVAL buffers instead of write them, which NFS is
 *	particularly sensitive to.
 *
 *	Returns 1 if a buffer was released or written, 0 if none was found.
 */

static int
flushbufqueues(void)
{
	struct buf *bp;
	int r = 0;

	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);

	while (bp) {
		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
		/* Buffers with a background write in progress are skipped. */
		if ((bp->b_flags & B_DELWRI) != 0 &&
		    (bp->b_xflags & BX_BKGRDINPROG) == 0) {
			if (bp->b_flags & B_INVAL) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
					panic("flushbufqueues: locked buf");
				bremfree(bp);
				brelse(bp);
				++r;
				break;
			}
			/*
			 * A buffer with outstanding dependencies is pushed to
			 * the tail once (marked B_DEFERRED) and the scan is
			 * restarted from the head of the queue.
			 */
			if (LIST_FIRST(&bp->b_dep) != NULL &&
			    (bp->b_flags & B_DEFERRED) == 0 &&
			    buf_countdeps(bp, 0)) {
				TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
				    bp, b_freelist);
				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
				    bp, b_freelist);
				bp->b_flags |= B_DEFERRED;
				bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
				continue;
			}
			vfs_bio_awrite(bp);
			++r;
			break;
		}
		bp = TAILQ_NEXT(bp, b_freelist);
	}
	return (r);
}

/*
 * Check to see if a block is currently memory resident.
 * Wrapper around gbincore() that performs the lookup at splbio().
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;

	int s = splbio();
	bp = gbincore(vp, blkno);
	splx(s);
	return (bp);
}

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.
This is like incore except
 * it also hunts around in the VM system for the data.
 */
int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t obj;
	vm_offset_t toff, tinc, size;
	vm_page_t m;
	vm_ooffset_t off;

	/* incore() found a buffer for this block; report success at once. */
	if (incore(vp, blkno))
		return 1;
	/* The block size used below is read from the mount structure. */
	if (vp->v_mount == NULL)
		return 0;
	/* Need both a VM object and the VOBJBUF flag on the vnode. */
	if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0)
		return 0;

	/* Step size is the smaller of PAGE_SIZE and the mount's f_iosize. */
	size = PAGE_SIZE;
	if (size > vp->v_mount->mnt_stat.f_iosize)
		size = vp->v_mount->mnt_stat.f_iosize;

	/* Byte offset of the block: blkno scaled by f_iosize. */
	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;

	/*
	 * Walk the block one step at a time.  tinc is assigned inside the
	 * body before the loop increment uses it.
	 */
	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
		if (!m)
			return 0;
		/* Clip the step so it does not extend past the current page. */
		tinc = size;
		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
		if (vm_page_is_valid(m,
		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
			return 0;
	}
	return 1;
}

/*
 *	vfs_setdirty:
 *
 *	Sets the dirty range for a buffer based on the status of the dirty
 *	bits in the pages comprising the buffer.
 *
 *	The range is limited to the size of the buffer.
 *
 *	This routine is primarily used by NFS, but is generalized for the
 *	B_VMIO case.
 */
static void
vfs_setdirty(struct buf *bp)
{
	int i;
	vm_object_t object;

	/*
	 * Degenerate case - empty buffer
	 */
	if (bp->b_bufsize == 0)
		return;

	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.  The OBJ_WRITEABLE flag
	 * is not cleared simply by protecting pages off.
	 */
	if ((bp->b_flags & B_VMIO) == 0)
		return;

	/* The object is taken from the buffer's first page. */
	object = bp->b_pages[0]->object;

	/* Diagnostics only: warn when the two object flags disagree. */
	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
		printf("Warning: object %p writeable but not mightbedirty\n", object);
	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
		printf("Warning: object %p mightbedirty but not writeable\n", object);

	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
		vm_offset_t boffset;
		vm_offset_t eoffset;

		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
			vm_page_test_dirty(bp->b_pages[i]);
		}

		/*
		 * Calculate the encompassing dirty range, boffset and eoffset,
		 * (eoffset - boffset) bytes.
		 */

		/* First dirty page; i == b_npages when no page is dirty. */
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty)
				break;
		}
		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);

		/* Last dirty page; i == -1 when no page is dirty. */
		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);

		/*
		 * Fit it to the buffer.
		 */

		if (eoffset > bp->b_bcount)
			eoffset = bp->b_bcount;

		/*
		 * If we have a good dirty range, merge with the existing
		 * dirty range.
		 */

		if (boffset < eoffset) {
			if (bp->b_dirtyoff > boffset)
				bp->b_dirtyoff = boffset;
			if (bp->b_dirtyend < eoffset)
				bp->b_dirtyend = eoffset;
		}
	}
}

/*
 *	getblk:
 *
 *	Get a block given a specified block and offset into a file/device.
 *	The buffers B_DONE bit will be cleared on return, making it almost
 * 	ready for an I/O initiation.  B_INVAL may or may not be set on
 *	return.  The caller should clear B_INVAL prior to initiating a
 *	READ.
 *
 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 *	an existing buffer.
 *
 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 *	and then cleared based on the backing VM.
If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * - * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whos + * getblk() also forces a BUF_WRITE() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; struct bufhashhdr *bh; if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); s = splbio(); loop: /* * Block if we are low on buffers. Certain processes are allowed * to completely exhaust the buffer cache. * * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. * * XXX remove if 0 sections (clean this up after its proven) */ if (numfreebuffers == 0) { if (curproc == PCPU_GET(idleproc)) return NULL; needsbuffer |= VFS_BIO_NEED_ANY; } if ((bp = gbincore(vp, blkno))) { /* * Buffer is in-core. If the buffer is not busy, it must * be on a queue. 
*/ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "getblk", slpflag, slptimeo) == ENOLCK) goto loop; splx(s); return (struct buf *) NULL; } /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Ohterwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; BUF_WRITE(bp); } else { if ((bp->b_flags & B_VMIO) && (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; BUF_WRITE(bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { BUF_WRITE(bp); goto loop; } splx(s); bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. 
The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ int bsize, maxsize, vmio; off_t offset; if (vn_isdisk(vp, NULL)) bsize = DEV_BSIZE; else if (vp->v_mountedhere) bsize = vp->v_mountedhere->mnt_stat.f_iosize; else if (vp->v_mount) bsize = vp->v_mount->mnt_stat.f_iosize; else bsize = size; offset = (off_t)blkno * bsize; vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF); maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. There is now window * race because we are safely running at splbio() from the * point of the duplicate buffer creation through to here, * and we've locked the buffer. */ if (gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = bufhash(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif } else { bp->b_flags &= ~B_VMIO; } allocbuf(bp, size); splx(s); bp->b_flags &= ~B_DONE; } return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. 
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	int s;
	int maxsize;

	/* Round the requested size up to a multiple of BKVAMASK + 1. */
	maxsize = (size + BKVAMASK) & ~BKVAMASK;

	s = splbio();
	/* Empty loop body: keep calling getnewbuf() until it returns a buffer. */
	while ((bp = getnewbuf(0, 0, size, maxsize)) == 0);
	splx(s);
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
	return (bp);
}


/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).  This code is able to
 * resize a buffer up or down.
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistant data situations.  Tread lightly!!! 
 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
 * the caller.  Calling this code willy nilly can result in the loss of data.
 *
 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 * B_CACHE for the non-VMIO case.
 *
 * Every return path visible here returns 1.  Panics if the buffer is not
 * locked (BUF_REFCNT() == 0) or if size exceeds bp->b_kvasize.
 */

int
allocbuf(struct buf *bp, int size)
{
	int newbsize, mbsize;
	int i;

	if (BUF_REFCNT(bp) == 0)
		panic("allocbuf: buffer not busy");

	if (bp->b_kvasize < size)
		panic("allocbuf: buffer too small");

	if ((bp->b_flags & B_VMIO) == 0) {
		caddr_t origbuf;
		int origbufsize;
		/*
		 * Just get anonymous memory from the kernel.  Don't
		 * mess with B_CACHE.
		 */
		/* mbsize: size rounded up to a multiple of DEV_BSIZE. */
		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
#if !defined(NO_B_MALLOC)
		if (bp->b_flags & B_MALLOC)
			newbsize = mbsize;
		else
#endif
			newbsize = round_page(size);

		if (newbsize < bp->b_bufsize) {
#if !defined(NO_B_MALLOC)
			/*
			 * malloced buffers are not shrunk
			 */
			if (bp->b_flags & B_MALLOC) {
				if (newbsize) {
					bp->b_bcount = size;
				} else {
					/* Shrinking to zero: give the malloc'd memory back. */
					free(bp->b_data, M_BIOBUF);
					if (bp->b_bufsize) {
						bufmallocspace -= bp->b_bufsize;
						bufspacewakeup();
						bp->b_bufsize = 0;
					}
					bp->b_data = bp->b_kvabase;
					bp->b_bcount = 0;
					bp->b_flags &= ~B_MALLOC;
				}
				return 1;
			}
#endif
			vm_hold_free_pages(
			    bp,
			    (vm_offset_t) bp->b_data + newbsize,
			    (vm_offset_t) bp->b_data + bp->b_bufsize);
		} else if (newbsize > bp->b_bufsize) {
#if !defined(NO_B_MALLOC)
			/*
			 * We only use malloced memory on the first allocation.
			 * and revert to page-allocated memory when the buffer
			 * grows.
			 */
			if ( (bufmallocspace < maxbufmallocspace) &&
				(bp->b_bufsize == 0) &&
				(mbsize <= PAGE_SIZE/2)) {

				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
				bp->b_bufsize = mbsize;
				bp->b_bcount = size;
				bp->b_flags |= B_MALLOC;
				bufmallocspace += mbsize;
				return 1;
			}
#endif
			origbuf = NULL;
			origbufsize = 0;
#if !defined(NO_B_MALLOC)
			/*
			 * If the buffer is growing on its other-than-first allocation,
			 * then we revert to the page-allocation scheme.
			 */
			if (bp->b_flags & B_MALLOC) {
				/* Remember the old data; it is copied back after the pages load. */
				origbuf = bp->b_data;
				origbufsize = bp->b_bufsize;
				bp->b_data = bp->b_kvabase;
				if (bp->b_bufsize) {
					bufmallocspace -= bp->b_bufsize;
					bufspacewakeup();
					bp->b_bufsize = 0;
				}
				bp->b_flags &= ~B_MALLOC;
				newbsize = round_page(newbsize);
			}
#endif
			vm_hold_load_pages(
			    bp,
			    (vm_offset_t) bp->b_data + bp->b_bufsize,
			    (vm_offset_t) bp->b_data + newbsize);
#if !defined(NO_B_MALLOC)
			if (origbuf) {
				bcopy(origbuf, bp->b_data, origbufsize);
				free(origbuf, M_BIOBUF);
			}
#endif
		}
	} else {
		vm_page_t m;
		int desiredpages;

		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		/* Page count includes the buffer's offset into its first page. */
		desiredpages = (size == 0) ? 0 :
			num_pages((bp->b_offset & PAGE_MASK) + newbsize);

#if !defined(NO_B_MALLOC)
		if (bp->b_flags & B_MALLOC)
			panic("allocbuf: VMIO buffer can't be malloced");
#endif
		/*
		 * Set B_CACHE initially if buffer is 0 length or will become
		 * 0-length.
		 */
		if (size == 0 || bp->b_bufsize == 0)
			bp->b_flags |= B_CACHE;

		if (newbsize < bp->b_bufsize) {
			/*
			 * DEV_BSIZE aligned new buffer size is less then the
			 * DEV_BSIZE aligned existing buffer size.  Figure out
			 * if we have to remove any pages.
			 */
			if (desiredpages < bp->b_npages) {
				for (i = desiredpages; i < bp->b_npages; i++) {
					/*
					 * the page is not freed here -- it
					 * is the responsibility of 
					 * vnode_pager_setsize
					 */
					m = bp->b_pages[i];
					KASSERT(m != bogus_page,
					    ("allocbuf: bogus page found"));
					/* Empty body: repeat until the call returns zero. */
					while (vm_page_sleep_busy(m, TRUE, "biodep"))
						;

					bp->b_pages[i] = NULL;
					vm_page_unwire(m, 0);
				}
				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
				bp->b_npages = desiredpages;
			}
		} else if (size > bp->b_bcount) {
			/*
			 * We are growing the buffer, possibly in a 
			 * byte-granular fashion.
			 */
			struct vnode *vp;
			vm_object_t obj;
			vm_offset_t toff;
			vm_offset_t tinc;

			/*
			 * Step 1, bring in the VM pages from the object, 
			 * allocating them if necessary.  We must clear
			 * B_CACHE if these pages are not valid for the 
			 * range covered by the buffer.
			 */

			vp = bp->b_vp;
			VOP_GETVOBJECT(vp, &obj);

			while (bp->b_npages < desiredpages) {
				vm_page_t m;
				vm_pindex_t pi;

				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
				if ((m = vm_page_lookup(obj, pi)) == NULL) {
					/*
					 * note: must allocate system pages
					 * since blocking here could intefere
					 * with paging I/O, no matter which
					 * process we are.
					 */
					m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
					if (m == NULL) {
						VM_WAIT;
						vm_pageout_deficit += desiredpages - bp->b_npages;
					} else {
						vm_page_wire(m);
						vm_page_wakeup(m);
						/* A freshly allocated page: B_CACHE is cleared. */
						bp->b_flags &= ~B_CACHE;
						bp->b_pages[bp->b_npages] = m;
						++bp->b_npages;
					}
					continue;
				}

				/*
				 * We found a page.  If we have to sleep on it,
				 * retry because it might have gotten freed out
				 * from under us.
				 *
				 * We can only test PG_BUSY here.  Blocking on
				 * m->busy might lead to a deadlock:
				 *
				 *  vm_fault->getpages->cluster_read->allocbuf
				 *
				 */

				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
					continue;

				/*
				 * We have a good page.  Should we wakeup the
				 * page daemon?
				 */
				if ((curproc != pageproc) &&
				    ((m->queue - m->pc) == PQ_CACHE) &&
				    ((cnt.v_free_count + cnt.v_cache_count) <
					(cnt.v_free_min + cnt.v_cache_min))) {
					pagedaemon_wakeup();
				}
				vm_page_flag_clear(m, PG_ZERO);
				vm_page_wire(m);
				bp->b_pages[bp->b_npages] = m;
				++bp->b_npages;
			}

			/*
			 * Step 2.  We've loaded the pages into the buffer,
			 * we have to figure out if we can still have B_CACHE
			 * set.  Note that B_CACHE is set according to the
			 * byte-granular range ( bcount and size ), new the
			 * aligned range ( newbsize ).
			 *
			 * The VM test is against m->valid, which is DEV_BSIZE
			 * aligned.  Needless to say, the validity of the data
			 * needs to also be DEV_BSIZE aligned.  Note that this
			 * fails with NFS if the server or some other client
			 * extends the file's EOF.  If our buffer is resized, 
			 * B_CACHE may remain set! XXX
			 */

			toff = bp->b_bcount;
			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);

			/* The loop stops once B_CACHE is no longer set. */
			while ((bp->b_flags & B_CACHE) && toff < size) {
				vm_pindex_t pi;

				if (tinc > (size - toff))
					tinc = size - toff;

				pi = ((bp->b_offset & PAGE_MASK) + toff) >> 
				    PAGE_SHIFT;

				vfs_buf_test_cache(
				    bp, 
				    bp->b_offset,
				    toff, 
				    tinc, 
				    bp->b_pages[pi]
				);
				toff += tinc;
				tinc = PAGE_SIZE;
			}

			/*
			 * Step 3, fixup the KVM pmap.  Remember that
			 * bp->b_data is relative to bp->b_offset, but 
			 * bp->b_offset may be offset into the first page.
			 */

			bp->b_data = (caddr_t)
			    trunc_page((vm_offset_t)bp->b_data);
			pmap_qenter(
			    (vm_offset_t)bp->b_data,
			    bp->b_pages, 
			    bp->b_npages
			);
			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
		}
	}
	if (newbsize < bp->b_bufsize)
		bufspacewakeup();
	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
	bp->b_bcount = size;		/* requested buffer size	*/
	return 1;
}

/*
 *	bufwait:
 *
 *	Wait for buffer I/O completion, returning error status.  The buffer
 *	is left locked and B_DONE on return.  B_EINTR is converted into a EINTR
 *	error and cleared.
 */
int
bufwait(register struct buf * bp)
{
	int s;

	s = splbio();
	/* Sleep on the buffer until B_DONE is set; only the wait message differs. */
	while ((bp->b_flags & B_DONE) == 0) {
		if (bp->b_iocmd == BIO_READ)
			tsleep(bp, PRIBIO, "biord", 0);
		else
			tsleep(bp, PRIBIO, "biowr", 0);
	}
	splx(s);
	if (bp->b_flags & B_EINTR) {
		bp->b_flags &= ~B_EINTR;
		return (EINTR);
	}
	if (bp->b_ioflags & BIO_ERROR) {
		/* BIO_ERROR with a zero b_error is reported as EIO. */
		return (bp->b_error ? bp->b_error : EIO);
	} else {
		return (0);
	}
}

 /*
  * Call back function from struct bio back up to struct buf.
  * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY().
  */
void
bufdonebio(struct bio *bp)
{
	/* bio_caller2 is handed to bufdone(), which takes a struct buf *. */
	bufdone(bp->bio_caller2);
}

/*
 *	bufdone:
 *
 *	Finish I/O on a buffer, optionally calling a completion function.
 *	This is usually called from an interrupt so process blocking is
 *	not allowed.
 *
 *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
 *	In a non-VMIO bp, B_CACHE will be set on the next getblk() 
 *	assuming B_INVAL is clear.
 *
 *	For the VMIO case, we set B_CACHE if the op was a read and no
 *	read error occured, or if the op was a write.  B_CACHE is never
 *	set if the buffer is invalid or otherwise uncacheable.
 *
 *	biodone does not mess with B_INVAL, allowing the I/O routine or the
 *	initiator to leave B_INVAL set to brelse the buffer out of existance
 *	in the biodone routine.
 */
void
bufdone(struct buf *bp)
{
	int s, error;
	void    (*biodone) __P((struct buf *));

	s = splbio();

	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));

	bp->b_flags |= B_DONE;
	runningbufwakeup(bp);

	/* A BIO_DELETE request is simply released; nothing else runs. */
	if (bp->b_iocmd == BIO_DELETE) {
		brelse(bp);
		splx(s);
		return;
	}

	if (bp->b_iocmd == BIO_WRITE) {
		vwakeup(bp);
	}

	/* call optional completion function if requested */
	if (bp->b_iodone != NULL) {
		/* The hook is cleared before the call, so it runs only once. */
		biodone = bp->b_iodone;
		bp->b_iodone = NULL;
		(*biodone) (bp);
		splx(s);
		return;
	}
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_complete(bp);

	if (bp->b_flags & B_VMIO) {
		int i;
		vm_ooffset_t foff;
		vm_page_t m;
		vm_object_t obj;
		int iosize;
		struct vnode *vp = bp->b_vp;

		error = VOP_GETVOBJECT(vp, &obj);

#if defined(VFS_BIO_DEBUG)
		if (vp->v_usecount == 0) {
			panic("biodone: zero vnode ref count");
		}

		if (error) {
			panic("biodone: missing VM object");
		}

		if ((vp->v_flag & VOBJBUF) == 0) {
			panic("biodone: vnode is not setup for merged cache");
		}
#endif

		foff = bp->b_offset;
		KASSERT(bp->b_offset != NOOFFSET,
		    ("biodone: no buffer offset"));

		if (error) {
			panic("biodone: no object");
		}
#if defined(VFS_BIO_DEBUG)
		if (obj->paging_in_progress < bp->b_npages) {
			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
			    obj->paging_in_progress, bp->b_npages);
		}
#endif

		/*
		 * Set B_CACHE if the op was a normal read and no error
		 * occured.  B_CACHE is set for writes in the b*write()
		 * routines.
		 */
		/* iosize: bytes transferred, i.e. requested minus residual. */
		iosize = bp->b_bcount - bp->b_resid;
		if (bp->b_iocmd == BIO_READ &&
		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
		    !(bp->b_ioflags & BIO_ERROR)) {
			bp->b_flags |= B_CACHE;
		}

		for (i = 0; i < bp->b_npages; i++) {
			int bogusflag = 0;
			int resid;

			/* Bytes from foff to the next page boundary, capped at iosize. */
			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
			if (resid > iosize)
				resid = iosize;

			/*
			 * cleanup bogus pages, restoring the originals
			 */
			m = bp->b_pages[i];
			if (m == bogus_page) {
				bogusflag = 1;
				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
				if (m == NULL)
					panic("biodone: page disappeared!");
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
			}
#if defined(VFS_BIO_DEBUG)
			if (OFF_TO_IDX(foff) != m->pindex) {
				printf(
"biodone: foff(%lu)/m->pindex(%d) mismatch\n",
				    (unsigned long)foff, m->pindex);
			}
#endif

			/*
			 * In the write case, the valid and clean bits are
			 * already changed correctly ( see bdwrite() ), so we 
			 * only need to do this here in the read case.
			 */
			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
				vfs_page_set_valid(bp, foff, i, m);
			}
			vm_page_flag_clear(m, PG_ZERO);

			/*
			 * when debugging new filesystems or buffer I/O methods, this
			 * is the most common error that pops up.  if you see this, you
			 * have not set the page busy flag correctly!!!
			 */
			/* The test is m->busy == 0, although the messages say "< 0". */
			if (m->busy == 0) {
				printf("biodone: page busy < 0, "
				    "pindex: %d, foff: 0x(%x,%x), "
				    "resid: %d, index: %d\n",
				    (int) m->pindex, (int)(foff >> 32),
						(int) foff & 0xffffffff, resid, i);
				if (!vn_isdisk(vp, NULL))
					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    bp->b_vp->v_mount->mnt_stat.f_iosize,
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				else
					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
				    m->valid, m->dirty, m->wire_count);
				panic("biodone: page busy < 0\n");
			}
			vm_page_io_finish(m);
			vm_object_pip_subtract(obj, 1);
			/* Advance foff to the start of the next page. */
			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
			iosize -= resid;
		}
		if (obj)
			vm_object_pip_wakeupn(obj, 0);
	}

	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * will do a wakeup there if necessary - so no need to do a wakeup
	 * here in the async case. The sync case always needs to do a wakeup.
	 */

	if (bp->b_flags & B_ASYNC) {
		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
			brelse(bp);
		else
			bqrelse(bp);
	} else {
		wakeup(bp);
	}
	splx(s);
}

/*
 * This routine is called in lieu of iodone in the case of
 * incomplete I/O.  This keeps the busy status for pages
 * consistant.
 */
void
vfs_unbusy_pages(struct buf * bp)
{
	int i;

	runningbufwakeup(bp);
	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj;

		VOP_GETVOBJECT(vp, &obj);

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			/* Swap a bogus_page placeholder for the object's page at that index. */
			if (m == bogus_page) {
				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
				if (!m) {
					panic("vfs_unbusy_pages: page missing\n");
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
			}
			vm_object_pip_subtract(obj, 1);
			vm_page_flag_clear(m, PG_ZERO);
			vm_page_io_finish(m);
		}
		vm_object_pip_wakeupn(obj, 0);
	}
}

/*
 * vfs_page_set_valid:
 *
 *	Set the valid bits in a page based on the supplied offset.
The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. 
 */
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
	int i, bogus;

	/* Only B_VMIO buffers are handled; anything else is a no-op. */
	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj;
		vm_ooffset_t foff;

		VOP_GETVOBJECT(vp, &obj);
		foff = bp->b_offset;
		KASSERT(bp->b_offset != NOOFFSET,
		    ("vfs_busy_pages: no buffer offset"));
		vfs_setdirty(bp);

		/*
		 * First pass: whenever vm_page_sleep_busy() returns non-zero
		 * for any page, rescan from the first page.
		 */
retry:
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
				goto retry;
		}

		bogus = 0;
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			vm_page_flag_clear(m, PG_ZERO);
			/* B_CLUSTER buffers skip the pip count and io_start here. */
			if ((bp->b_flags & B_CLUSTER) == 0) {
				vm_object_pip_add(obj, 1);
				vm_page_io_start(m);
			}

			/*
			 * When readying a buffer for a read ( i.e
			 * clear_modify == 0 ), it is important to do
			 * bogus_page replacement for valid pages in 
			 * partially instantiated buffers.  Partially 
			 * instantiated buffers can, in turn, occur when
			 * reconstituting a buffer from its VM backing store
			 * base.  We only have to do this if B_CACHE is
			 * clear ( which causes the I/O to occur in the
			 * first place ).  The replacement prevents the read
			 * I/O from overwriting potentially dirty VM-backed
			 * pages.  XXX bogus page replacement is, uh, bogus.
			 * It may not work properly with small-block devices.
			 * We need to find a better way.
			 */

			vm_page_protect(m, VM_PROT_NONE);
			if (clear_modify)
				vfs_page_set_valid(bp, foff, i, m);
			else if (m->valid == VM_PAGE_BITS_ALL &&
				(bp->b_flags & B_CACHE) == 0) {
				bp->b_pages[i] = bogus_page;
				bogus++;
			}
			/* Advance foff to the start of the next page. */
			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
		}
		/* Re-enter the mapping only when a page was substituted. */
		if (bogus)
			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
	}
}

/*
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intevention.
 *
 * Note that while we only really need to clean through to b_bcount, we
 * just go ahead and clean through to b_bufsize.
*/ static void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; vm_ooffset_t eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } } } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { if (bp->b_flags & B_VMIO) { int i; int n; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. 
*/ void vfs_bio_clrbuf(struct buf *bp) { int i, mask = 0; caddr_t sa, ea; if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) != mask)) { bzero(bp->b_data, bp->b_bufsize); } bp->b_pages[0]->valid |= mask; bp->b_resid = 0; return; } ea = sa = bp->b_data; for(i=0;ib_npages;i++,sa=ea) { int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)(vm_offset_t)ulmin( (u_long)(vm_offset_t)ea, (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { bzero(sa, ea - sa); } } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1<b_pages[i]->valid |= mask; vm_page_flag_clear(bp->b_pages[i], PG_ZERO); } bp->b_resid = 0; } else { clrbuf(bp); } } /* * vm_hold_load_pages and vm_hold_unload pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could intefere with paging I/O, no matter which * process we are. 
*/ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_SYSTEM); if (!p) { vm_pageout_deficit += (to - from) >> PAGE_SHIFT; VM_WAIT; goto tryagain; } vm_page_wire(p); p->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(p, PG_ZERO); pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); bp->b_pages[index] = p; vm_page_wakeup(p); } bp->b_npages = index; } void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { if (p->busy) { printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", bp->b_blkno, bp->b_lblkno); } bp->b_pages[index] = NULL; pmap_kremove(pg); vm_page_busy(p); vm_page_unwire(p, 0); vm_page_free(p); } } bp->b_npages = newnpages; } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, " "b_blkno = %d, b_pblkno = %d\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, major(bp->b_dev), minor(bp->b_dev), bp->b_data, bp->b_blkno, bp->b_pblkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } } #endif /* DDB */ Index: head/sys/kern/vfs_cluster.c =================================================================== --- head/sys/kern/vfs_cluster.c (revision 
75579) +++ head/sys/kern/vfs_cluster.c (revision 75580) @@ -1,929 +1,931 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*
 * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
 * $FreeBSD$
 */

#include "opt_debug_cluster.h"

/* NOTE(review): the header names on the bare #include lines below are missing in this copy. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#if defined(CLUSTERDEBUG)
#include
/* Debug knob: when non-zero, cluster_read() prints a trace of each read. */
static int	rcluster= 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
static struct buf *
	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
			    daddr_t blkno, long size, int run, struct buf *fbp));

/* Write-behind policy selector; the values are documented at cluster_wbuild_wb(). */
static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");

extern vm_page_t	bogus_page;

extern int cluster_pbuf_freecnt;

/*
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * This replaces bread.
 *
 * Get the buffer for logical block lblkno of vp and, when the access
 * looks sequential, start read-ahead behind it.
 *
 *	vp		vnode being read
 *	filesize	file size in bytes; limits totread and read-ahead
 *	lblkno		logical block requested
 *	size		block size in bytes passed to getblk()
 *	cred		not referenced by this function
 *	totread		bytes of the read measured from the buffer's offset;
 *			used to size the cluster and the read-ahead window
 *	seqcount	sequential heuristic; 0 disables read-ahead
 *	bpp		out: the buffer getblk() returned for lblkno
 *
 * Returns 0 when the block is cached and no read-ahead is issued.
 * Returns the bufwait() status of the requested buffer when that buffer
 * had to be read.  Otherwise returns `error', which is 0 or the failure
 * of the read-ahead VOP_BMAP().
 */
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_mount->mnt_iosize_max / size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			/* Find the first following block that is not in core. */
			for (i = 1; i < maxra; i++) {

				if (!(tbp = incore(vp, lblkno+i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know
				 * to check again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
					(i == (maxra - 1)))
					tbp->b_flags |= B_RAM;
			}
			splx(s);
			/* Everything up to maxra is already cached. */
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		/* The requested block needs no I/O; only read-ahead may follow. */
		reqbp = bp = NULL;
	} else {
		off_t firstread = bp->b_offset;

		KASSERT(bp->b_offset != NOOFFSET,
		    ("cluster_read: no buffer offset"));
		/* Clip the request to the end of the file. */
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
			/* Count the blocks needed to cover totread bytes. */
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

			error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_RAM;
			bp->b_iocmd = BIO_READ;
			lblkno += 1;
		}
	}

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	rbp = NULL;
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
			if (num_ra) {
				rbp = cluster_rbuild(vp, filesize, lblkno,
					blkno, size, ntoread, NULL);
			} else {
				/* No contiguous run: read ahead one block. */
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_ASYNC | B_RAM;
				rbp->b_iocmd = BIO_READ;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			printf("S(%ld,%ld,%d) ",
			    (long)bp->b_lblkno, bp->b_bcount, seqcount);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(bp, 0);
		}
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
			BUF_KERNPROC(bp);
		error = VOP_STRATEGY(vp, bp);
		curproc->p_stats->p_ru.ru_inblock++;
	}

	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			/* The primary read failed: drop the read-ahead buffer. */
			rbp->b_flags &= ~B_ASYNC;
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			/* Already valid; nothing to read. */
			rbp->b_flags &= ~B_ASYNC;
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
				else
					printf("A(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0) {
				vfs_busy_pages(rbp, 0);
			}
			rbp->b_flags &= ~B_INVAL;
			rbp->b_ioflags &= ~BIO_ERROR;
			if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
				BUF_KERNPROC(rbp);
			/* The result of the read-ahead strategy call is ignored. */
			(void) VOP_STRATEGY(vp, rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (bufwait(reqbp));
	else
		return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
*/ static struct buf * cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) struct vnode *vp; u_quad_t filesize; daddr_t lbn; daddr_t blkno; long size; int run; struct buf *fbp; { struct buf *bp, *tbp; daddr_t bn; int i, inc, j; KASSERT(size == vp->v_mount->mnt_stat.f_iosize, ("cluster_rbuild: size %ld != filesize %ld\n", size, vp->v_mount->mnt_stat.f_iosize)); /* * avoid a division */ while ((u_quad_t) size * (lbn + run) > filesize) { --run; } if (fbp) { tbp = fbp; tbp->b_iocmd = BIO_READ; } else { tbp = getblk(vp, lbn, size, 0, 0); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; tbp->b_iocmd = BIO_READ; } tbp->b_blkno = blkno; if( (tbp->b_flags & B_MALLOC) || ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; bp = trypbuf(&cluster_pbuf_freecnt); if (bp == 0) return tbp; bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; bp->b_offset = tbp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); pbgetvp(vp, bp); TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; inc = btodb(size); for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i != 0) { if ((bp->b_npages * PAGE_SIZE) + round_page(size) > vp->v_mount->mnt_iosize_max) break; if ((tbp = incore(vp, lbn + i)) != NULL) { if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) break; BUF_UNLOCK(tbp); for (j = 0; j < tbp->b_npages; j++) if (tbp->b_pages[j]->valid) break; if (j != tbp->b_npages) break; if (tbp->b_bcount != size) break; } tbp = getblk(vp, lbn + i, size, 0, 0); /* * If the buffer is already fully valid or locked * (which could also mean that a background write is * in progress), or the buffer is not backed by VMIO, * stop. 
*/ if ((tbp->b_flags & (B_CACHE|B_LOCKED)) || (tbp->b_flags & B_VMIO) == 0) { bqrelse(tbp); break; } for (j = 0;j < tbp->b_npages; j++) { if (tbp->b_pages[j]->valid) break; } if (j != tbp->b_npages) { bqrelse(tbp); break; } if ((fbp && (i == 1)) || (i == (run - 1))) tbp->b_flags |= B_RAM; tbp->b_flags |= B_ASYNC; tbp->b_iocmd = BIO_READ; if (tbp->b_blkno == tbp->b_lblkno) { tbp->b_blkno = bn; } else if (tbp->b_blkno != bn) { brelse(tbp); break; } } /* * XXX fbp from caller may not be B_ASYNC, but we are going * to biodone() it in cluster_callback() anyway */ BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); for (j = 0; j < tbp->b_npages; j += 1) { vm_page_t m; m = tbp->b_pages[j]; vm_page_io_start(m); vm_object_pip_add(m->object, 1); if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages-1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) tbp->b_pages[j] = bogus_page; } bp->b_bcount += tbp->b_bcount; bp->b_bufsize += tbp->b_bufsize; } for(j=0;jb_npages;j++) { if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) bp->b_pages[j] = bogus_page; } if (bp->b_bufsize > bp->b_kvasize) panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); bp->b_kvasize = bp->b_bufsize; pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); return (bp); } /* * Cleanup after a clustered read or write. * This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ void cluster_callback(bp) struct buf *bp; { struct buf *nbp, *tbp; int error = 0; /* * Must propogate errors to all the components. 
*/ if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); tbp; tbp = nbp) { nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); if (error) { tbp->b_ioflags |= BIO_ERROR; tbp->b_error = error; } else { tbp->b_dirtyoff = tbp->b_dirtyend = 0; tbp->b_flags &= ~B_INVAL; tbp->b_ioflags &= ~BIO_ERROR; } bufdone(tbp); } relpbuf(bp, &cluster_pbuf_freecnt); } /* * cluster_wbuild_wb: * * Implement modified write build for cluster. * * write_behind = 0 write behind disabled * write_behind = 1 write behind normal (default) * write_behind = 2 write behind backed-off */ static __inline int cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) { int r = 0; switch(write_behind) { case 2: if (start_lbn < len) break; start_lbn -= len; /* fall through */ case 1: r = cluster_wbuild(vp, size, start_lbn, len); /* fall through */ default: /* fall through */ break; } return(r); } /* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(bp, filesize, seqcount) struct buf *bp; u_quad_t filesize; int seqcount; { struct vnode *vp; daddr_t lbn; int maxclen, cursize; int lblocksize; int async; vp = bp->b_vp; if (vp->v_type == VREG) { async = vp->v_mount->mnt_flag & MNT_ASYNC; lblocksize = vp->v_mount->mnt_stat.f_iosize; } else { async = 0; lblocksize = bp->b_bufsize; } lbn = bp->b_lblkno; KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); /* Initialize vnode to beginning of file. 
*/ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. * * Change to algorithm: only push previous cluster if * it was sequential from the point of view of the * seqcount heuristic, otherwise leave the buffer * intact so we can potentially optimize the I/O * later on in the buf_daemon or update daemon * flush. */ cursize = vp->v_lastw - vp->v_cstart + 1; if (((u_quad_t) bp->b_offset + lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, bp); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster * if *really* writing sequentially * in the logical file (seqcount > 1), * otherwise delay it in the hopes that * the low level disk driver can * optimize the write ordering. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize); } } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_SEGMENT); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; return; } } } /* * Consider beginning a cluster. If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. 
*/ if ((vp->v_type == VREG) && ((u_quad_t) bp->b_offset + lblocksize) != filesize && (bp->b_blkno == bp->b_lblkno) && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; vp->v_cstart = lbn + 1; vp->v_lastw = lbn; return; } vp->v_clen = maxclen; if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; bawrite(bp); } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out if seqcount tells us we * are operating sequentially, otherwise let the buf or * update daemon handle it. */ bdwrite(bp); if (seqcount > 1) cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); vp->v_clen = 0; vp->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { /* * We are low on memory, get it going NOW */ bawrite(bp); } else { /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); } vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ int cluster_wbuild(vp, size, start_lbn, len) struct vnode *vp; long size; daddr_t start_lbn; int len; { struct buf *bp, *tbp; int i, j, s; int totalwritten = 0; int dbsize = btodb(size); while (len > 0) { s = splbio(); /* * If the buffer is not delayed-write (i.e. dirty), or it * is delayed-write but either locked or inval, it cannot * partake in the clustered write. */ if (((tbp = gbincore(vp, start_lbn)) == NULL) || ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) || BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { ++start_lbn; --len; splx(s); continue; } bremfree(tbp); tbp->b_flags &= ~B_DONE; splx(s); /* * Extra memory in the buffer, punt on this buffer. 
* XXX we could handle this in most cases, but we would * have to push the extra memory down to after our max * possible cluster size and then potentially pull it back * up if the cluster was terminated prematurely--too much * hassle. */ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != (B_CLUSTEROK | B_VMIO)) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } /* * We got a pbuf to make the cluster in. * so initialise it. */ TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; + bp->b_magic = tbp->b_magic; + bp->b_op = tbp->b_op; bp->b_bufsize = 0; bp->b_npages = 0; if (tbp->b_wcred != NOCRED) { bp->b_wcred = tbp->b_wcred; crhold(bp->b_wcred); } bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; bp->b_offset = tbp->b_offset; bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* * From this location in the file, scan forward to see * if there are buffers with adjacent data that need to * be written as well. */ for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { /* If not the first buffer */ s = splbio(); /* * If the adjacent data is not even in core it * can't need to be written. */ if ((tbp = gbincore(vp, start_lbn)) == NULL) { splx(s); break; } /* * If it IS in core, but has different * characteristics, or is locked (which * means it could be undergoing a background * I/O or be in a weird state), then don't * cluster with it. 
*/ if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | B_INVAL | B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_CLUSTEROK | (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || (tbp->b_flags & B_LOCKED) || tbp->b_wcred != bp->b_wcred || BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { splx(s); break; } /* * Check that the combined cluster * would make sense with regard to pages * and would not be too large */ if ((tbp->b_bcount != size) || ((bp->b_blkno + (dbsize * i)) != tbp->b_blkno) || ((tbp->b_npages + bp->b_npages) > (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { BUF_UNLOCK(tbp); splx(s); break; } /* * Ok, it's passed all the tests, * so remove it from the free list * and mark it busy. We will use it. */ bremfree(tbp); tbp->b_flags &= ~B_DONE; splx(s); } /* end of code for non-first buffers only */ /* check for latent dependencies to be handled */ if ((LIST_FIRST(&tbp->b_dep)) != NULL) buf_start(tbp); /* * If the IO is via the VM then we do some * special VM hackery. (yuck) */ if (tbp->b_flags & B_VMIO) { vm_page_t m; if (i != 0) { /* if not first buffer */ for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; if (m->flags & PG_BUSY) { bqrelse(tbp); goto finishcluster; } } } for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; vm_page_io_start(m); vm_object_pip_add(m->object, 1); if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages - 1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } } } bp->b_bcount += size; bp->b_bufsize += size; s = splbio(); bundirty(tbp); tbp->b_flags &= ~B_DONE; tbp->b_ioflags &= ~BIO_ERROR; tbp->b_flags |= B_ASYNC; tbp->b_iocmd = BIO_WRITE; reassignbuf(tbp, tbp->b_vp); /* put on clean list */ ++tbp->b_vp->v_numoutput; splx(s); BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); } finishcluster: pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *) bp->b_pages, bp->b_npages); if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, 
bp->b_kvasize); bp->b_kvasize = bp->b_bufsize; totalwritten += bp->b_bufsize; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bufsize; bawrite(bp); len -= i; } return totalwritten; } /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. */ static struct cluster_save * cluster_collectbufs(vp, last_bp) struct vnode *vp; struct buf *last_bp; { struct cluster_save *buflist; struct buf *bp; daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); buflist->bs_children[i] = bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buflist->bs_children[i] = bp = last_bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); buflist->bs_nchildren = i + 1; return (buflist); } Index: head/sys/kern/vfs_default.c =================================================================== --- head/sys/kern/vfs_default.c (revision 75579) +++ head/sys/kern/vfs_default.c (revision 75580) @@ -1,753 +1,745 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed * to Berkeley by John Heidemann of the UCLA Ficus project. * * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vop_nolookup __P((struct vop_lookup_args *)); static int vop_nostrategy __P((struct vop_strategy_args *)); /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. * * If there is no specific entry here, we will return EOPNOTSUPP. 
*
 */

/* Operation vector that VNODEOP_SET() below registers for the default table. */
vop_t **default_vnodeop_p;
/*
 * Default handler for each vnode operation listed.  The first entry maps
 * every operation without its own entry to vop_eopnotsupp.  The line
 * marked '-' is a removal carried by this diff.
 */
static struct vnodeopv_entry_desc default_vnodeop_entries[] = {
	{ &vop_default_desc,		(vop_t *) vop_eopnotsupp },
	{ &vop_advlock_desc,		(vop_t *) vop_einval },
-	{ &vop_bwrite_desc,		(vop_t *) vop_stdbwrite },
	{ &vop_close_desc,		(vop_t *) vop_null },
	{ &vop_createvobject_desc,	(vop_t *) vop_stdcreatevobject },
	{ &vop_destroyvobject_desc,	(vop_t *) vop_stddestroyvobject },
	{ &vop_fsync_desc,		(vop_t *) vop_null },
	{ &vop_getvobject_desc,		(vop_t *) vop_stdgetvobject },
	{ &vop_inactive_desc,		(vop_t *) vop_stdinactive },
	{ &vop_ioctl_desc,		(vop_t *) vop_enotty },
	{ &vop_islocked_desc,		(vop_t *) vop_noislocked },
	{ &vop_lease_desc,		(vop_t *) vop_null },
	{ &vop_lock_desc,		(vop_t *) vop_nolock },
	{ &vop_lookup_desc,		(vop_t *) vop_nolookup },
	{ &vop_open_desc,		(vop_t *) vop_null },
	{ &vop_pathconf_desc,		(vop_t *) vop_einval },
	{ &vop_poll_desc,		(vop_t *) vop_nopoll },
	{ &vop_readlink_desc,		(vop_t *) vop_einval },
	{ &vop_revoke_desc,		(vop_t *) vop_revoke },
	{ &vop_strategy_desc,		(vop_t *) vop_nostrategy },
	{ &vop_unlock_desc,		(vop_t *) vop_nounlock },
	{ NULL, NULL }
};

static struct vnodeopv_desc default_vnodeop_opv_desc =
        { &default_vnodeop_p, default_vnodeop_entries };

VNODEOP_SET(default_vnodeop_opv_desc);

/* Generic handler: always fails with EOPNOTSUPP. */
int
vop_eopnotsupp(struct vop_generic_args *ap)
{
	/*
	printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name);
	*/

	return (EOPNOTSUPP);
}

/* Generic handler: always fails with EBADF. */
int
vop_ebadf(struct vop_generic_args *ap)
{

	return (EBADF);
}

/* Generic handler: always fails with ENOTTY. */
int
vop_enotty(struct vop_generic_args *ap)
{

	return (ENOTTY);
}

/* Generic handler: always fails with EINVAL. */
int
vop_einval(struct vop_generic_args *ap)
{

	return (EINVAL);
}

/* Generic handler: does nothing and reports success. */
int
vop_null(struct vop_generic_args *ap)
{

	return (0);
}

/* Forward the call to the same operation slot of the default vector. */
int
vop_defaultop(struct vop_generic_args *ap)
{

	return (VOCALL(default_vnodeop_p, ap->a_desc->vdesc_offset, ap));
}

/* Generic handler: panics, naming the operation that was invoked. */
int
vop_panic(struct vop_generic_args *ap)
{

	panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name);
}

/* Default lookup: clears *a_vpp and fails with ENOTDIR. */
static int
vop_nolookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{

	*ap->a_vpp = NULL;
	return (ENOTDIR);
}

/*
 *	vop_nostrategy:
 *
 *	Strategy routine for VFS devices that have none.
 *
 *	BIO_ERROR and B_INVAL must be cleared prior to calling any strategy
 *	routine.  Typically this is done for a BIO_READ strategy call.
 *	Typically B_INVAL is assumed to already be clear prior to a write
 *	and should not be cleared manually unless you just made the buffer
 *	invalid.  BIO_ERROR should be cleared either way.
 *
 *	Logs the buffer and both vnodes, marks the buffer failed with
 *	EOPNOTSUPP, completes it with bufdone() and returns EOPNOTSUPP.
 */

static int
vop_nostrategy (struct vop_strategy_args *ap)
{
	printf("No strategy for buffer at %p\n", ap->a_bp);
	vprint("", ap->a_vp);
	vprint("", ap->a_bp->b_vp);
	ap->a_bp->b_ioflags |= BIO_ERROR;
	ap->a_bp->b_error = EOPNOTSUPP;
	bufdone(ap->a_bp);
	return (EOPNOTSUPP);
}

/*
 * Standard pathconf: stores the system constant for the names handled
 * below in *a_retval and returns 0; any other name returns EINVAL.
 */
int
vop_stdpathconf(ap)
	struct vop_pathconf_args /* {
	struct vnode *a_vp;
	int a_name;
	int *a_retval;
	} */ *ap;
{

	switch (ap->a_name) {
		case _PC_LINK_MAX:
			*ap->a_retval = LINK_MAX;
			return (0);
		case _PC_MAX_CANON:
			*ap->a_retval = MAX_CANON;
			return (0);
		case _PC_MAX_INPUT:
			*ap->a_retval = MAX_INPUT;
			return (0);
		case _PC_PIPE_BUF:
			*ap->a_retval = PIPE_BUF;
			return (0);
		case _PC_CHOWN_RESTRICTED:
			*ap->a_retval = 1;
			return (0);
		case _PC_VDISABLE:
			*ap->a_retval = _POSIX_VDISABLE;
			return (0);
		default:
			return (EINVAL);
	}
	/* NOTREACHED */
}

/*
 * Standard lock, unlock and islocked functions.
 *
 * These depend on the lock structure being the first element in the
 * inode, ie: vp->v_data points to the the lock!
*/
/*
 * Lock the vnode through its own v_lock, passing the caller's flags and
 * the vnode interlock to lockmgr().  With DEBUG_LOCKS the debug variant is
 * called with this function's name and the vnode's filename/line fields.
 */
int
vop_stdlock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

#ifndef	DEBUG_LOCKS
	return (lockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock, ap->a_p));
#else
	return (debuglockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock,
	    ap->a_p, "vop_stdlock", vp->filename, vp->line));
#endif
}

/* Release v_lock: the caller's flags with LK_RELEASE added. */
int
vop_stdunlock(ap)
	struct vop_unlock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	return (lockmgr(&vp->v_lock, ap->a_flags | LK_RELEASE, &vp->v_interlock,
	    ap->a_p));
}

/* Report lockstatus() of the vnode's v_lock for process a_p. */
int
vop_stdislocked(ap)
	struct vop_islocked_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	return (lockstatus(&ap->a_vp->v_lock, ap->a_p));
}

/* Default inactive: only unlocks the vnode; always returns 0. */
int
vop_stdinactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
	return (0);
}

/*
 * Return true for select/poll.
 */
int
vop_nopoll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int  a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (ap->a_events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Implement poll for local filesystems that support it.
 *
 * Requests containing only standard events are answered at once with the
 * POLLRDNORM/POLLWRNORM bits that were asked for.  Anything else is handed
 * to vn_pollrecord().  The lines marked '-' below are the removal of
 * vop_stdbwrite() carried by this diff.
 */
int
vop_stdpoll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int  a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	if ((ap->a_events & ~POLLSTANDARD) == 0)
		return (ap->a_events & (POLLRDNORM|POLLWRNORM));
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
-}
-
-int
-vop_stdbwrite(ap)
-	struct vop_bwrite_args *ap;
-{
-	return (bwrite(ap->a_bp));
}

/*
 * Stubs to use when there is no locking to be done on the underlying object.
* A minimal shared lock is necessary to ensure that the underlying object
 * is not revoked while an operation is in progress. So, an active shared
 * count is maintained in an auxillary vnode lock structure.
 */
/*
 * vop_sharedlock maps the requested lock type onto v_lock as follows:
 * LK_DRAIN stays LK_DRAIN; LK_SHARED and LK_EXCLUSIVE become LK_SHARED
 * (LK_EXCLUSIVE is kept when DEBUG_VFS_LOCKS is defined); upgrade and
 * downgrade requests return 0 without calling the lock manager; any other
 * type, LK_RELEASE included, panics.  LK_INTERLOCK is passed through.
 */
int
vop_sharedlock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	/*
	 * This code cannot be used until all the non-locking filesystems
	 * (notably NFS) are converted to properly lock and release nodes.
	 * Also, certain vnode operations change the locking state within
	 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
	 * and symlink). Ideally these operations should not change the
	 * lock state, but should be changed to let the caller of the
	 * function unlock them. Otherwise all intermediate vnode layers
	 * (such as union, umapfs, etc) must catch these functions to do
	 * the necessary locking at their layer. Note that the inactive
	 * and lookup operations also change their lock state, but this
	 * cannot be avoided, so these two operations will always need
	 * to be handled in intermediate layers.
	 */
	struct vnode *vp = ap->a_vp;
	int vnflags, flags = ap->a_flags;

	switch (flags & LK_TYPE_MASK) {
	case LK_DRAIN:
		vnflags = LK_DRAIN;
		break;
	case LK_EXCLUSIVE:
#ifdef DEBUG_VFS_LOCKS
		/*
		 * Normally, we use shared locks here, but that confuses
		 * the locking assertions.
		 */
		vnflags = LK_EXCLUSIVE;
		break;
#endif
		/* Without DEBUG_VFS_LOCKS this case continues into LK_SHARED. */
	case LK_SHARED:
		vnflags = LK_SHARED;
		break;
	case LK_UPGRADE:
	case LK_EXCLUPGRADE:
	case LK_DOWNGRADE:
		return (0);
	case LK_RELEASE:
	default:
		panic("vop_sharedlock: bad operation %d", flags & LK_TYPE_MASK);
	}
	if (flags & LK_INTERLOCK)
		vnflags |= LK_INTERLOCK;
#ifndef	DEBUG_LOCKS
	return (lockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_p));
#else
	return (debuglockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_p,
	    "vop_sharedlock", vp->filename, vp->line));
#endif
}

/*
 * Stubs to use when there is no locking to be done on the underlying object.
* A minimal shared lock is necessary to ensure that the underlying object
 * is not revoked while an operation is in progress. So, an active shared
 * count is maintained in an auxillary vnode lock structure.
 */
/*
 * As built (the "notyet" arm is compiled out), vop_nolock takes no lock.
 * It releases the vnode interlock when LK_INTERLOCK is set and returns 0.
 */
int
vop_nolock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
#ifdef notyet
	/*
	 * This code cannot be used until all the non-locking filesystems
	 * (notably NFS) are converted to properly lock and release nodes.
	 * Also, certain vnode operations change the locking state within
	 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
	 * and symlink). Ideally these operations should not change the
	 * lock state, but should be changed to let the caller of the
	 * function unlock them. Otherwise all intermediate vnode layers
	 * (such as union, umapfs, etc) must catch these functions to do
	 * the necessary locking at their layer. Note that the inactive
	 * and lookup operations also change their lock state, but this
	 * cannot be avoided, so these two operations will always need
	 * to be handled in intermediate layers.
	 */
	struct vnode *vp = ap->a_vp;
	int vnflags, flags = ap->a_flags;

	switch (flags & LK_TYPE_MASK) {
	case LK_DRAIN:
		vnflags = LK_DRAIN;
		break;
	case LK_EXCLUSIVE:
	case LK_SHARED:
		/* Exclusive requests are taken as shared locks here. */
		vnflags = LK_SHARED;
		break;
	case LK_UPGRADE:
	case LK_EXCLUPGRADE:
	case LK_DOWNGRADE:
		return (0);
	case LK_RELEASE:
	default:
		panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
	}
	if (flags & LK_INTERLOCK)
		vnflags |= LK_INTERLOCK;
	return(lockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_p));
#else /* for now */
	/*
	 * Since we are not using the lock manager, we must clear
	 * the interlock here.
	 */
	if (ap->a_flags & LK_INTERLOCK)
		mtx_unlock(&ap->a_vp->v_interlock);
	return (0);
#endif
}

/*
 * Do the inverse of vop_nolock, handling the interlock in a compatible way.
*/
int
vop_nounlock(ap)
	struct vop_unlock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{

	/*
	 * Since we are not using the lock manager, we must clear
	 * the interlock here.
	 */
	if (ap->a_flags & LK_INTERLOCK)
		mtx_unlock(&ap->a_vp->v_interlock);
	return (0);
}

/*
 * Return whether or not the node is in use.
 * This default always answers 0.
 */
int
vop_noislocked(ap)
	struct vop_islocked_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	return (0);
}

/*
 * Return our mount point, as we will take charge of the writes.
 * Stores a_vp->v_mount through a_mpp and returns 0.
 */
int
vop_stdgetwritemount(ap)
	struct vop_getwritemount_args /* {
		struct vnode *a_vp;
		struct mount **a_mpp;
	} */ *ap;
{

	*(ap->a_mpp) = ap->a_vp->v_mount;
	return (0);
}

/*
 * Make sure the vnode has a VM object.
 *
 * Returns 0 at once when the vnode is neither a disk nor VMIO-capable.
 * When there is no object yet, one is allocated with vnode_pager_alloc():
 * sized from VOP_GETATTR() for regular files and directories, or sized
 * IDX_TO_OFF(INT_MAX) for a device vnode that has a devsw entry.  Other
 * vnode types return without an object.  When an existing object is
 * OBJ_DEAD the vnode is unlocked, the caller sleeps on the object, relocks
 * and retries.  On success VOBJBUF is set in v_flag.
 *
 * Returns 0, or the error from VOP_GETATTR().
 */
int
vop_stdcreatevobject(ap)
	struct vop_createvobject_args /* {
		struct vnode *vp;
		struct ucred *cred;
		struct proc *p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct ucred *cred = ap->a_cred;
	struct proc *p = ap->a_p;
	struct vattr vat;
	vm_object_t object;
	int error = 0;

	if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
		return (0);

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG || vp->v_type == VDIR) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
		} else if (devsw(vp->v_rdev) != NULL) {
			/*
			 * This simply allocates the biggest object possible
			 * for a disk vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
		} else {
			goto retn;
		}
		/*
		 * Dereference the reference we just created.  This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vp->v_usecount--;
	} else {
		if (object->flags & OBJ_DEAD) {
			VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
	}

	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
	vp->v_flag |= VOBJBUF;

retn:
	return (error);
}

/*
 * Tear down the vnode's VM object, if it has one.  An object with no
 * references is terminated; otherwise its pager is deallocated.  Always
 * returns 0.
 */
int
vop_stddestroyvobject(ap)
	struct vop_destroyvobject_args /* {
		struct vnode *vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	vm_object_t obj = vp->v_object;

	if (vp->v_object == NULL)
		return (0);

	if (obj->ref_count == 0) {
		/*
		 * vclean() may be called twice. The first time
		 * removes the primary reference to the object,
		 * the second time goes one further and is a
		 * special-case to terminate the object.
		 */
		vm_object_terminate(obj);
	} else {
		/*
		 * Woe to the process that tries to page now :-).
		 */
		vm_pager_deallocate(obj);
	}
	return (0);
}

/*
 * Hand back the vnode's VM object through objpp when objpp is non-NULL.
 * Returns 0 when the vnode has an object and EINVAL when it has none.
 */
int
vop_stdgetvobject(ap)
	struct vop_getvobject_args /* {
		struct vnode *vp;
		struct vm_object **objpp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct vm_object **objpp = ap->a_objpp;

	if (objpp)
		*objpp = vp->v_object;
	return (vp->v_object ? 0 : EINVAL);
}

/*
 * vfs default ops
 * used to fill the vfs fucntion table to get reasonable default return values.
*/ int vfs_stdmount (mp, path, data, ndp, p) struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct proc *p; { return (0); } int vfs_stdunmount (mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { return (0); } int vfs_stdroot (mp, vpp) struct mount *mp; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdstatfs (mp, sbp, p) struct mount *mp; struct statfs *sbp; struct proc *p; { return (EOPNOTSUPP); } int vfs_stdvptofh (vp, fhp) struct vnode *vp; struct fid *fhp; { return (EOPNOTSUPP); } int vfs_stdstart (mp, flags, p) struct mount *mp; int flags; struct proc *p; { return (0); } int vfs_stdquotactl (mp, cmds, uid, arg, p) struct mount *mp; int cmds; uid_t uid; caddr_t arg; struct proc *p; { return (EOPNOTSUPP); } int vfs_stdsync (mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { return (0); } int vfs_stdvget (mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdfhtovp (mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdcheckexp (mp, nam, extflagsp, credanonp) struct mount *mp; struct sockaddr *nam; int *extflagsp; struct ucred **credanonp; { return (EOPNOTSUPP); } int vfs_stdinit (vfsp) struct vfsconf *vfsp; { return (0); } int vfs_stduninit (vfsp) struct vfsconf *vfsp; { return(0); } int vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, p) struct mount *mp; int cmd; struct vnode *filename_vp; int attrnamespace; const char *attrname; struct proc *p; { return(EOPNOTSUPP); } /* end of vfs default ops */ Index: head/sys/kern/vfs_export.c =================================================================== --- head/sys/kern/vfs_export.c (revision 75579) +++ head/sys/kern/vfs_export.c (revision 75580) @@ -1,3150 +1,3150 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 * $FreeBSD$ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void addalias __P((struct vnode *vp, dev_t nvp_rdev)); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, never decreased. */ static unsigned long numvnodes; SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Minimum number of free vnodes. If there are fewer than this free vnodes, * getnewvnode() will return a newly allocated vnode. */ static u_long wantfreevnodes = 25; SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes = 0; SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. 
*/ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); static int reassignbufloops; SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); static int reassignbufsortgood; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); static int reassignbufsortbad; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); /* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */ static int reassignbufmethod = 1; SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); #ifdef ENABLE_VFS_IOOPT /* See NOTES for a description of this setting. */ int vfs_ioopt = 0; SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; /* For any iteration/modification of mnt_vnodelist */ struct mtx mntvnode_mtx; /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. */ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* For any iteration/modification of vnode_free_list */ static struct mtx vnode_free_list_mtx; /* * For any iteration/modification of dev->si_hlist (linked through * v_specnext) */ static struct mtx spechash_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static vm_zone_t vnode_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive = 0; /* * The workitem queue. 
* * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno = 0; static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); time_t dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); 
/* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ static void vntblinit(void *dummy __unused) { desiredvnodes = maxproc + cnt.v_page_count / 4; mtx_init(&mountlist_mtx, "mountlist", MTX_DEF); mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF); mtx_init(&mntid_mtx, "mntid", MTX_DEF); mtx_init(&spechash_mtx, "spechash", MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF); vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(mp, flags, interlkp, p) struct mount *mp; int flags; struct mtx *interlkp; struct proc *p; { int lkflags; if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_kern_flag |= MNTK_MWAIT; /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. 
*/ msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0); return (ENOENT); } lkflags = LK_SHARED | LK_NOPAUSE; if (interlkp) lkflags |= LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, p) struct mount *mp; struct proc *p; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); } /* * Lookup a filesystem type, and if found allocate and initialize * a mount structure for it. * * Devname is usually updated by mount(8) after booting. */ int vfs_rootmountalloc(fstypename, devname, mpp) char *fstypename; char *devname; struct mount **mpp; { struct proc *p = curproc; /* XXX */ struct vfsconf *vfsp; struct mount *mp; if (fstypename == NULL) return (ENODEV); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstypename)) break; if (vfsp == NULL) return (ENODEV); mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, p); LIST_INIT(&mp->mnt_vnodelist); mp->mnt_vfc = vfsp; mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_flag = MNT_RDONLY; mp->mnt_vnodecovered = NULLVP; vfsp->vfc_refcount++; mp->mnt_iosize_max = DFLTPHYS; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_stat.f_mntonname[0] = '/'; mp->mnt_stat.f_mntonname[1] = 0; (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); *mpp = mp; return (0); } /* * Find an appropriate filesystem to use for the root. If a filesystem * has not been preselected, walk through the list of known filesystems * trying those that have mountroot routines, and try them until one * works or we have tried them all. 
*/ #ifdef notdef /* XXX JH */ int lite2_vfs_mountroot() { struct vfsconf *vfsp; extern int (*lite2_mountroot) __P((void)); int error; if (lite2_mountroot != NULL) return ((*lite2_mountroot)()); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { if (vfsp->vfc_mountroot == NULL) continue; if ((error = (*vfsp->vfc_mountroot)()) == 0) return (0); printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); } return (ENODEV); } #endif /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid) fsid_t *fsid; { register struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. 
*/ void vfs_getnewfsid(mp) struct mount *mp; { static u_int16_t mntid_base; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makeudev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if (vfs_getvfs(&tfsid) == NULL) break; } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. */ void vfs_timestamp(tsp) struct timespec *tsp; { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(vap) register struct vattr *vap; { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Routines having to do with the management of the vnode table. */ /* * Return the next vnode from the free list. 
*/ int getnewvnode(tag, mp, vops, vpp) enum vtagtype tag; struct mount *mp; vop_t **vops; struct vnode **vpp; { int s, count; struct proc *p = curproc; /* XXX */ struct vnode *vp = NULL; struct mount *vnmp; vm_object_t object; /* * We take the least recently used vnode from the freelist * if we can get it and it has no cached pages, and no * namecache entries are relative to it. * Otherwise we allocate a new vnode */ s = splbio(); mtx_lock(&vnode_free_list_mtx); if (wantfreevnodes && freevnodes < wantfreevnodes) { vp = NULL; } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { /* * XXX: this is only here to be backwards compatible */ vp = NULL; } else for (count = 0; count < freevnodes; count++) { vp = TAILQ_FIRST(&vnode_free_list); if (vp == NULL || vp->v_usecount) panic("getnewvnode: free vnode isn't"); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if active in the namecache or * if it still has cached pages or we cannot get * its interlock. */ if (LIST_FIRST(&vp->v_cache_src) != NULL || (VOP_GETVOBJECT(vp, &object) == 0 && (object->resident_page_count || object->ref_count)) || !mtx_trylock(&vp->v_interlock)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); vp = NULL; continue; } /* * Skip over it if its filesystem is being suspended. 
*/ if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) break; mtx_unlock(&vp->v_interlock); TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); vp = NULL; } if (vp) { vp->v_flag |= VDOOMED; vp->v_flag &= ~VFREE; freevnodes--; mtx_unlock(&vnode_free_list_mtx); cache_purge(vp); vp->v_lease = NULL; if (vp->v_type != VBAD) { vgonel(vp, p); } else { mtx_unlock(&vp->v_interlock); } vn_finished_write(vnmp); #ifdef INVARIANTS { int s; if (vp->v_data) panic("cleaned vnode isn't"); s = splbio(); if (vp->v_numoutput) panic("Clean vnode has pending I/O's"); splx(s); if (vp->v_writecount != 0) panic("Non-zero write count"); } #endif vp->v_flag = 0; vp->v_lastw = 0; vp->v_lasta = 0; vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; } else { mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) zalloc(vnode_zone); bzero((char *) vp, sizeof *vp); mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF); vp->v_dd = vp; mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF); cache_purge(vp); LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); numvnodes++; } TAILQ_INIT(&vp->v_cleanblkhd); TAILQ_INIT(&vp->v_dirtyblkhd); vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE); insmntque(vp, mp); *vpp = vp; vp->v_usecount = 1; vp->v_data = 0; splx(s); vfs_object_create(vp, p, p->p_ucred); return (0); } /* * Move a vnode from one mount queue to another. */ static void insmntque(vp, mp) register struct vnode *vp; register struct mount *mp; { mtx_lock(&mntvnode_mtx); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) LIST_REMOVE(vp, v_mntvnodes); /* * Insert into list of vnodes for the new mount point, if available. */ if ((vp->v_mount = mp) == NULL) { mtx_unlock(&mntvnode_mtx); return; } LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); mtx_unlock(&mntvnode_mtx); } /* * Update outstanding I/O count and do wakeup if requested. 
*/ void vwakeup(bp) register struct buf *bp; { register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp)) { vp->v_numoutput--; if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { register struct buf *bp; struct buf *nbp, *blist; int s, error; vm_object_t object; if (flags & V_SAVE) { s = splbio(); while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); if (error) { splx(s); return (error); } } if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { splx(s); if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) return (error); s = splbio(); if (vp->v_numoutput > 0 || !TAILQ_EMPTY(&vp->v_dirtyblkhd)) panic("vinvalbuf: dirty bufs"); } splx(s); } s = splbio(); for (;;) { blist = TAILQ_FIRST(&vp->v_cleanblkhd); if (!blist) blist = TAILQ_FIRST(&vp->v_dirtyblkhd); if (!blist) break; for (bp = blist; bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "vinvalbuf", slpflag, slptimeo); if (error == ENOLCK) break; splx(s); return (error); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. Note that vfs_bio_awrite expects - * buffers to reside on a queue, while VOP_BWRITE and + * buffers to reside on a queue, while BUF_WRITE and * brelse do not. 
*/ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { if (bp->b_vp == vp) { if (bp->b_flags & B_CLUSTEROK) { BUF_UNLOCK(bp); vfs_bio_awrite(bp); } else { bremfree(bp); bp->b_flags |= B_ASYNC; BUF_WRITE(bp); } } else { bremfree(bp); (void) BUF_WRITE(bp); } break; } bremfree(bp); bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); } splx(s); /* * Destroy the copy in the VM cache, too. */ mtx_lock(&vp->v_interlock); if (VOP_GETVOBJECT(vp, &object) == 0) { vm_object_page_remove(object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); } mtx_unlock(&vp->v_interlock); if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) panic("vinvalbuf: flush failed"); return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(vp, cred, p, length, blksize) register struct vnode *vp; struct ucred *cred; struct proc *p; off_t length; int blksize; { register struct buf *bp; struct buf *nbp; int s, anyfreed; int trunclbn; /* * Round up to the *next* lbn. 
*/ trunclbn = (length + blksize - 1) / blksize; s = splbio(); restart: anyfreed = 1; for (;anyfreed;) { anyfreed = 0; for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } } } for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } } } } if (length > 0) { restartsync: for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); if (bp->b_vp == vp) { bp->b_flags |= B_ASYNC; } else { bp->b_flags &= ~B_ASYNC; } BUF_WRITE(bp); } goto restartsync; } } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); } splx(s); vnode_pager_setsize(vp, length); return (0); } /* * Associate a buffer with a vnode. */ void bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { int s; KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); vhold(vp); bp->b_vp = vp; bp->b_dev = vn_todev(vp); /* * Insert onto list for new vnode. 
*/ s = splbio(); bp->b_xflags |= BX_VNCLEAN; bp->b_xflags &= ~BX_VNDIRTY; TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); splx(s); } /* * Disassociate a buffer from a vnode. */ void brelvp(bp) register struct buf *bp; { struct vnode *vp; struct buflists *listheadp; int s; KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; s = splbio(); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { if (bp->b_xflags & BX_VNDIRTY) listheadp = &vp->v_dirtyblkhd; else listheadp = &vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vp->v_flag &= ~VONWORKLST; LIST_REMOVE(vp, v_synclist); } splx(s); bp->b_vp = (struct vnode *) 0; vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct vnode *vp, int delay) { int s, slot; s = splbio(); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONWORKLST; splx(s); } struct proc *updateproc; static void sched_sync __P((void)); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) /* * System filesystem synchronizer daemon. */ void sched_sync(void) { struct synclist *slp; struct vnode *vp; struct mount *mp; long starttime; int s; struct proc *p = updateproc; mtx_lock(&Giant); EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_LAST); for (;;) { kthread_suspend_check(p); starttime = time_second; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. 
*/ s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { if (VOP_ISLOCKED(vp, NULL) == 0 && vn_start_write(vp, &mp, V_NOWAIT) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); vn_finished_write(mp); } s = splbio(); if (LIST_FIRST(slp) == vp) { /* * Note: v_tag VT_VFS vps can remain on the * worklist too with no dirty blocks, but * since sync_fsync() moves it to a different * slot we are safe. */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && !vn_isdisk(vp, NULL)) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); } /* * Do soft update processing. */ #ifdef SOFTUPDATES softdep_process_worklist(NULL); #endif /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (time_second == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. 
* We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 *
 * Returns 1 when rushjob was incremented, 0 when rushjob has already
 * reached syncdelay / 2.
 */
int
speedup_syncer()
{

	/* Make the syncer runnable if it is sleeping on lbolt. */
	mtx_lock_spin(&sched_lock);
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	mtx_unlock_spin(&sched_lock);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	/* XXX REMOVE ME */
	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

/*
 * Change the vnode a pager buffer is associated with.
 */
void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{

	KASSERT(bp->b_flags & B_PAGING,
	    ("pbreassignbuf() on non phys bp %p", bp));
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 *
 * A NULL newvp is reported with printf() and otherwise ignored.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		/* First dirty buffer: queue the vnode for the syncer. */
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
				if (newvp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			/* Negative block numbers go at the tail. */
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects:  NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		/* No dirty buffers remain: take newvp off the syncer worklist. */
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}

/*
 * Create a vnode for a device.
 * Used for mounting the root file system.
 *
 * Returns ENXIO (with *vpp set to NULLVP) when dev is NODEV, the
 * getnewvnode() error if allocation fails, and 0 otherwise.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	/* NOTE(review): presumably vfinddev() stores an existing VCHR vnode in *vpp -- confirm. */
	if (vfinddev(dev, VCHR, vpp))
		return (0);
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes need to be accumulated.  vcount() does that.
 *
 * Returns the vnode the caller should use from here on: nvp itself,
 * or the existing vnode that took over nvp's private data.
 */
struct vnode *
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{
	struct vnode *ovp;
	vop_t **ops;
	dev_t dev;

	/* VBLK vnodes are returned untouched. */
	if (nvp->v_type == VBLK)
		return (nvp);
	if (nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	dev = udev2dev(nvp_rdev, 0);
	/*
	 * Check to see if we have a bdevvp vnode with no associated
	 * filesystem. If so, we want to associate the filesystem of
	 * the new newly instigated vnode with the bdevvp vnode and
	 * discard the newly created vnode rather than leaving the
	 * bdevvp vnode lying around with no associated filesystem.
	 */
	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
		addalias(nvp, dev);
		return (nvp);
	}
	/*
	 * Discard unneeded vnode, but save its node specific data.
	 * Note that if there is a lock, it is carried over in the
	 * node specific data to the replacement vnode.
	 */
	vref(ovp);
	ovp->v_data = nvp->v_data;
	ovp->v_tag = nvp->v_tag;
	nvp->v_data = NULL;
	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
	if (nvp->v_vnlock)
		ovp->v_vnlock = &ovp->v_lock;
	/* Swap the operations vectors of the two vnodes. */
	ops = ovp->v_op;
	ovp->v_op = nvp->v_op;
	if (VOP_ISLOCKED(nvp, curproc)) {
		VOP_UNLOCK(nvp, 0, curproc);
		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc);
	}
	nvp->v_op = ops;
	insmntque(ovp, nvp->v_mount);
	vrele(nvp);
	vgone(nvp);
	return (ovp);
}

/* This is a local helper function that do the same as addaliasu, but for a
 * dev_t instead of an udev_t. */
static void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
	nvp->v_rdev = dev;
	/* dev->si_hlist is modified under spechash_mtx. */
	mtx_lock(&spechash_mtx);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	mtx_unlock(&spechash_mtx);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
*/ if ((flags & LK_INTERLOCK) == 0) mtx_lock(&vp->v_interlock); if (vp->v_flag & VXLOCK) { if (vp->v_vxproc == curproc) { printf("VXLOCK interlock avoided\n"); } else { vp->v_flag |= VXWANT; msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, "vget", 0); return (ENOENT); } } vp->v_usecount++; if (VSHOULDBUSY(vp)) vbusy(vp); if (flags & LK_TYPE_MASK) { if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) { /* * must expand vrele here because we do not want * to call VOP_INACTIVE if the reference count * drops back to zero since it was never really * active. We must remove it from the free list * before sleeping so that multiple processes do * not try to recycle it. */ mtx_lock(&vp->v_interlock); vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); mtx_unlock(&vp->v_interlock); } return (error); } mtx_unlock(&vp->v_interlock); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { mtx_lock(&vp->v_interlock); vp->v_usecount++; mtx_unlock(&vp->v_interlock); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); mtx_lock(&vp->v_interlock); KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close")); if (vp->v_usecount > 1) { vp->v_usecount--; mtx_unlock(&vp->v_interlock); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { VOP_INACTIVE(vp, p); } } else { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); mtx_unlock(&vp->v_interlock); #endif panic("vrele: negative ref cnt"); } } /* * Release an already locked vnode. 
This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() aquires the lock internally.) */ void vput(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vput: null vp")); mtx_lock(&vp->v_interlock); KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close")); if (vp->v_usecount > 1) { vp->v_usecount--; VOP_UNLOCK(vp, LK_INTERLOCK, p); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ mtx_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); } else { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } } /* * Somebody doesn't want the vnode recycled. */ void vhold(vp) register struct vnode *vp; { int s; s = splbio(); vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); splx(s); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(vp) register struct vnode *vp; { int s; s = splbio(); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt"); vp->v_holdcnt--; if (VSHOULDFREE(vp)) vfree(vp); splx(s); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. 
*/ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, skipvp, flags) struct mount *mp; struct vnode *skipvp; int flags; { struct proc *p = curproc; /* XXX */ struct vnode *vp, *nvp; int busy = 0; mtx_lock(&mntvnode_mtx); loop: for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). */ if (vp->v_mount != mp) goto loop; nvp = LIST_NEXT(vp, v_mntvnodes); /* * Skip over a selected vnode. */ if (vp == skipvp) continue; mtx_lock(&vp->v_interlock); /* * Skip over a vnodes marked VSYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { mtx_unlock(&vp->v_interlock); continue; } /* * If WRITECLOSE is set, only flush out regular file vnodes * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) { mtx_unlock(&vp->v_interlock); continue; } /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. */ if (vp->v_usecount == 0) { mtx_unlock(&mntvnode_mtx); vgonel(vp, p); mtx_lock(&mntvnode_mtx); continue; } /* * If FORCECLOSE is set, forcibly close the vnode. For block * or character devices, revert to an anonymous device. For * all other files, just kill them. */ if (flags & FORCECLOSE) { mtx_unlock(&mntvnode_mtx); if (vp->v_type != VCHR) { vgonel(vp, p); } else { vclean(vp, 0, p); vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *) 0); } mtx_lock(&mntvnode_mtx); continue; } #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif mtx_unlock(&vp->v_interlock); busy++; } mtx_unlock(&mntvnode_mtx); if (busy) return (EBUSY); return (0); } /* * Disassociate the underlying file system from a vnode. */ static void vclean(vp, flags, p) struct vnode *vp; int flags; struct proc *p; { int active; /* * Check to see if the vnode is in use. 
If so we have to reference it * before we clean it out so that its count cannot fall to zero and * generate a race against ourselves to recycle it. */ if ((active = vp->v_usecount)) vp->v_usecount++; /* * Prevent the vnode from being recycled or brought into use while we * clean it out. */ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; vp->v_vxproc = curproc; /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK * ensures that the VOP_INACTIVE routine is done with its work. * For active vnodes, it ensures that no other activity can * occur while the underlying object is being cleaned out. */ VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ if (flags & DOCLOSE) { if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) (void) vn_write_suspend_wait(vp, NULL, V_WAIT); if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0) vinvalbuf(vp, 0, NOCRED, p, 0, 0); } VOP_DESTROYVOBJECT(vp); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. Note that the * VOP_INACTIVE will unlock the vnode. */ if (active) { if (flags & DOCLOSE) VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); VOP_INACTIVE(vp, p); } else { /* * Any other processes trying to obtain this lock must first * wait for VXLOCK to clear, then call the new lock operation. */ VOP_UNLOCK(vp, 0, p); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, p)) panic("vclean: cannot reclaim"); if (active) { /* * Inline copy of vrele() since VOP_INACTIVE * has already been called. 
*/ mtx_lock(&vp->v_interlock); if (--vp->v_usecount <= 0) { #ifdef DIAGNOSTIC if (vp->v_usecount < 0 || vp->v_writecount != 0) { vprint("vclean: bad ref count", vp); panic("vclean: ref cnt"); } #endif vfree(vp); } mtx_unlock(&vp->v_interlock); } cache_purge(vp); vp->v_vnlock = NULL; lockdestroy(&vp->v_lock); if (VSHOULDFREE(vp)) vfree(vp); /* * Done with purge, notify sleepers of the grim news. */ vp->v_op = dead_vnodeop_p; vn_pollgone(vp); vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; vp->v_vxproc = NULL; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); } } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ int vop_revoke(ap) struct vop_revoke_args /* { struct vnode *a_vp; int a_flags; } */ *ap; { struct vnode *vp, *vq; dev_t dev; KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); vp = ap->a_vp; /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, "vop_revokeall", 0); return (0); } dev = vp->v_rdev; for (;;) { mtx_lock(&spechash_mtx); vq = SLIST_FIRST(&dev->si_hlist); mtx_unlock(&spechash_mtx); if (!vq) break; vgone(vq); } return (0); } /* * Recycle an unused vnode to the front of the free list. * Release the passed interlock if the vnode will be recycled. */ int vrecycle(vp, inter_lkp, p) struct vnode *vp; struct mtx *inter_lkp; struct proc *p; { mtx_lock(&vp->v_interlock); if (vp->v_usecount == 0) { if (inter_lkp) { mtx_unlock(inter_lkp); } vgonel(vp, p); return (1); } mtx_unlock(&vp->v_interlock); return (0); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(vp) register struct vnode *vp; { struct proc *p = curproc; /* XXX */ mtx_lock(&vp->v_interlock); vgonel(vp, p); } /* * vgone, with the vp interlock held. 
 *
 * The interlock is released before returning on every path.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		/* PDROP: msleep() returns with the interlock released. */
		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
		    "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	mtx_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
		mtx_lock(&spechash_mtx);
		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		mtx_unlock(&spechash_mtx);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the
	 * VDOOMED flag and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		mtx_lock(&vnode_free_list_mtx);
		/* Only count the vnode as newly free if it was not listed yet. */
		if (vp->v_flag & VFREE)
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		mtx_unlock(&vnode_free_list_mtx);
		splx(s);
	}

	vp->v_type = VBAD;
	mtx_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
*/ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { struct vnode *vp; mtx_lock(&spechash_mtx); SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { if (type == vp->v_type) { *vpp = vp; mtx_unlock(&spechash_mtx); return (1); } } mtx_unlock(&spechash_mtx); return (0); } /* * Calculate the total number of references to a special device. */ int vcount(vp) struct vnode *vp; { struct vnode *vq; int count; count = 0; mtx_lock(&spechash_mtx); SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) count += vq->v_usecount; mtx_unlock(&spechash_mtx); return (count); } /* * Same as above, but using the dev_t as argument */ int count_dev(dev) dev_t dev; { struct vnode *vp; vp = SLIST_FIRST(&dev->si_hlist); if (vp == NULL) return (0); return(vcount(vp)); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; struct vnode *vp; { char buf[96]; if (label != NULL) printf("%s: %p: ", label, (void *)vp); else printf("%p: ", (void *)vp); printf("type %s, usecount %d, writecount %d, refcount %d,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VDOOMED) strcat(buf, "|VDOOMED"); if (vp->v_flag & VFREE) strcat(buf, "|VFREE"); if (vp->v_flag & VOBJBUF) strcat(buf, "|VOBJBUF"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. 
*/ DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *vp; printf("Locked vnodes\n"); mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { if (VOP_ISLOCKED(vp, NULL)) vprint((char *)0, vp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, p); } mtx_unlock(&mountlist_mtx); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS)); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif /* XXX the below code does not compile; vfs_sysctl does not exist. 
*/ #ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) return (EOPNOTSUPP); return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #if COMPILING_LINT #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. 
*/ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } again: mtx_lock(&mntvnode_mtx); for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { /* * Check that the vp is still associated with * this filesystem. RACE: could have been * recycled onto the same filesystem. */ if (vp->v_mount != mp) { mtx_unlock(&mntvnode_mtx); goto again; } nvp = LIST_NEXT(vp, v_mntvnodes); mtx_unlock(&mntvnode_mtx); if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); mtx_lock(&mntvnode_mtx); } mtx_unlock(&mntvnode_mtx); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, p); } mtx_unlock(&mountlist_mtx); return (0); } /* * XXX * Exporting the vnode list on large systems causes them to crash. * Exporting the vnode list on medium systems causes sysctl to coredump. */ SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); #endif /* * Check to see if a filesystem is mounted on a block device. */ int vfs_mountedon(vp) struct vnode *vp; { if (vp->v_rdev->si_mountpoint != NULL) return (EBUSY); return (0); } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp; struct proc *p; int error; if (curproc != NULL) p = curproc; else p = initproc; /* XXX XXX should this be proc0? */ /* * Since this only runs when rebooting, it is not interlocked. 
*/ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, p); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } else { /* The unmount has removed mp from the mountlist */ } } } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. */ static int vfs_hang_addrlist(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; bzero(&np->netc_anon, sizeof(np->netc_anon)); np->netc_anon.cr_uid = argp->ex_anon.cr_uid; np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = 
dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; bzero(&np->netc_anon, sizeof(np->netc_anon)); np->netc_anon.cr_uid = argp->ex_anon.cr_uid; np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* Helper for vfs_free_addrlist. */ /* ARGSUSED */ static int vfs_free_netcred(rn, w) struct radix_node *rn; void *w; { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. */ static void vfs_free_addrlist(nep) struct netexport *nep; { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } /* * High level function to manipulate export options on a mount point * and the passed in netexport. 
* Struct export_args *argp is the variable used to twiddle options, * the structure is described in sys/mount.h */ int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { if (mp->mnt_flag & MNT_EXPUBLIC) { vfs_setpublicfs(NULL, NULL, NULL); mp->mnt_flag &= ~MNT_EXPUBLIC; } vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if (argp->ex_flags & MNT_EXPUBLIC) { if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) return (error); mp->mnt_flag |= MNT_EXPUBLIC; } if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } /* * Set the publicly exported filesystem (WebNFS). Currently, only * one public filesystem is possible in the spec (RFC 2054 and 2055) */ int vfs_setpublicfs(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; struct vnode *rvp; char *cp; /* * mp == NULL -> invalidate the current info, the FS is * no longer exported. May be called from either vfs_export * or unmount, so check if it hasn't already been done. */ if (mp == NULL) { if (nfs_pub.np_valid) { nfs_pub.np_valid = 0; if (nfs_pub.np_index != NULL) { FREE(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL; } } return (0); } /* * Only one allowed at a time. */ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) return (EBUSY); /* * Get real filehandle for root of exported FS. */ bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; if ((error = VFS_ROOT(mp, &rvp))) return (error); if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) return (error); vput(rvp); /* * If an indexfile was specified, pull it in. 
*/ if (argp->ex_indexfile != NULL) { MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, M_WAITOK); error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, MAXNAMLEN, (size_t *)0); if (!error) { /* * Check for illegal filenames. */ for (cp = nfs_pub.np_index; *cp; cp++) { if (*cp == '/') { error = EINVAL; break; } } } if (error) { FREE(nfs_pub.np_index, M_TEMP); return (error); } } nfs_pub.np_mount = mp; nfs_pub.np_valid = 1; return (0); } /* * Used by the filesystems to determine if a given network address * (passed in 'nam') is present in thier exports list, returns a pointer * to struct netcred so that the filesystem can examine it for * access rights (read/write/etc). */ struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct sockaddr *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = nam; rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; struct vm_object *obj; int anyio, tries; tries = 5; loop: anyio = 0; for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { nvp = LIST_NEXT(vp, v_mntvnodes); if (vp->v_mount != mp) { goto loop; } if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? 
*/ continue; if (flags != MNT_WAIT) { if (VOP_GETVOBJECT(vp, &obj) != 0 || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) continue; if (VOP_ISLOCKED(vp, NULL)) continue; } mtx_lock(&vp->v_interlock); if (VOP_GETVOBJECT(vp, &obj) == 0 && (obj->flags & OBJ_MIGHTBEDIRTY)) { if (!vget(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { if (VOP_GETVOBJECT(vp, &obj) == 0) { vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); anyio = 1; } vput(vp); } } else { mtx_unlock(&vp->v_interlock); } } if (anyio && (--tries > 0)) goto loop; } /* * Create the VM object needed for VMIO and mmap support. This * is done for all VREG files in the system. Some filesystems might * afford the additional metadata buffering capability of the * VMIO code by making the device node be VMIO mode also. * * vp must be locked when vfs_object_create is called. */ int vfs_object_create(vp, p, cred) struct vnode *vp; struct proc *p; struct ucred *cred; { return (VOP_CREATEVOBJECT(vp, cred, p)); } /* * Mark a vnode as free, putting it up for recycling. */ void vfree(vp) struct vnode *vp; { int s; s = splbio(); mtx_lock(&vnode_free_list_mtx); KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; mtx_unlock(&vnode_free_list_mtx); vp->v_flag &= ~VAGE; vp->v_flag |= VFREE; splx(s); } /* * Opposite of vfree() - mark a vnode as in use. */ void vbusy(vp) struct vnode *vp; { int s; s = splbio(); mtx_lock(&vnode_free_list_mtx); KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; mtx_unlock(&vnode_free_list_mtx); vp->v_flag &= ~(VFREE|VAGE); splx(s); } /* * Record a process's interest in events which might happen to * a vnode. 
Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(vp, p, events) struct vnode *vp; struct proc *p; short events; { mtx_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo.vpi_revents; vp->v_pollinfo.vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo.vpi_lock); return events; } vp->v_pollinfo.vpi_events |= events; selrecord(p, &vp->v_pollinfo.vpi_selinfo); mtx_unlock(&vp->v_pollinfo.vpi_lock); return 0; } /* * Note the occurrence of an event. If the VN_POLLEVENT macro is used, * it is possible for us to miss an event due to race conditions, but * that condition is expected to be rare, so for the moment it is the * preferred interface. */ void vn_pollevent(vp, events) struct vnode *vp; short events; { mtx_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_events & events) { /* * We clear vpi_events so that we don't * call selwakeup() twice if two events are * posted before the polling process(es) is * awakened. This also ensures that we take at * most one selwakeup() if the polling process * is no longer interested. However, it does * mean that only one event can be noticed at * a time. (Perhaps we should only clear those * event bits which we note?) XXX */ vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ vp->v_pollinfo.vpi_revents |= events; selwakeup(&vp->v_pollinfo.vpi_selinfo); } mtx_unlock(&vp->v_pollinfo.vpi_lock); } #define VN_KNOTE(vp, b) \ KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b)) /* * Wake up anyone polling on vp because it is being revoked. 
* This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	mtx_lock(&vp->v_pollinfo.vpi_lock);
	/* The revoke note is posted whether or not anyone is polling. */
	VN_KNOTE(vp, NOTE_REVOKE);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	mtx_unlock(&vp->v_pollinfo.vpi_lock);
}



/*
 * Routine to create and manage a filesystem syncer vnode.
 *
 * Operations given as macros below are mapped onto generic routines
 * (nullop, vop_nolock, vop_nounlock, vop_noislocked); the remaining ones
 * are implemented in this file.
 */
#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
static int	sync_fsync __P((struct  vop_fsync_args *));
static int	sync_inactive __P((struct  vop_inactive_args *));
static int	sync_reclaim  __P((struct  vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

/* Operations vector for syncer vnodes. */
static vop_t **sync_vnodeop_p;
/* Any operation not listed here falls through to vop_eopnotsupp. */
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
*/ int vfs_allocate_syncvnode(mp) struct mount *mp; { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct proc *p = ap->a_p; int asyncflag; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ vn_syncer_add_to_worklist(syncvp, syncdelay); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp, p); return (0); } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; vn_finished_write(mp); vfs_unbusy(mp, p); return (0); } /* * The syncer vnode is no referenced. 
*/ static int sync_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected at splbio(). */ static int sync_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; int s; s = splbio(); vp->v_mount->mnt_syncer = NULL; if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); vp->v_flag &= ~VONWORKLST; } splx(s); return (0); } /* * Print out a syncer vnode. */ static int sync_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; printf("syncer vnode"); if (vp->v_vnlock != NULL) lockmgr_printinfo(vp->v_vnlock); printf("\n"); return (0); } /* * extract the dev_t from a VCHR */ dev_t vn_todev(vp) struct vnode *vp; { if (vp->v_type != VCHR) return (NODEV); return (vp->v_rdev); } /* * Check if vnode represents a disk device */ int vn_isdisk(vp, errp) struct vnode *vp; int *errp; { struct cdevsw *cdevsw; if (vp->v_type != VCHR) { if (errp != NULL) *errp = ENOTBLK; return (0); } if (vp->v_rdev == NULL) { if (errp != NULL) *errp = ENXIO; return (0); } cdevsw = devsw(vp->v_rdev); if (cdevsw == NULL) { if (errp != NULL) *errp = ENXIO; return (0); } if (!(cdevsw->d_flags & D_DISK)) { if (errp != NULL) *errp = ENOTBLK; return (0); } if (errp != NULL) *errp = 0; return (1); } /* * Free data allocated by namei(); see namei(9) for details. 
*/
void
NDFREE(ndp, flags)
     struct nameidata *ndp;
     const uint flags;
{
	/*
	 * Each step below is suppressed by its NDF_NO_* bit in flags and
	 * otherwise runs only if the lookup state says there is something
	 * to release.
	 */

	/* Pathname buffer. */
	if (!(flags & NDF_NO_FREE_PNBUF) &&
	    (ndp->ni_cnd.cn_flags & HASBUF)) {
		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
		ndp->ni_cnd.cn_flags &= ~HASBUF;
	}
	/* Parent directory lock; skipped when parent and leaf are the same vnode. */
	if (!(flags & NDF_NO_DVP_UNLOCK) &&
	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
	    ndp->ni_dvp != ndp->ni_vp)
		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
	/* Parent directory reference. */
	if (!(flags & NDF_NO_DVP_RELE) &&
	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
		vrele(ndp->ni_dvp);
		ndp->ni_dvp = NULL;
	}
	/* Leaf lock. */
	if (!(flags & NDF_NO_VP_UNLOCK) &&
	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
	/* Leaf reference. */
	if (!(flags & NDF_NO_VP_RELE) &&
	    ndp->ni_vp) {
		vrele(ndp->ni_vp);
		ndp->ni_vp = NULL;
	}
	/* Start directory reference. */
	if (!(flags & NDF_NO_STARTDIR_RELE) &&
	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
		vrele(ndp->ni_startdir);
		ndp->ni_startdir = NULL;
	}
}

/*
 * Common file system object access control check routine.  Accepts a
 * vnode's type, "mode", uid and gid, requested access mode, credentials,
 * and optional call-by-reference privused argument allowing vaccess()
 * to indicate to the caller whether privilege was used to satisfy the
 * request.  Returns 0 on success, or an errno on failure.
 *
 * Exactly one permission class is consulted: owner if the uid matches,
 * else group if the credential is a member, else other.  The `type'
 * argument is not examined by this routine.
 */
int
vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
	enum vtype type;
	mode_t file_mode;
	uid_t file_uid;
	gid_t file_gid;
	mode_t acc_mode;
	struct ucred *cred;
	int *privused;
{
	mode_t dac_granted;
#ifdef CAPABILITIES
	mode_t cap_granted;
#endif

	/*
	 * Look for a normal, non-privileged way to access the file/directory
	 * as requested.  If it exists, go with that.
	 */

	if (privused != NULL)
		*privused = 0;

	dac_granted = 0;

	/* Check the owner. */
	if (cred->cr_uid == file_uid) {
		/* Only the owner class is granted VADMIN. */
		dac_granted |= VADMIN;
		if (file_mode & S_IXUSR)
			dac_granted |= VEXEC;
		if (file_mode & S_IRUSR)
			dac_granted |= VREAD;
		if (file_mode & S_IWUSR)
			dac_granted |= VWRITE;

		if ((acc_mode & dac_granted) == acc_mode)
			return (0);

		goto privcheck;
	}

	/* Otherwise, check the groups (first match) */
	if (groupmember(file_gid, cred)) {
		if (file_mode & S_IXGRP)
			dac_granted |= VEXEC;
		if (file_mode & S_IRGRP)
			dac_granted |= VREAD;
		if (file_mode & S_IWGRP)
			dac_granted |= VWRITE;

		if ((acc_mode & dac_granted) == acc_mode)
			return (0);

		goto privcheck;
	}

	/* Otherwise, check everyone else. */
	if (file_mode & S_IXOTH)
		dac_granted |= VEXEC;
	if (file_mode & S_IROTH)
		dac_granted |= VREAD;
	if (file_mode & S_IWOTH)
		dac_granted |= VWRITE;
	if ((acc_mode & dac_granted) == acc_mode)
		return (0);

privcheck:
	/* suser_xxx() returning 0 grants the whole request via privilege. */
	if (!suser_xxx(cred, NULL, PRISON_ROOT)) {
		/* XXX audit: privilege used */
		if (privused != NULL)
			*privused = 1;
		return (0);
	}

#ifdef CAPABILITIES
	/*
	 * Build a capability mask to determine if the set of capabilities
	 * satisfies the requirements when combined with the granted mask
	 * from above.
	 * For each capability, if the capability is required, bitwise
	 * or the request type onto the cap_granted mask.
	 */
	cap_granted = 0;
	if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
	    !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
	    cap_granted |= VEXEC;

	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
	    !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
		cap_granted |= VREAD;

	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
	    !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
		cap_granted |= VWRITE;

	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
	    !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT))
		cap_granted |= VADMIN;

	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
		/* XXX audit: privilege used */
		if (privused != NULL)
			*privused = 1;
		return (0);
	}
#endif

	/* A denied request that asked for VADMIN yields EPERM, otherwise EACCES. */
	return ((acc_mode & VADMIN) ?
EPERM : EACCES); } Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 75579) +++ head/sys/kern/vfs_subr.c (revision 75580) @@ -1,3150 +1,3150 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 * $FreeBSD$ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void addalias __P((struct vnode *vp, dev_t nvp_rdev)); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, never decreased. */ static unsigned long numvnodes; SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Minimum number of free vnodes. 
If there are fewer than this free vnodes,
 * getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes = 0;
SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW,
    &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW,
    &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW,
    &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW,
    &reassignbufsortbad, 0, "");
/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW,
    &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
/* See NOTES for a description of this setting. */
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;

/* For any iteration/modification of mnt_vnodelist */
struct mtx mntvnode_mtx;

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/* For any iteration/modification of vnode_free_list */
static struct mtx vnode_free_list_mtx;

/*
 * For any iteration/modification of dev->si_hlist (linked through
 * v_specnext)
 */
static struct mtx spechash_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static vm_zone_t vnode_zone;

/* Set to 1 to print out reclaim of active vnodes */
int	prtactive = 0;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, mounted on block devices
 * are delayed only about a half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 * NOTE(review): the sentence beginning "Thus, mounted on block devices"
 * appears to be missing its subject, and the "half" / "third" ratios do
 * not match the defaults below (metadelay 28, dirdelay 29, filedelay
 * 30) -- confirm the intended wording.
 */
static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
/*
 * NOTE(review): the three *delay knobs below are declared time_t but
 * exported with SYSCTL_INT -- confirm the two types have the same size
 * on every supported platform.
 */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  (It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.)
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
*/
static void
vntblinit(void *dummy __unused)
{

	/* Default vnode target: one per process plus one per four pages. */
	desiredvnodes = maxproc + cnt.v_page_count / 4;
	mtx_init(&mountlist_mtx, "mountlist", MTX_DEF);
	mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF);
	mtx_init(&mntid_mtx, "mntid", MTX_DEF);
	mtx_init(&spechash_mtx, "spechash", MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 *
	 * hashinit() fills in syncer_mask; syncer_maxdelay is then
	 * recomputed from that mask so the two stay consistent.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
		&syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)


/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 *
 * Returns 0 once a shared lock on mp->mnt_lock is held, or ENOENT when
 * the mount is flagged MNTK_UNMOUNT.  In that case the call returns
 * immediately with LK_NOWAIT; otherwise it first sleeps on mp.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct mtx *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0);
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	/* Hand the interlock to lockmgr() only when the caller supplied one. */
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 *
 * Releases the mnt_lock hold taken by vfs_busy().
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
*/
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	/* ENODEV covers both a missing name and an unknown filesystem type. */
	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	/* M_ZERO: every field not set below starts out zeroed. */
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	/* The new mount is returned busied; the result is deliberately ignored. */
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	/*
	 * NOTE(review): strncpy() does not NUL-terminate when the source
	 * fills all MFSNAMELEN bytes -- confirm vfc_name is always
	 * terminated within that length.
	 */
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	/* The mount-on name is "/". */
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root.  If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 *
 * This routine is compiled out (#ifdef notdef).
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	/* A preselected mountroot routine takes precedence. */
	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
*/ struct mount * vfs_getvfs(fsid) fsid_t *fsid; { register struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(mp) struct mount *mp; { static u_int16_t mntid_base; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makeudev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if (vfs_getvfs(&tfsid) == NULL) break; } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. 
*/
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	/* Any value outside the named TSP_* cases behaves like TSP_NSEC. */
	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 *
 * Two fields are exceptions: va_type is set to VNON and va_vaflags to 0.
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Return the next vnode from the free list.
 *
 * The vnode is either recycled from vnode_free_list or freshly
 * allocated from vnode_zone.  It is stored in *vpp with v_usecount 1,
 * type VNON, the given tag and operations vector, and queued on mp.
 * This routine always returns 0.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s, count;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp = NULL;
	struct mount *vnmp;
	vm_object_t object;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode
	 */

	s = splbio();
	mtx_lock(&vnode_free_list_mtx);

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else for (count = 0; count < freevnodes; count++) {
		/* Rejected candidates go back on the tail, so at most one pass is made. */
		vp = TAILQ_FIRST(&vnode_free_list);
		if (vp == NULL || vp->v_usecount)
			panic("getnewvnode: free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);

		/*
		 * Don't recycle if active in the namecache or
		 * if it still has cached pages or we cannot get
		 * its interlock.
		 */
		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
		    (VOP_GETVOBJECT(vp, &object) == 0 &&
		     (object->resident_page_count || object->ref_count)) ||
		    !mtx_trylock(&vp->v_interlock)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			vp = NULL;
			continue;
		}
		/*
		 * Skip over it if its filesystem is being suspended.
		 */
		if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
			break;
		mtx_unlock(&vp->v_interlock);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		vp = NULL;
	}
	if (vp) {
		/* Recycle path: vp's interlock is held from the loop above. */
		vp->v_flag |= VDOOMED;
		vp->v_flag &= ~VFREE;
		freevnodes--;
		mtx_unlock(&vnode_free_list_mtx);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			mtx_unlock(&vp->v_interlock);
		}
		vn_finished_write(vnmp);

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
			if (vp->v_writecount != 0)
				panic("Non-zero write count");
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	} else {
		/*
		 * Allocation path.
		 * NOTE(review): the zalloc() result is used without a NULL
		 * check -- confirm it cannot fail here.
		 */
		mtx_unlock(&vnode_free_list_mtx);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF);
		vp->v_dd = vp;
		mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF);
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	/* Initialization common to recycled and newly allocated vnodes. */
	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;

	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 *
 * A NULL mp only removes the vnode from its current mount's list.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	mtx_lock(&mntvnode_mtx);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		mtx_unlock(&mntvnode_mtx);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	mtx_unlock(&mntvnode_mtx);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 *
 * B_WRITEINPROG is cleared on the buffer in every case; the count and
 * wakeup are handled only when the buffer has a vnode attached.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		/* Wake sleepers on v_numoutput once the last write has drained. */
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
*/
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	/*
	 * With V_SAVE, wait for writes in flight and fsync the vnode
	 * before anything is invalidated.
	 */
	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
  	}
	s = splbio();
	/* Rescan from the list heads until both buffer lists are empty. */
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				/*
				 * The buffer is busy: sleep for its lock.
				 * ENOLCK restarts the outer scan; any other
				 * result is returned to the caller.
				 */
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while BUF_WRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
				(flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						BUF_WRITE(bp);
					}
				} else {
					bremfree(bp);
					(void) BUF_WRITE(bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	/* Wait for any writes started above to drain. */
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	mtx_lock(&vp->v_interlock);
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		vm_object_page_remove(object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
	}
	mtx_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * Always returns 0.  The cred and p arguments are not used here.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	/* Keep sweeping both lists until a full pass frees nothing. */
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				/* Restart if the saved successor no longer belongs on this list. */
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	/*
	 * For a non-zero length, write out delayed-write buffers that
	 * have a negative logical block number.
	 */
	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					BUF_WRITE(bp);
				}
				goto restartsync;
			}

		}
	}

	/* Wait for outstanding writes before the pager is resized. */
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 *
 * The buffer must not already have a vnode (asserted).  A hold is taken
 * on vp and the buffer is appended to the vnode's clean list.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 *
 * The buffer must have a vnode (asserted).  The hold taken by bgetvp()
 * is dropped with vdrop().
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	/* With no dirty buffers left, the vnode comes off the syncer worklist. */
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * Add an item to the syncer work queue.
*/
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	/* Already queued: unlink first so the vnode sits on one slot only. */
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	/* Clamp the delay, then pick the slot relative to the current one. */
	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}

struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 *
 * Each iteration takes the current worklist slot, advances
 * syncer_delayno (wrapping at syncer_maxdelay), and issues a lazy
 * VOP_FSYNC on every vnode in the slot.  It never returns.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	struct mount *mp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	mtx_lock(&Giant);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kthread_suspend_check(p);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			/* Only sync vnodes that are unlocked and writable now. */
			if (VOP_ISLOCKED(vp, NULL) == 0 &&
			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
				vn_finished_write(mp);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
#ifdef SOFTUPDATES
		softdep_process_worklist(NULL);
#endif

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 *
 * Returns 1 if rushjob was incremented, 0 if it was already at the
 * syncdelay / 2 limit.
 */
int
speedup_syncer()
{

	/* Wake the syncer if it is sleeping on lbolt (its pacing sleep). */
	mtx_lock_spin(&sched_lock);
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	mtx_unlock_spin(&sched_lock);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
*/
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	/* XXX REMOVE ME */
	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

/*
 * Change the vnode a pager buffer is associated with.
 * The buffer must have B_PAGING set.
 */
void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{

	KASSERT(bp->b_flags & B_PAGING,
	    ("pbreassignbuf() on non phys bp %p", bp));
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 *
 * The buffer is removed from its current vnode list (if any) and put on
 * newvp's dirty list when B_DELWRI is set, otherwise on its clean list.
 * A NULL newvp is reported with printf and otherwise ignored.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			/* Choose the syncer delay from the vnode type. */
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
				if (newvp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		/*
		 * The dirty list is kept with non-negative block numbers
		 * in ascending order followed by negative block numbers.
		 */
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects:  NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	/* Take a hold on the new vnode if the association changed. */
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}

/*
 * Create a vnode for a device.
 * Used for mounting the root file system.
 *
 * Returns ENXIO for NODEV, 0 with *vpp set when a VCHR vnode for the
 * device already exists or was created, or the getnewvnode() error.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	if (vfinddev(dev, VCHR, vpp))
		return (0);
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes need to be accumulated.  vcount() does that.
 *
 * Returns the vnode the caller should use: nvp itself, or an existing
 * vnode for the device that has taken over nvp's data.
 */
struct vnode *
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{
	struct vnode *ovp;
	vop_t **ops;
	dev_t dev;

	if (nvp->v_type == VBLK)
		return (nvp);
	if (nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	dev = udev2dev(nvp_rdev, 0);
	/*
	 * Check to see if we have a bdevvp vnode with no associated
	 * filesystem. If so, we want to associate the filesystem of
	 * the newly instigated vnode with the bdevvp vnode and
	 * discard the newly created vnode rather than leaving the
	 * bdevvp vnode lying around with no associated filesystem.
	 */
	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
		addalias(nvp, dev);
		return (nvp);
	}
	/*
	 * Discard unneeded vnode, but save its node specific data.
	 * Note that if there is a lock, it is carried over in the
	 * node specific data to the replacement vnode.
	 */
	vref(ovp);
	ovp->v_data = nvp->v_data;
	ovp->v_tag = nvp->v_tag;
	nvp->v_data = NULL;
	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
	if (nvp->v_vnlock)
		ovp->v_vnlock = &ovp->v_lock;
	/* Swap the operation vectors of the two vnodes. */
	ops = ovp->v_op;
	ovp->v_op = nvp->v_op;
	if (VOP_ISLOCKED(nvp, curproc)) {
		VOP_UNLOCK(nvp, 0, curproc);
		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc);
	}
	nvp->v_op = ops;
	insmntque(ovp, nvp->v_mount);
	vrele(nvp);
	vgone(nvp);
	return (ovp);
}

/* This is a local helper function that does the same as addaliasu, but for a
 * dev_t instead of an udev_t. */
static void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
	nvp->v_rdev = dev;
	mtx_lock(&spechash_mtx);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	mtx_unlock(&spechash_mtx);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 *
 * If LK_INTERLOCK is set in flags the caller already holds the vnode
 * interlock; otherwise it is acquired here.  Returns ENOENT if the vnode
 * was being cleaned by another process, the vn_lock() error if a lock
 * type was requested and could not be obtained, or 0.
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		mtx_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (vp->v_vxproc == curproc) {
			printf("VXLOCK interlock avoided\n");
		} else {
			vp->v_flag |= VXWANT;
			msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
			    "vget", 0);
			return (ENOENT);
		}
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			mtx_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			mtx_unlock(&vp->v_interlock);
		}
		return (error);
	}
	mtx_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Increase the reference count of a vnode.
 */
void
vref(struct vnode *vp)
{
	mtx_lock(&vp->v_interlock);
	vp->v_usecount++;
	mtx_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	mtx_lock(&vp->v_interlock);

	KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));

	/* Not the last reference: just drop the count. */
	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		mtx_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		mtx_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release an already locked vnode.
This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() aquires the lock internally.) */ void vput(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vput: null vp")); mtx_lock(&vp->v_interlock); KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close")); if (vp->v_usecount > 1) { vp->v_usecount--; VOP_UNLOCK(vp, LK_INTERLOCK, p); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ mtx_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); } else { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } } /* * Somebody doesn't want the vnode recycled. */ void vhold(vp) register struct vnode *vp; { int s; s = splbio(); vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); splx(s); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(vp) register struct vnode *vp; { int s; s = splbio(); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt"); vp->v_holdcnt--; if (VSHOULDFREE(vp)) vfree(vp); splx(s); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. 
*/
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

/*
 * Walk the mount's vnode list, skipping skipvp, and dispose of each vnode
 * according to flags (SKIPSYSTEM, WRITECLOSE, FORCECLOSE).  Returns EBUSY
 * if any vnode was left in use, otherwise 0.
 */
int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	mtx_lock(&mntvnode_mtx);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		mtx_lock(&vp->v_interlock);
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			mtx_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			mtx_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			mtx_unlock(&mntvnode_mtx);
			vgonel(vp, p);
			mtx_lock(&mntvnode_mtx);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			mtx_unlock(&mntvnode_mtx);
			if (vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			mtx_lock(&mntvnode_mtx);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		mtx_unlock(&vp->v_interlock);
		busy++;
	}
	mtx_unlock(&mntvnode_mtx);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * VXLOCK is set for the duration of the call; on exit the vnode uses
 * dead_vnodeop_p and any VXWANT sleepers are woken.  DOCLOSE in flags
 * additionally flushes buffers and closes an active vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	vp->v_vxproc = curproc;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 * If the flush fails, just toss the buffers.
	 */
	if (flags & DOCLOSE) {
		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
		if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
	}

	VOP_DESTROYVOBJECT(vp);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		mtx_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		mtx_unlock(&vp->v_interlock);
	}

	cache_purge(vp);
	vp->v_vnlock = NULL;
	lockdestroy(&vp->v_lock);

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	vp->v_vxproc = NULL;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 *
 * Always returns 0.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
		    "vop_revokeall", 0);
		return (0);
	}
	dev = vp->v_rdev;
	/* vgone() every vnode on the device's alias list until it is empty. */
	for (;;) {
		mtx_lock(&spechash_mtx);
		vq = SLIST_FIRST(&dev->si_hlist);
		mtx_unlock(&spechash_mtx);
		if (!vq)
			break;
		vgone(vq);
	}
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 *
 * Returns 1 if the vnode had no users and was passed to vgonel(),
 * otherwise 0.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct mtx *inter_lkp;
	struct proc *p;
{

	mtx_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			mtx_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	mtx_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 *
 * Acquires the vnode interlock and hands off to vgonel().
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	mtx_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
*/
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
		    "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	mtx_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
		mtx_lock(&spechash_mtx);
		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		mtx_unlock(&spechash_mtx);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the
	 * VDOOMED flag and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		mtx_lock(&vnode_free_list_mtx);
		/* Only count the vnode as newly free if it was not listed. */
		if (vp->v_flag & VFREE)
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		mtx_unlock(&vnode_free_list_mtx);
		splx(s);
	}

	/* Mark the vnode as dead and release the interlock. */
	vp->v_type = VBAD;
	mtx_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
*/ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { struct vnode *vp; mtx_lock(&spechash_mtx); SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { if (type == vp->v_type) { *vpp = vp; mtx_unlock(&spechash_mtx); return (1); } } mtx_unlock(&spechash_mtx); return (0); } /* * Calculate the total number of references to a special device. */ int vcount(vp) struct vnode *vp; { struct vnode *vq; int count; count = 0; mtx_lock(&spechash_mtx); SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) count += vq->v_usecount; mtx_unlock(&spechash_mtx); return (count); } /* * Same as above, but using the dev_t as argument */ int count_dev(dev) dev_t dev; { struct vnode *vp; vp = SLIST_FIRST(&dev->si_hlist); if (vp == NULL) return (0); return(vcount(vp)); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; struct vnode *vp; { char buf[96]; if (label != NULL) printf("%s: %p: ", label, (void *)vp); else printf("%p: ", (void *)vp); printf("type %s, usecount %d, writecount %d, refcount %d,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VDOOMED) strcat(buf, "|VDOOMED"); if (vp->v_flag & VFREE) strcat(buf, "|VFREE"); if (vp->v_flag & VOBJBUF) strcat(buf, "|VOBJBUF"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. 
*/
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		/* Skip mounts that cannot be busied without waiting. */
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp, NULL))
				vprint((char *)0, vp);
		}
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	mtx_unlock(&mountlist_mtx);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));

/*
 * Handler for the vfs.generic sysctl node.  Serves VFS_MAXTYPENUM and
 * VFS_CONF; a one-component name is forwarded to sysctl_ovfs_conf().
 * Returns ENOTDIR for a wrong name length and EOPNOTSUPP for an unknown
 * name or filesystem type.
 */
static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

	/* XXX the below code does not compile; vfs_sysctl does not exist. */
#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		/* Find the vfsconf entry whose type number is name[2]. */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

/*
 * Copy out one struct ovfsconf for every entry on the vfsconf list.
 * Stops at, and returns, the first SYSCTL_OUT error; otherwise returns 0.
 */
static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

#if COMPILING_LINT
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
*/ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } again: mtx_lock(&mntvnode_mtx); for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { /* * Check that the vp is still associated with * this filesystem. RACE: could have been * recycled onto the same filesystem. */ if (vp->v_mount != mp) { mtx_unlock(&mntvnode_mtx); goto again; } nvp = LIST_NEXT(vp, v_mntvnodes); mtx_unlock(&mntvnode_mtx); if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); mtx_lock(&mntvnode_mtx); } mtx_unlock(&mntvnode_mtx); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, p); } mtx_unlock(&mountlist_mtx); return (0); } /* * XXX * Exporting the vnode list on large systems causes them to crash. * Exporting the vnode list on medium systems causes sysctl to coredump. */ SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); #endif /* * Check to see if a filesystem is mounted on a block device. */ int vfs_mountedon(vp) struct vnode *vp; { if (vp->v_rdev->si_mountpoint != NULL) return (EBUSY); return (0); } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp; struct proc *p; int error; if (curproc != NULL) p = curproc; else p = initproc; /* XXX XXX should this be proc0? */ /* * Since this only runs when rebooting, it is not interlocked. 
*/ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, p); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } else { /* The unmount has removed mp from the mountlist */ } } } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. */ static int vfs_hang_addrlist(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; bzero(&np->netc_anon, sizeof(np->netc_anon)); np->netc_anon.cr_uid = argp->ex_anon.cr_uid; np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = 
dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; bzero(&np->netc_anon, sizeof(np->netc_anon)); np->netc_anon.cr_uid = argp->ex_anon.cr_uid; np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* Helper for vfs_free_addrlist. */ /* ARGSUSED */ static int vfs_free_netcred(rn, w) struct radix_node *rn; void *w; { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. */ static void vfs_free_addrlist(nep) struct netexport *nep; { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } /* * High level function to manipulate export options on a mount point * and the passed in netexport. 
* Struct export_args *argp is the variable used to twiddle options, * the structure is described in sys/mount.h */ int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { if (mp->mnt_flag & MNT_EXPUBLIC) { vfs_setpublicfs(NULL, NULL, NULL); mp->mnt_flag &= ~MNT_EXPUBLIC; } vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if (argp->ex_flags & MNT_EXPUBLIC) { if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) return (error); mp->mnt_flag |= MNT_EXPUBLIC; } if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } /* * Set the publicly exported filesystem (WebNFS). Currently, only * one public filesystem is possible in the spec (RFC 2054 and 2055) */ int vfs_setpublicfs(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; struct vnode *rvp; char *cp; /* * mp == NULL -> invalidate the current info, the FS is * no longer exported. May be called from either vfs_export * or unmount, so check if it hasn't already been done. */ if (mp == NULL) { if (nfs_pub.np_valid) { nfs_pub.np_valid = 0; if (nfs_pub.np_index != NULL) { FREE(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL; } } return (0); } /* * Only one allowed at a time. */ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) return (EBUSY); /* * Get real filehandle for root of exported FS. */ bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; if ((error = VFS_ROOT(mp, &rvp))) return (error); if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) return (error); vput(rvp); /* * If an indexfile was specified, pull it in. 
*/ if (argp->ex_indexfile != NULL) { MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, M_WAITOK); error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, MAXNAMLEN, (size_t *)0); if (!error) { /* * Check for illegal filenames. */ for (cp = nfs_pub.np_index; *cp; cp++) { if (*cp == '/') { error = EINVAL; break; } } } if (error) { FREE(nfs_pub.np_index, M_TEMP); return (error); } } nfs_pub.np_mount = mp; nfs_pub.np_valid = 1; return (0); } /* * Used by the filesystems to determine if a given network address * (passed in 'nam') is present in thier exports list, returns a pointer * to struct netcred so that the filesystem can examine it for * access rights (read/write/etc). */ struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct sockaddr *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = nam; rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; struct vm_object *obj; int anyio, tries; tries = 5; loop: anyio = 0; for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { nvp = LIST_NEXT(vp, v_mntvnodes); if (vp->v_mount != mp) { goto loop; } if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? 
*/ continue; if (flags != MNT_WAIT) { if (VOP_GETVOBJECT(vp, &obj) != 0 || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) continue; if (VOP_ISLOCKED(vp, NULL)) continue; } mtx_lock(&vp->v_interlock); if (VOP_GETVOBJECT(vp, &obj) == 0 && (obj->flags & OBJ_MIGHTBEDIRTY)) { if (!vget(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { if (VOP_GETVOBJECT(vp, &obj) == 0) { vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); anyio = 1; } vput(vp); } } else { mtx_unlock(&vp->v_interlock); } } if (anyio && (--tries > 0)) goto loop; } /* * Create the VM object needed for VMIO and mmap support. This * is done for all VREG files in the system. Some filesystems might * afford the additional metadata buffering capability of the * VMIO code by making the device node be VMIO mode also. * * vp must be locked when vfs_object_create is called. */ int vfs_object_create(vp, p, cred) struct vnode *vp; struct proc *p; struct ucred *cred; { return (VOP_CREATEVOBJECT(vp, cred, p)); } /* * Mark a vnode as free, putting it up for recycling. */ void vfree(vp) struct vnode *vp; { int s; s = splbio(); mtx_lock(&vnode_free_list_mtx); KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; mtx_unlock(&vnode_free_list_mtx); vp->v_flag &= ~VAGE; vp->v_flag |= VFREE; splx(s); } /* * Opposite of vfree() - mark a vnode as in use. */ void vbusy(vp) struct vnode *vp; { int s; s = splbio(); mtx_lock(&vnode_free_list_mtx); KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; mtx_unlock(&vnode_free_list_mtx); vp->v_flag &= ~(VFREE|VAGE); splx(s); } /* * Record a process's interest in events which might happen to * a vnode. 
Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
/*
 * Returns the subset of `events' that was already pending in
 * vpi_revents (those bits are consumed), or 0 after adding `events' to
 * vpi_events and registering `p' with selrecord().
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	mtx_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process
		 * which presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		mtx_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	/* Nothing pending yet: remember the interest for vn_pollevent(). */
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	mtx_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 *
 * Does nothing unless at least one bit of `events' is currently
 * recorded in vpi_events.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	mtx_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	mtx_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Deliver note `b' through KNOTE() to the si_note list of the vnode's
 * poll selinfo.
 * NOTE(review): `vp' is not parenthesized in the expansion, so callers
 * should pass a plain identifier.
 */
#define VN_KNOTE(vp, b) \
	KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))

/*
 * Wake up anyone polling on vp because it is being revoked.
* This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
/*
 * Posts NOTE_REVOKE through VN_KNOTE() and, when any poll interest is
 * recorded in vpi_events, clears it and calls selwakeup().  All of this
 * happens with vpi_lock held.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	mtx_lock(&vp->v_pollinfo.vpi_lock);
	VN_KNOTE(vp, NOTE_REVOKE);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	mtx_unlock(&vp->v_pollinfo.vpi_lock);
}



/*
 * Routine to create and manage a filesystem syncer vnode.
 *
 * close, lock, unlock and islocked are not implemented here: the macros
 * below cast generic routines (nullop, vop_nolock, vop_nounlock,
 * vop_noislocked) to the matching operation signature.
 */
#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
static int	sync_fsync __P((struct  vop_fsync_args *));
static int	sync_inactive __P((struct  vop_inactive_args *));
static int	sync_reclaim  __P((struct  vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

/* Operations vector pointer for syncer vnodes; handed to getnewvnode(). */
static vop_t **sync_vnodeop_p;
/*
 * Operation table for syncer vnodes.  The vop_default_desc entry maps to
 * vop_eopnotsupp -- presumably the fallback for every operation not
 * listed here (TODO confirm against the vnodeop table builder).
 */
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
*/ int vfs_allocate_syncvnode(mp) struct mount *mp; { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct proc *p = ap->a_p; int asyncflag; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ vn_syncer_add_to_worklist(syncvp, syncdelay); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp, p); return (0); } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; vn_finished_write(mp); vfs_unbusy(mp, p); return (0); } /* * The syncer vnode is no referenced. 
*/ static int sync_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected at splbio(). */ static int sync_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; int s; s = splbio(); vp->v_mount->mnt_syncer = NULL; if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); vp->v_flag &= ~VONWORKLST; } splx(s); return (0); } /* * Print out a syncer vnode. */ static int sync_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; printf("syncer vnode"); if (vp->v_vnlock != NULL) lockmgr_printinfo(vp->v_vnlock); printf("\n"); return (0); } /* * extract the dev_t from a VCHR */ dev_t vn_todev(vp) struct vnode *vp; { if (vp->v_type != VCHR) return (NODEV); return (vp->v_rdev); } /* * Check if vnode represents a disk device */ int vn_isdisk(vp, errp) struct vnode *vp; int *errp; { struct cdevsw *cdevsw; if (vp->v_type != VCHR) { if (errp != NULL) *errp = ENOTBLK; return (0); } if (vp->v_rdev == NULL) { if (errp != NULL) *errp = ENXIO; return (0); } cdevsw = devsw(vp->v_rdev); if (cdevsw == NULL) { if (errp != NULL) *errp = ENXIO; return (0); } if (!(cdevsw->d_flags & D_DISK)) { if (errp != NULL) *errp = ENOTBLK; return (0); } if (errp != NULL) *errp = 0; return (1); } /* * Free data allocated by namei(); see namei(9) for details. 
*/ void NDFREE(ndp, flags) struct nameidata *ndp; const uint flags; { if (!(flags & NDF_NO_FREE_PNBUF) && (ndp->ni_cnd.cn_flags & HASBUF)) { zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); ndp->ni_cnd.cn_flags &= ~HASBUF; } if (!(flags & NDF_NO_DVP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKPARENT) && ndp->ni_dvp != ndp->ni_vp) VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc); if (!(flags & NDF_NO_DVP_RELE) && (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; } if (!(flags & NDF_NO_VP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc); if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { vrele(ndp->ni_vp); ndp->ni_vp = NULL; } if (!(flags & NDF_NO_STARTDIR_RELE) && (ndp->ni_cnd.cn_flags & SAVESTART)) { vrele(ndp->ni_startdir); ndp->ni_startdir = NULL; } } /* * Common file system object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request. Returns 0 on success, or an errno on failure. */ int vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) enum vtype type; mode_t file_mode; uid_t file_uid; gid_t file_gid; mode_t acc_mode; struct ucred *cred; int *privused; { mode_t dac_granted; #ifdef CAPABILITIES mode_t cap_granted; #endif /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. 
*/ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); privcheck: if (!suser_xxx(cred, NULL, PRISON_ROOT)) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #ifdef CAPABILITIES /* * Build a capability mask to determine if the set of capabilities * satisfies the requirements when combined with the granted mask * from above. * For each capability, if the capability is required, bitwise * or the request type onto the cap_granted mask. */ cap_granted = 0; if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #endif return ((acc_mode & VADMIN) ? 
EPERM : EACCES); } Index: head/sys/kern/vnode_if.src =================================================================== --- head/sys/kern/vnode_if.src (revision 75579) +++ head/sys/kern/vnode_if.src (revision 75580) @@ -1,573 +1,565 @@ # # Copyright (c) 1992, 1993 # The Regents of the University of California. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. All advertising materials mentioning features or use of this software # must display the following acknowledgement: # This product includes software developed by the University of # California, Berkeley and its contributors. # 4. Neither the name of the University nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 # $FreeBSD$ # # # Above each of the vop descriptors is a specification of the locking # protocol used by each vop call. The first column is the name of # the variable, the remaining three columns are in, out and error # respectively. The "in" column defines the lock state on input, # the "out" column defines the state on successful return, and the # "error" column defines the locking state on error exit. # # The locking value can take the following values: # L: locked; not converted to type of lock. # A: any lock type. # S: locked with shared lock. # E: locked with exclusive lock for this process. # O: locked with exclusive lock for other process. # U: unlocked. # -: not applicable. vnode does not yet (or no longer) exist. # =: the same on input and output, may be either L or U. # X: locked if not nil. # # #% islocked vp = = = # vop_islocked { IN struct vnode *vp; IN struct proc *p; }; # #% lookup dvp L ? ? #% lookup vpp - L - # # XXX - the lookup locking protocol defies simple description and depends # on the flags and operation fields in the (cnp) structure. Note # especially that *vpp may equal dvp and both may be locked. # vop_lookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; # #% cachedlookup dvp L ? ? #% cachedlookup vpp - L - # # This must be an exact copy of lookup. See kern/vfs_cache.c for details. 
# vop_cachedlookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; # #% create dvp L L L #% create vpp - L - # vop_create { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; # #% whiteout dvp L L L # vop_whiteout { IN struct vnode *dvp; IN struct componentname *cnp; IN int flags; }; # #% mknod dvp L L L #% mknod vpp - X - # vop_mknod { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; # #% open vp L L L # vop_open { IN struct vnode *vp; IN int mode; IN struct ucred *cred; IN struct proc *p; }; # #% close vp U U U # vop_close { IN struct vnode *vp; IN int fflag; IN struct ucred *cred; IN struct proc *p; }; # #% access vp L L L # vop_access { IN struct vnode *vp; IN int mode; IN struct ucred *cred; IN struct proc *p; }; # #% getattr vp = = = # vop_getattr { IN struct vnode *vp; OUT struct vattr *vap; IN struct ucred *cred; IN struct proc *p; }; # #% setattr vp L L L # vop_setattr { IN struct vnode *vp; IN struct vattr *vap; IN struct ucred *cred; IN struct proc *p; }; # #% read vp L L L # vop_read { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; # #% write vp L L L # vop_write { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; # #% lease vp = = = # vop_lease { IN struct vnode *vp; IN struct proc *p; IN struct ucred *cred; IN int flag; }; # #% ioctl vp U U U # vop_ioctl { IN struct vnode *vp; IN u_long command; IN caddr_t data; IN int fflag; IN struct ucred *cred; IN struct proc *p; }; # #% poll vp U U U # vop_poll { IN struct vnode *vp; IN int events; IN struct ucred *cred; IN struct proc *p; }; # #% kqfilter vp U U U # vop_kqfilter { IN struct vnode *vp; IN struct knote *kn; }; # #% revoke vp U U U # vop_revoke { IN struct vnode *vp; IN int flags; }; # #% fsync vp L L L # vop_fsync { IN struct vnode *vp; IN struct ucred *cred; IN int waitfor; IN struct proc 
*p; }; # #% remove dvp L L L #% remove vp L L L # vop_remove { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; # #% link tdvp L L L #% link vp U U U # vop_link { IN struct vnode *tdvp; IN struct vnode *vp; IN struct componentname *cnp; }; # #% rename fdvp U U U #% rename fvp U U U #% rename tdvp L U U #% rename tvp X U U # vop_rename { IN WILLRELE struct vnode *fdvp; IN WILLRELE struct vnode *fvp; IN struct componentname *fcnp; IN WILLRELE struct vnode *tdvp; IN WILLRELE struct vnode *tvp; IN struct componentname *tcnp; }; # #% mkdir dvp L L L #% mkdir vpp - L - # vop_mkdir { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; # #% rmdir dvp L L L #% rmdir vp L L L # vop_rmdir { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; # #% symlink dvp L L L #% symlink vpp - U - # vop_symlink { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; IN char *target; }; # #% readdir vp L L L # vop_readdir { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; INOUT int *eofflag; OUT int *ncookies; INOUT u_long **cookies; }; # #% readlink vp L L L # vop_readlink { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; }; # #% inactive vp L U U # vop_inactive { IN struct vnode *vp; IN struct proc *p; }; # #% reclaim vp U U U # vop_reclaim { IN struct vnode *vp; IN struct proc *p; }; # #% lock vp ? ? ? 
# vop_lock { IN struct vnode *vp; IN int flags; IN struct proc *p; }; # #% unlock vp L U L # vop_unlock { IN struct vnode *vp; IN int flags; IN struct proc *p; }; # #% bmap vp L L L #% bmap vpp - U - # vop_bmap { IN struct vnode *vp; IN daddr_t bn; OUT struct vnode **vpp; IN daddr_t *bnp; OUT int *runp; OUT int *runb; }; # #% strategy vp L L L # vop_strategy { IN struct vnode *vp; IN struct buf *bp; }; # #% getwritemount vp = = = # vop_getwritemount { IN struct vnode *vp; OUT struct mount **mpp; }; # #% print vp = = = # vop_print { IN struct vnode *vp; }; # #% pathconf vp L L L # vop_pathconf { IN struct vnode *vp; IN int name; OUT register_t *retval; }; # #% advlock vp U U U # vop_advlock { IN struct vnode *vp; IN caddr_t id; IN int op; IN struct flock *fl; IN int flags; }; # #% balloc vp L L L # vop_balloc { IN struct vnode *vp; IN off_t startoffset; IN int size; IN struct ucred *cred; IN int flags; OUT struct buf **bpp; }; # #% reallocblks vp L L L # vop_reallocblks { IN struct vnode *vp; IN struct cluster_save *buflist; }; # #% getpages vp L L L # vop_getpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int reqpage; IN vm_ooffset_t offset; }; # #% putpages vp L L L # vop_putpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int sync; IN int *rtvals; IN vm_ooffset_t offset; }; # #% freeblks vp - - - # # This call is used by the filesystem to release blocks back to # device-driver. This is useful if the driver has a lengthy # erase handling or similar. 
# vop_freeblks { IN struct vnode *vp; IN daddr_t addr; IN daddr_t length; }; # -#% bwrite vp L L L -# -vop_bwrite { - IN struct vnode *vp; - IN struct buf *bp; -}; - -# #% getacl vp L L L # vop_getacl { IN struct vnode *vp; IN acl_type_t type; OUT struct acl *aclp; IN struct ucred *cred; IN struct proc *p; }; # #% setacl vp L L L # vop_setacl { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct proc *p; }; # #% aclcheck vp = = = # vop_aclcheck { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct proc *p; }; # #% getextattr vp L L L # vop_getextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; IN struct ucred *cred; IN struct proc *p; }; # #% setextattr vp L L L # vop_setextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; IN struct ucred *cred; IN struct proc *p; }; # #% createvobject vp L L L # vop_createvobject { IN struct vnode *vp; IN struct ucred *cred; IN struct proc *p; }; # #% destroyvobject vp L L L # vop_destroyvobject { IN struct vnode *vp; }; # #% getvobject vp L L L # vop_getvobject { IN struct vnode *vp; OUT struct vm_object **objpp; }; Index: head/sys/nfs/nfs_bio.c =================================================================== --- head/sys/nfs/nfs_bio.c (revision 75579) +++ head/sys/nfs/nfs_bio.c (revision 75580) @@ -1,1604 +1,1623 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +/* + * Just call nfs_writebp() with the force argument set to 1. + * + * NOTE: B_DONE may or may not be set in a_bp on call. 
+ */
+static int
+nfs_bwrite(struct buf *bp)
+{
+	return (nfs_writebp(bp, 1, curproc));
+}
+
+struct buf_ops buf_ops_nfs = {
+	"buf_ops_nfs",
+	nfs_bwrite
+};
+
+
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
 *
 * Reads ap->a_count bytes, starting at the file offset of the first page
 * in ap->a_m, with one nfs_readrpc() into a temporary kernel mapping of
 * the pages.  Returns 0 on success (also when the requested page was
 * already partially valid and nothing was read) and VM_PAGER_ERROR when
 * the vnode has no VM object or the read failed without transferring
 * any data.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_reqpage;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_page_t *pages;

	vp = ap->a_vp;
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if (vp->v_object == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	/* On an NFSv3 mount, fetch the fsinfo once if that has not happened yet. */
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		if (m->valid != 0) {
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
			/* Every page except the requested one is given back. */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vnode_pager_freepage(pages[i]);
			}
			return(0);
		}
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	/* Describe the mapped pages as a single kernel-space read target. */
	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = p;

	error = nfs_readrpc(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	/* Fail outright only when the error left nothing transferred. */
	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vnode_pager_freepage(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
		}

		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Result:
			 * It appears that empirical results show that
			 * deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vnode_pager_freepage(m);
			}
		}
	}
	return 0;
}

/*
 * Vnode op for VM putpages.
*/
/*
 * Writes the pages in ap->a_m with one nfs_writerpc() through a
 * temporary kernel mapping.  Every rtvals[] slot starts out as
 * VM_PAGER_AGAIN; on success the slots covered by the bytes actually
 * written become VM_PAGER_OK and those pages are marked clean.
 * Returns rtvals[0].
 */
int
nfs_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	vp = ap->a_vp;
	np = VTONFS(vp);
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	/* On an NFSv3 mount, fetch the fsinfo once if that has not happened yet. */
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	/* Pessimistic default: every page is reported as "try again". */
	for (i = 0; i < npages; i++) {
		rtvals[i] = VM_PAGER_AGAIN;
	}

	/*
	 * When putting pages, do not extend file past EOF.
	 */

	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	/* Describe the mapped pages as a single kernel-space write source. */
	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_procp = p;

	/* Without VM_PAGER_PUT_SYNC the write is requested as unstable. */
	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
	    iomode = NFSV3WRITE_UNSTABLE;
	else
	    iomode = NFSV3WRITE_FILESYNC;

	error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		/* Bytes written, rounded up to whole pages. */
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
	}
	return rtvals[0];
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	p = uio->uio_procp;

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
* For nfs: * If the file's modify time on the server has changed since the * last read rpc or you have written to the file, * you may have lost data cache consistency with the * server, so flush all of the file's data out of the cache. * Then force a getattr rpc to ensure that you have up to date * attributes. * NB: This implies that cache data can be read when up to * NFS_ATTRTIMEO seconds out of date. If you find that you need current * attributes this could be forced by setting n_attrstamp to 0 before * the VOP_GETATTR() call. */ if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { if (np->n_flag & NMODIFIED) { if (vp->v_type != VREG) { if (vp->v_type != VDIR) panic("nfs: bioread, not dir"); nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); } np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, p); if (error) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, cred, p); if (error) return (error); if (np->n_mtime != vattr.va_mtime.tv_sec) { if (vp->v_type == VDIR) nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } } } do { /* * Get a valid lease. If cached data is stale, flush it. 
*/ if (nmp->nm_flag & NFSMNT_NQNFS) { if (NQNFS_CKINVALID(vp, np, ND_READ)) { do { error = nqnfs_getlease(vp, ND_READ, cred, p); } while (error == NQNFS_EXPIRED); if (error) return (error); if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE) || ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { if (vp->v_type == VDIR) nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); np->n_brev = np->n_lrev; } } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); } } if (np->n_flag & NQNFSNONCACHE) { switch (vp->v_type) { case VREG: return (nfs_readrpc(vp, uio, cred)); case VLNK: return (nfs_readlinkrpc(vp, uio, cred)); case VDIR: break; default: printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type); }; } switch (vp->v_type) { case VREG: nfsstats.biocache_reads++; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); /* * Start the read ahead(s), as required. */ if (nfs_numasync > 0 && nmp->nm_readahead > 0) { for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { rabn = lbn + 1 + nra; if (!incore(vp, rabn)) { rabp = nfs_getcacheblk(vp, rabn, biosize, p); if (!rabp) return (EINTR); if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred, p)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); break; } } else { brelse(rabp); } } } } /* * Obtain the buffer cache block. Figure out the buffer size * when we are at EOF. If we are modifying the size of the * buffer based on an EOF condition we need to hold * nfs_rslock() through obtaining the buffer to prevent * a potential writer-appender from messing with n_size. * Otherwise we may accidently truncate the buffer and * lose dirty data. 
* * Note that bcount is *not* DEV_BSIZE aligned. */ again: bcount = biosize; if ((off_t)lbn * biosize >= np->n_size) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > np->n_size) { bcount = np->n_size - (off_t)lbn * biosize; } if (bcount != biosize) { switch(nfs_rslock(np, p)) { case ENOLCK: goto again; /* not reached */ case EINTR: case ERESTART: return(EINTR); /* not reached */ default: break; } } bp = nfs_getcacheblk(vp, lbn, bcount, p); if (bcount != biosize) nfs_rsunlock(np, p); if (!bp) return (EINTR); /* * If B_CACHE is not set, we must issue the read. If this * fails, we return an error. */ if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { brelse(bp); return (error); } } /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount) n = min((unsigned)(bcount - on), uio->uio_resid); break; case VLNK: nfsstats.biocache_readlinks++; bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); if (!bp) return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { bp->b_ioflags |= BIO_ERROR; brelse(bp); return (error); } } n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); on = 0; break; case VDIR: nfsstats.biocache_readdirs++; if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { return (0); } lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p); if (!bp) return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { brelse(bp); } while (error == NFSERR_BAD_COOKIE) { printf("got bad cookie vp %p bp %p\n", vp, bp); nfs_invaldir(vp); error = nfs_vinvalbuf(vp, 0, 
cred, p, 1); /* * Yuck! The directory has been modified on the * server. The only way to get the block is by * reading from the beginning to get all the * offset cookies. * * Leave the last bp intact unless there is an error. * Loop back up to the while if the error is another * NFSERR_BAD_COOKIE (double yuch!). */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) return (0); bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p); if (!bp) return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); /* * no error + B_INVAL == directory EOF, * use the block. */ if (error == 0 && (bp->b_flags & B_INVAL)) break; } /* * An error will throw away the block and the * for loop will break out. If no error and this * is not the block we want, we throw away the * block and go for the next one via the for loop. */ if (error || i < lbn) brelse(bp); } } /* * The above while is repeated if we hit another cookie * error. If we hit an error and it wasn't a cookie error, * we give up. */ if (error) return (error); } /* * If not eof and read aheads are enabled, start one. * (You need the current block first, so that you have the * directory offset cookie of the next block.) 
*/ if (nfs_numasync > 0 && nmp->nm_readahead > 0 && (bp->b_flags & B_INVAL) == 0 && (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && !(np->n_flag & NQNFSNONCACHE) && !incore(vp, lbn + 1)) { rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p); if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred, p)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); } } else { brelse(rabp); } } } /* * Unlike VREG files, whos buffer size ( bp->b_bcount ) is * chopped for the EOF condition, we cannot tell how large * NFS directories are going to be until we hit EOF. So * an NFS directory buffer is *not* chopped to its EOF. Now, * it just so happens that b_resid will effectively chop it * to EOF. *BUT* this information is lost if the buffer goes * away and is reconstituted into a B_CACHE state ( due to * being VMIO ) later. So we keep track of the directory eof * in np->n_direofoffset and chop it off as an extra step * right here. */ n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) n = np->n_direofoffset - uio->uio_offset; break; default: printf(" nfs_bioread: type %x unexpected\n",vp->v_type); break; }; if (n > 0) { error = uiomove(bp->b_data + on, (int)n, uio); } switch (vp->v_type) { case VREG: break; case VLNK: n = 0; break; case VDIR: /* * Invalidate buffer if caching is disabled, forcing a * re-read from the remote later. 
*/ if (np->n_flag & NQNFSNONCACHE) bp->b_flags |= B_INVAL; break; default: printf(" nfs_bioread: type %x unexpected\n",vp->v_type); } brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n > 0); return (error); } /* * Vnode op for write using bio */ int nfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int biosize; struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; int bcount; int n, on, error = 0, iomode, must_commit; int haverslock = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("nfs_write mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("nfs_write proc"); #endif if (vp->v_type != VREG) return (EIO); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; return (np->n_error); } if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, p); /* * Synchronously flush pending buffers if we are in synchronous * mode or if we are appending. */ if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { np->n_attrstamp = 0; error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); } } /* * If IO_APPEND then load uio_offset. We restart here if we cannot * get the append lock. 
*/
restart:
	if (ioflag & IO_APPEND) {
		/*
		 * Zero n_attrstamp so that VOP_GETATTR() fetches current
		 * attributes, then position the write at the file size.
		 */
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, p);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	/* Reject negative offsets, oversized files and empty writes. */
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch(nfs_rslock(np, p)) {
		case ENOLCK:
			/* Lock not obtained: redo the append setup above. */
			goto restart;
			/* not reached */
		case EINTR:
		case ERESTART:
			return(EINTR);
			/* not reached */
		default:
			break;
		}
		/* Remember that every later exit must call nfs_rsunlock(). */
		haverslock = 1;
	}

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(p);
		psignal(p, SIGXFSZ);
		PROC_UNLOCK(p);
		/* Drop the rslock taken above before failing the write. */
		if (haverslock)
			nfs_rsunlock(np, p);
		return (EFBIG);
	}

	/* Block size used to split the write into cache blocks. */
	biosize = vp->v_mount->mnt_stat.f_iosize;

	do {
		/*
		 * Check for a valid write lease.
*/ if ((nmp->nm_flag & NFSMNT_NQNFS) && NQNFS_CKINVALID(vp, np, ND_WRITE)) { do { error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); if (error) break; if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) break; np->n_brev = np->n_lrev; } } if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { iomode = NFSV3WRITE_FILESYNC; error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); if (must_commit) nfs_clearcommit(vp->v_mount); break; } nfsstats.biocache_writes++; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize-1); n = min((unsigned)(biosize - on), uio->uio_resid); again: /* * Handle direct append and file extension cases, calculate * unaligned buffer size. */ if (uio->uio_offset == np->n_size && n) { /* * Get the buffer (in its pre-append state to maintain * B_CACHE if it was previously set). Resize the * nfsnode after we have locked the buffer to prevent * readers from reading garbage. */ bcount = on; bp = nfs_getcacheblk(vp, lbn, bcount, p); if (bp != NULL) { long save; np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); save = bp->b_flags & B_CACHE; bcount += n; allocbuf(bp, bcount); bp->b_flags |= save; + bp->b_magic = B_MAGIC_NFS; + bp->b_op = &buf_ops_nfs; } } else { /* * Obtain the locked cache block first, and then * adjust the file's size as appropriate. */ bcount = on + n; if ((off_t)lbn * biosize + bcount < np->n_size) { if ((off_t)(lbn + 1) * biosize < np->n_size) bcount = biosize; else bcount = np->n_size - (off_t)lbn * biosize; } bp = nfs_getcacheblk(vp, lbn, bcount, p); if (uio->uio_offset + n > np->n_size) { np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); } } if (!bp) { error = EINTR; break; } /* * Issue a READ if B_CACHE is not set. 
In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ if (on == 0 && n == bcount) { bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { brelse(bp); break; } } if (!bp) { error = EINTR; break; } if (bp->b_wcred == NOCRED) { crhold(cred); bp->b_wcred = cred; } np->n_flag |= NMODIFIED; /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { printf("NFS append race @%lx:%d\n", (long)bp->b_blkno * DEV_BSIZE, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. * * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. 
* * as an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { if (BUF_WRITE(bp) == EINTR) return (EINTR); goto again; } /* * Check for valid write lease and get one as required. * In case getblk() and/or bwrite() delayed us. */ if ((nmp->nm_flag & NFSMNT_NQNFS) && NQNFS_CKINVALID(vp, np, ND_WRITE)) { do { error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); if (error) { brelse(bp); break; } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { brelse(bp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) break; np->n_brev = np->n_lrev; goto again; } } error = uiomove((char *)bp->b_data + on, n, uio); /* * Since this block is being modified, it must be written * again and not just committed. Since write clustering does * not work for the stage 1 data write, only the stage 2 * commit rpc, we have to clear B_CLUSTEROK as well. */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (error) { bp->b_ioflags |= BIO_ERROR; brelse(bp); break; } /* * Only update dirtyoff/dirtyend if not a degenerate * condition. */ if (n) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = min(on, bp->b_dirtyoff); bp->b_dirtyend = max((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_validclean(bp, on, n); } /* * If the lease is non-cachable or IO_SYNC do bwrite(). * * IO_INVAL appears to be unused. The idea appears to be * to turn off caching in this case. Very odd. 
XXX */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = BUF_WRITE(bp);
			if (error)
				break;
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			/*
			 * The write reached the end of the block on a
			 * non-NQNFS mount: mark the buffer B_ASYNC and hand
			 * it to nfs_writebp().
			 */
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0, 0);
		} else {
			/* Partial block: presumably left as a delayed write. */
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	/* Common exit: release the rslock if this call took it. */
	if (haverslock)
		nfs_rsunlock(np, p);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 *
 * Parameters:
 *	vp	vnode the block belongs to
 *	bn	logical block number handed to getblk()
 *	size	buffer size handed to getblk()
 *	p	process consulted by nfs_sigintr() when getblk() fails
 *
 * Returns the buffer from getblk(), or NULL when nfs_sigintr() reports
 * an interruption on an NFSMNT_INT mount.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		/*
		 * Interruptible mount: the first attempt passes PCATCH.
		 * While getblk() keeps returning NULL, give up if
		 * nfs_sigintr() reports an interruption, otherwise retry
		 * without PCATCH but with a 2 * hz timeout.
		 */
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		/*
		 * For regular files, express the block number in DEV_BSIZE
		 * units; nfs_doio() multiplies b_blkno by DEV_BSIZE to get
		 * the file offset.
		 */
		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
*/
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	/*
	 * Returns 0 on success (and when VXLOCK is set on the vnode, in
	 * which case nothing is done), or EINTR when nfs_sigintr() reports
	 * an interruption and intrflg is in effect.
	 */
	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	/* intrflg is honored only on NFSMNT_INT mounts. */
	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		/* NFLUSHWANT asks the flusher to wakeup() us on &np->n_flag. */
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg &&
		    nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg &&
		    nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			/*
			 * Interrupted: clear NFLUSHINPROG and wake any
			 * waiters before bailing out.
			 */
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		/* Retry without slpflag, using the timeout instead. */
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	/* Flush complete: clear NMODIFIED and NFLUSHINPROG, wake waiters. */
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(bp, cred, procp)
	register struct buf *bp;
	struct ucred *cred;
	struct proc *procp;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	/*
	 * If no async daemons then return EIO to force caller to run the rpc
	 * synchronously.
*/
	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);

	/*
	 * Commits are usually short and sweet so let's save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		return(EIO);
	}

again:
	/*
	 * NOTE(review): every pass through this label sets slpflag back to
	 * PCATCH on interruptible mounts, even if the wait loop below had
	 * already switched to timeout-based sleeping.
	 */
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			/* Count this iod as working on the mount. */
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			/* Flag that someone is sleeping on &nmp->nm_bufq. */
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, procp))
					return (EINTR);
				/*
				 * Sleep failed but nfs_sigintr() did not
				 * report an interruption: stop using PCATCH
				 * and sleep with a 2 * hz timeout instead.
				 */
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if nescessary.
*/ if (nmp->nm_bufqiods == 0) { NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); goto again; } } if (bp->b_iocmd == BIO_READ) { if (bp->b_rcred == NOCRED && cred != NOCRED) { crhold(cred); bp->b_rcred = cred; } } else { bp->b_flags |= B_WRITEINPROG; if (bp->b_wcred == NOCRED && cred != NOCRED) { crhold(cred); bp->b_wcred = cred; } } BUF_KERNPROC(bp); TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); nmp->nm_bufqlen++; return (0); } /* * All the iods are busy on other mounts, so return EIO to * force the caller to process the i/o synchronously. */ NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); return (EIO); } /* * Do an I/O operation to/from a cache block. This may be called * synchronously or from an nfsiod. */ int nfs_doio(bp, cr, p) struct buf *bp; struct ucred *cr; struct proc *p; { struct uio *uiop; struct vnode *vp; struct nfsnode *np; struct nfsmount *nmp; int error = 0, iomode, must_commit = 0; struct uio uio; struct iovec io; vp = bp->b_vp; np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = p; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. */ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); /* * Historically, paging was done with physio, but no more. */ if (bp->b_flags & B_PHYS) { /* * ...though reading /dev/drum still gets us here. 
*/ io.iov_len = uiop->uio_resid = bp->b_bcount; /* mapping was done by vmapbuf() */ io.iov_base = bp->b_data; uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; if (bp->b_iocmd == BIO_READ) { uiop->uio_rw = UIO_READ; nfsstats.read_physios++; error = nfs_readrpc(vp, uiop, cr); } else { int com; iomode = NFSV3WRITE_DATASYNC; uiop->uio_rw = UIO_WRITE; nfsstats.write_physios++; error = nfs_writerpc(vp, uiop, cr, &iomode, &com); } if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else if (bp->b_iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; nfsstats.read_bios++; error = nfs_readrpc(vp, uiop, cr); if (!error) { if (uiop->uio_resid) { /* * If we had a short read with no error, we must have * hit a file hole. We should zero-fill the remainder. * This can also occur if the server hits the file EOF. * * Holes used to be able to occur due to pending * writes, but that is not possible any longer. 
*/ int nread = bp->b_bcount - uiop->uio_resid; int left = bp->b_bcount - nread; if (left > 0) bzero((char *)bp->b_data + nread, left); uiop->uio_resid = 0; } } if (p && (vp->v_flag & VTEXT) && (((nmp->nm_flag & NFSMNT_NQNFS) && NQNFS_CKINVALID(vp, np, ND_READ) && np->n_lrev != np->n_brev) || (!(nmp->nm_flag & NFSMNT_NQNFS) && np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { uprintf("Process killed due to text file modification\n"); PROC_LOCK(p); psignal(p, SIGKILL); _PHOLD(p); PROC_UNLOCK(p); } break; case VLNK: uiop->uio_offset = (off_t)0; nfsstats.readlink_bios++; error = nfs_readlinkrpc(vp, uiop, cr); break; case VDIR: nfsstats.readdir_bios++; uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; if (nmp->nm_flag & NFSMNT_RDIRPLUS) { error = nfs_readdirplusrpc(vp, uiop, cr); if (error == NFSERR_NOTSUPP) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = nfs_readdirrpc(vp, uiop, cr); /* * end-of-directory sets B_INVAL but does not generate an * error. 
*/ if (error == 0 && uiop->uio_resid == bp->b_bcount) bp->b_flags |= B_INVAL; break; default: printf("nfs_doio: type %x unexpected\n",vp->v_type); break; }; if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * If we only need to commit, try to commit */ if (bp->b_flags & B_NEEDCOMMIT) { int retv; off_t off; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; bp->b_flags |= B_WRITEINPROG; retv = nfs_commit( bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, bp->b_wcred, p); bp->b_flags &= ~B_WRITEINPROG; if (retv == 0) { bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); bp->b_resid = 0; bufdone(bp); return (0); } if (retv == NFSERR_STALEWRITEVERF) { nfs_clearcommit(bp->b_vp->v_mount); } } /* * Setup for actual write */ if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; nfsstats.write_bios++; if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; bp->b_flags |= B_WRITEINPROG; error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); /* * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try * to cluster the buffers needing commit. This will allow * the system to submit a single commit rpc for the whole * cluster. We can do this even if the buffer is not 100% * dirty (relative to the NFS blocksize), so we optimize the * append-to-file-case. * * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be * cleared because write clustering only works for commit * rpc's, not for the data portion of the write). 
*/ if (!error && iomode == NFSV3WRITE_UNSTABLE) { bp->b_flags |= B_NEEDCOMMIT; if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; } else { bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } bp->b_flags &= ~B_WRITEINPROG; /* * For an interrupted write, the buffer is still valid * and the write hasn't been pushed to the server yet, * so we can't set BIO_ERROR and report the interruption * by setting B_EINTR. For the B_ASYNC case, B_EINTR * is not relevant, so the rpc attempt is essentially * a noop. For the case of a V3 write rpc not being * committed to stable storage, the block is still * dirty and requires either a commit rpc or another * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. * * If the buffer is marked B_PAGING, it does not reside on * the vp's paging queues so we cannot call bdirty(). The * bp in this case is not an NFS cache block so we should * be safe. XXX */ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; s = splbio(); bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if (error && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; splx(s); } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = np->n_error = error; np->n_flag |= NWRITEERR; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; if (must_commit) nfs_clearcommit(vp->v_mount); bufdone(bp); return (error); } Index: head/sys/nfs/nfs_vnops.c =================================================================== --- head/sys/nfs/nfs_vnops.c (revision 75579) +++ head/sys/nfs/nfs_vnops.c (revision 75580) @@ -1,3400 +1,3385 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 * $FreeBSD$ */ /* * vnode op calls for Sun NFS version 2 and 3 */ #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Defs */ #define TRUE 1 #define FALSE 0 /* * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these * calls are not in getblk() and brelse() so that they would not be necessary * here. */ #ifndef B_VMIO #define vfs_busy_pages(bp, f) #endif static int nfsspec_read __P((struct vop_read_args *)); static int nfsspec_write __P((struct vop_write_args *)); static int nfsfifo_read __P((struct vop_read_args *)); static int nfsfifo_write __P((struct vop_write_args *)); static int nfsspec_close __P((struct vop_close_args *)); static int nfsfifo_close __P((struct vop_close_args *)); #define nfs_poll vop_nopoll static int nfs_flush __P((struct vnode *,struct ucred *,int,struct proc *,int)); static int nfs_setattrrpc __P((struct vnode *,struct vattr *,struct ucred *,struct proc *)); static int nfs_lookup __P((struct vop_lookup_args *)); static int nfs_create __P((struct vop_create_args *)); static int nfs_mknod __P((struct vop_mknod_args *)); static int nfs_open __P((struct vop_open_args *)); static int nfs_close __P((struct vop_close_args *)); static int nfs_access __P((struct vop_access_args *)); static int nfs_getattr __P((struct vop_getattr_args *)); static int nfs_setattr __P((struct vop_setattr_args *)); static int nfs_read __P((struct vop_read_args *)); static int nfs_fsync __P((struct vop_fsync_args *)); static int nfs_remove __P((struct vop_remove_args *)); static int nfs_link __P((struct vop_link_args *)); static int nfs_rename __P((struct vop_rename_args *)); static int nfs_mkdir __P((struct vop_mkdir_args *)); static int nfs_rmdir 
__P((struct vop_rmdir_args *)); static int nfs_symlink __P((struct vop_symlink_args *)); static int nfs_readdir __P((struct vop_readdir_args *)); static int nfs_bmap __P((struct vop_bmap_args *)); static int nfs_strategy __P((struct vop_strategy_args *)); static int nfs_lookitup __P((struct vnode *, const char *, int, struct ucred *, struct proc *, struct nfsnode **)); static int nfs_sillyrename __P((struct vnode *,struct vnode *,struct componentname *)); static int nfsspec_access __P((struct vop_access_args *)); static int nfs_readlink __P((struct vop_readlink_args *)); static int nfs_print __P((struct vop_print_args *)); static int nfs_advlock __P((struct vop_advlock_args *)); -static int nfs_bwrite __P((struct vop_bwrite_args *)); /* * Global vfs data structures for nfs */ vop_t **nfsv2_vnodeop_p; static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) nfs_access }, { &vop_advlock_desc, (vop_t *) nfs_advlock }, { &vop_bmap_desc, (vop_t *) nfs_bmap }, - { &vop_bwrite_desc, (vop_t *) nfs_bwrite }, { &vop_close_desc, (vop_t *) nfs_close }, { &vop_create_desc, (vop_t *) nfs_create }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_getpages_desc, (vop_t *) nfs_getpages }, { &vop_putpages_desc, (vop_t *) nfs_putpages }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) nfs_link }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_lookup_desc, (vop_t *) nfs_lookup }, { &vop_mkdir_desc, (vop_t *) nfs_mkdir }, { &vop_mknod_desc, (vop_t *) nfs_mknod }, { &vop_open_desc, (vop_t *) nfs_open }, { &vop_poll_desc, (vop_t *) nfs_poll }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfs_read }, { &vop_readdir_desc, (vop_t *) nfs_readdir }, { &vop_readlink_desc, (vop_t *) nfs_readlink }, { &vop_reclaim_desc, 
(vop_t *) nfs_reclaim }, { &vop_remove_desc, (vop_t *) nfs_remove }, { &vop_rename_desc, (vop_t *) nfs_rename }, { &vop_rmdir_desc, (vop_t *) nfs_rmdir }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_strategy_desc, (vop_t *) nfs_strategy }, { &vop_symlink_desc, (vop_t *) nfs_symlink }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) nfs_write }, { NULL, NULL } }; static struct vnodeopv_desc nfsv2_vnodeop_opv_desc = { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries }; VNODEOP_SET(nfsv2_vnodeop_opv_desc); /* * Special device vnode ops */ vop_t **spec_nfsv2nodeop_p; static struct vnodeopv_entry_desc nfsv2_specop_entries[] = { { &vop_default_desc, (vop_t *) spec_vnoperate }, { &vop_access_desc, (vop_t *) nfsspec_access }, { &vop_close_desc, (vop_t *) nfsspec_close }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfsspec_read }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) nfsspec_write }, { NULL, NULL } }; static struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = { &spec_nfsv2nodeop_p, nfsv2_specop_entries }; VNODEOP_SET(spec_nfsv2nodeop_opv_desc); vop_t **fifo_nfsv2nodeop_p; static struct vnodeopv_entry_desc nfsv2_fifoop_entries[] = { { &vop_default_desc, (vop_t *) fifo_vnoperate }, { &vop_access_desc, (vop_t *) nfsspec_access }, { &vop_close_desc, (vop_t *) nfsfifo_close }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t 
*) nfsfifo_read }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) nfsfifo_write }, { NULL, NULL } }; static struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = { &fifo_nfsv2nodeop_p, nfsv2_fifoop_entries }; VNODEOP_SET(fifo_nfsv2nodeop_opv_desc); static int nfs_mknodrpc __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap)); static int nfs_removerpc __P((struct vnode *dvp, const char *name, int namelen, struct ucred *cred, struct proc *proc)); static int nfs_renamerpc __P((struct vnode *fdvp, const char *fnameptr, int fnamelen, struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred, struct proc *proc)); static int nfs_renameit __P((struct vnode *sdvp, struct componentname *scnp, struct sillyrename *sp)); /* * Global variables */ extern u_int32_t nfs_true, nfs_false; extern u_int32_t nfs_xdrneg1; extern struct nfsstats nfsstats; extern nfstype nfsv3_type[9]; struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; int nfs_numasync = 0; #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) SYSCTL_DECL(_vfs_nfs); static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout"); static int nfsv3_commit_on_close = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW, &nfsv3_commit_on_close, 0, "write+commit on close, else only write"); #if 0 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD, &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count"); SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD, &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count"); #endif #define NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY \ | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE \ | NFSV3ACCESS_DELETE | 
NFSV3ACCESS_LOOKUP)

/*
 * Issue an NFSv3 ACCESS rpc for vp, asking the server about the access
 * bits in wmode on behalf of cred.
 *
 * When the request completes without error, the mode word taken from the
 * reply is recorded in the nfsnode (n_mode) together with the uid it was
 * obtained for (n_modeuid) and the current time (n_modestamp); nfs_access()
 * consults those three fields as its access cache.
 *
 * Returns the local `error', which starts at 0 and is never assigned
 * explicitly here -- presumably the nfsm_* macros set it on failure.
 */
static int
nfs3_access_otw(struct vnode *vp,
		int wmode,
		struct proc *p,
		struct ucred *cred)
{
	const int v3 = 1;
	u_int32_t *tl;
	int error = 0, attrflag;

	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	caddr_t bpos, dpos, cp2;
	register int32_t t1, t2;
	register caddr_t cp;
	u_int32_t rmode;
	struct nfsnode *np = VTONFS(vp);

	nfsstats.rpccnt[NFSPROC_ACCESS]++;
	/* Request body: file handle followed by one word of access bits. */
	nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED);
	nfsm_fhtom(vp, v3);
	nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
	*tl = txdr_unsigned(wmode);
	nfsm_request(vp, NFSPROC_ACCESS, p, cred);
	nfsm_postop_attr(vp, attrflag);
	if (!error) {
		/* Remember what was granted, for whom, and when. */
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		rmode = fxdr_unsigned(u_int32_t, *tl);
		np->n_mode = rmode;
		np->n_modeuid = cred->cr_uid;
		np->n_modestamp = time_second;
	}
	nfsm_reqdone;
	return error;
}

/*
 * nfs access vnode op.
 *
 * Write requests on a read-only mount are refused with EROFS for regular
 * files, directories and symlinks before anything else is looked at.
 *
 * For nfs version 3 the a_mode bits are translated to NFSV3ACCESS_* bits
 * and answered from the per-nfsnode access cache when it is fresh, was
 * filled for the same uid and grants every requested bit; otherwise an
 * ACCESS rpc is made via nfs3_access_otw().  If file modes are changed on
 * the server, accesses might still fail later.
 *
 * For nfs version 2 the check is handed to nfsspec_access(); in addition,
 * when the caller is uid 0 asking for VREAD on an object whose cached size
 * is non-zero, a small read / readdir / readlink rpc is issued as a probe
 * and its error becomes the result.
 */
static int
nfs_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int  a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	int error = 0;
	u_int32_t mode, wmode;
	int v3 = NFS_ISV3(vp);
	struct nfsnode *np = VTONFS(vp);

	/*
	 * Disallow write attempts on filesystems mounted read-only;
	 * unless the file is a socket, fifo, or a block or character
	 * device resident on the filesystem.
	 */
	if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
		switch (vp->v_type) {
		case VREG:
		case VDIR:
		case VLNK:
			return (EROFS);
		default:
			break;
		}
	}
	/*
	 * For nfs v3, check to see if we have done this recently, and if
	 * so return our cached result instead of making an ACCESS call.
	 * If not, do an access rpc, otherwise you are stuck emulating
	 * ufs_access() locally using the vattr. This may not be correct,
	 * since the server may apply other access criteria such as
	 * client uid-->server uid mapping that we do not know about.
	 */
	if (v3) {
		/* `mode' collects the bits this particular caller needs. */
		if (ap->a_mode & VREAD)
			mode = NFSV3ACCESS_READ;
		else
			mode = 0;
		if (vp->v_type != VDIR) {
			if (ap->a_mode & VWRITE)
				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND);
			if (ap->a_mode & VEXEC)
				mode |= NFSV3ACCESS_EXECUTE;
		} else {
			/* Directories: write also implies delete, exec means lookup. */
			if (ap->a_mode & VWRITE)
				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND |
					 NFSV3ACCESS_DELETE);
			if (ap->a_mode & VEXEC)
				mode |= NFSV3ACCESS_LOOKUP;
		}
		/* XXX safety belt, only make blanket request if caching */
		if (nfsaccess_cache_timeout > 0) {
			/* `wmode' is what goes on the wire: ask for everything. */
			wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY |
				NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE |
				NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP;
		} else {
			wmode = mode;
		}

		/*
		 * Does our cached result allow us to give a definite yes to
		 * this request?
		 */
		if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) &&
		    (ap->a_cred->cr_uid == np->n_modeuid) &&
		    ((np->n_mode & mode) == mode)) {
			nfsstats.accesscache_hits++;
		} else {
			/*
			 * Either a no, or a don't know.  Go to the wire.
			 */
			nfsstats.accesscache_misses++;
			error = nfs3_access_otw(vp, wmode, ap->a_p,ap->a_cred);
			if (!error) {
				/* The rpc refreshed n_mode; test it again. */
				if ((np->n_mode & mode) != mode) {
					error = EACCES;
				}
			}
		}
		return (error);
	} else {
		if ((error = nfsspec_access(ap)) != 0)
			return (error);

		/*
		 * Attempt to prevent a mapped root from accessing a file
		 * which it shouldn't.  We try to read a byte from the file
		 * if the user is root and the file is not zero length.
		 * After calling nfsspec_access, we should have the correct
		 * file size cached.
		 */
		if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD)
		    && VTONFS(vp)->n_size > 0) {
			struct iovec aiov;
			struct uio auio;
			char buf[1];

			/* One-byte kernel-space read request at offset 0. */
			aiov.iov_base = buf;
			aiov.iov_len = 1;
			auio.uio_iov = &aiov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = 0;
			auio.uio_resid = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = ap->a_p;

			if (vp->v_type == VREG)
				error = nfs_readrpc(vp, &auio, ap->a_cred);
			else if (vp->v_type == VDIR) {
				/* readdir is given a NFS_DIRBLKSIZ scratch buffer instead. */
				char* bp;
				bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
				aiov.iov_base = bp;
				aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
				error = nfs_readdirrpc(vp, &auio, ap->a_cred);
				free(bp, M_TEMP);
			} else if (vp->v_type == VLNK)
				error = nfs_readlinkrpc(vp, &auio, ap->a_cred);
			else
				error = EACCES;
		}
		return (error);
	}
}

/*
 * nfs open vnode op
 * Check to see if the type is ok
 * and that deletion is not in progress.
 * For paged in text files, you will need to flush the page cache
 * if consistency is lost.
 *
 * Only VREG, VDIR and VLNK vnodes may be opened; anything else gets EACCES.
 * Cached buffers are invalidated (nfs_vinvalbuf) when the NQNFS lease
 * revision changed, when the node is marked NMODIFIED, or when the
 * modification time returned by VOP_GETATTR differs from the one remembered
 * in n_mtime.  Returns 0, or the first error hit along the way; from
 * nfs_vinvalbuf() only EINTR is treated as fatal.
 */
/* ARGSUSED */
static int
nfs_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int  a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct vattr vattr;
	int error;

	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
#ifdef DIAGNOSTIC
		printf("open eacces vtyp=%d\n",vp->v_type);
#endif
		return (EACCES);
	}
	/*
	 * Get a valid lease. If cached data is stale, flush it.
	 */
	if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
			/* Retry for as long as the lease request reports expiry. */
			do {
				error = nqnfs_getlease(vp, ND_READ, ap->a_cred,
				    ap->a_p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred,
				    ap->a_p, 1)) == EINTR)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
	} else {
		if (np->n_flag & NMODIFIED) {
			/* Locally modified: write back, then refetch attributes. */
			if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred,
			    ap->a_p, 1)) == EINTR)
				return (error);
			np->n_attrstamp = 0;
			if (vp->v_type == VDIR)
				np->n_direofoffset = 0;
			error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			/* Clean: invalidate only if the mtime moved. */
			error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					np->n_direofoffset = 0;
				if ((error = nfs_vinvalbuf(vp, V_SAVE,
				    ap->a_cred, ap->a_p, 1)) == EINTR)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0)
		np->n_attrstamp = 0; /* For Open/Close consistency */
	return (0);
}

/*
 * nfs close vnode op
 * What an NFS client should do upon close after writing is a debatable issue.
 * Most NFS clients push delayed writes to the server upon close, basically for
 * two reasons:
 * 1 - So that any write errors may be reported back to the client process
 *     doing the close system call. By far the two most likely errors are
 *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
 * 2 - To put a worst case upper bound on cache inconsistency between
 *     multiple clients for the file.
 * There is also a consistency problem for Version 2 of the protocol w.r.t.
 * not being able to tell if other clients are writing a file concurrently,
 * since there is no way of knowing if the changed modify time in the reply
 * is only due to the write for this client.
* (NFS Version 3 provides weak cache consistency data in the reply that
 * should be sufficient to detect and handle this case.)
 *
 * The current code does the following:
 * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
 * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
 *                     or commit them (this satisfies 1 and 2 except for the
 *                     case where the server crashes after this close but
 *                     before the commit RPC, which is felt to be "good
 *                     enough". Changing the last argument to nfs_flush() to
 *                     a 1 would force a commit operation, if it is felt a
 *                     commit is necessary now.
 * for NQNFS         - do nothing now, since 2 is dealt with via leases and
 *                     1 should be dealt with via an fsync() system call for
 *                     cases where write errors are important.
 */
/*
 * Only regular files are touched.  Returns 0, the result of the
 * flush/invalidate, or -- taking precedence over both -- a write error
 * previously latched in the nfsnode (NWRITEERR / n_error), which is
 * cleared as it is reported.
 */
/* ARGSUSED */
static int
nfs_close(ap)
	struct vop_close_args /* {
		struct vnodeop_desc *a_desc;
		struct vnode *a_vp;
		int  a_fflag;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);
	int error = 0;

	if (vp->v_type == VREG) {
	    if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 &&
		(np->n_flag & NMODIFIED)) {
		if (NFS_ISV3(vp)) {
		    /*
		     * Under NFSv3 we have dirty buffers to dispose of.  We
		     * must flush them to the NFS server.  We have the option
		     * of waiting all the way through the commit rpc or just
		     * waiting for the initial write.  The default is to only
		     * wait through the initial write so the data is in the
		     * server's cache, which is roughly similar to the state
		     * a standard disk subsystem leaves the file in on close().
		     * The nfsv3_commit_on_close variable selects which of the
		     * two is done: non-zero passes 1 as nfs_flush()'s last
		     * argument.
		     *
		     * We cannot clear the NMODIFIED bit in np->n_flag due to
		     * potential races with other processes, and certainly
		     * cannot clear it if we don't commit.
		     */
		    int cm = nfsv3_commit_on_close ? 1 : 0;
		    error = nfs_flush(vp, ap->a_cred, MNT_WAIT, ap->a_p, cm);
		    /* np->n_flag &= ~NMODIFIED; */
		} else {
		    error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1);
		}
		/* Zero the attribute stamp (done for both v2 and v3). */
		np->n_attrstamp = 0;
	    }
	    if (np->n_flag & NWRITEERR) {
		/* Report the deferred write error once, then forget it. */
		np->n_flag &= ~NWRITEERR;
		error = np->n_error;
	    }
	}
	return (error);
}

/*
 * nfs getattr call from vfs.
 *
 * Answers from the attribute cache (nfs_getattrcache) when possible.  On a
 * miss over NFSv3 with the access cache enabled, an ACCESS rpc is made
 * first and the attribute cache is consulted a second time; its return
 * value is deliberately not examined.  NOTE(review): presumably the post-op
 * attributes carried by the ACCESS reply are what refill the attribute
 * cache -- confirm against nfsm_postop_attr.  Only if that still misses is
 * a GETATTR rpc sent.
 */
static int
nfs_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);
	register caddr_t cp;
	register u_int32_t *tl;
	register int32_t t1, t2;
	caddr_t bpos, dpos;
	int error = 0;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	int v3 = NFS_ISV3(vp);

	/*
	 * Update local times for special files.
	 */
	if (np->n_flag & (NACC | NUPD))
		np->n_flag |= NCHG;
	/*
	 * First look in the cache.
	 */
	if (nfs_getattrcache(vp, ap->a_vap) == 0)
		return (0);

	if (v3 && nfsaccess_cache_timeout > 0) {
		nfsstats.accesscache_misses++;
		nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_p, ap->a_cred);
		if (nfs_getattrcache(vp, ap->a_vap) == 0)
			return (0);
	}

	nfsstats.rpccnt[NFSPROC_GETATTR]++;
	nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3));
	nfsm_fhtom(vp, v3);
	nfsm_request(vp, NFSPROC_GETATTR, ap->a_p, ap->a_cred);
	if (!error) {
		nfsm_loadattr(vp, ap->a_vap);
	}
	nfsm_reqdone;
	return (error);
}

/*
 * nfs setattr call.
 *
 * Validates the request, adjusts the locally cached file size when the
 * size is being changed, then hands the attributes to nfs_setattrrpc().
 * If the rpc fails after a size change, the previous size (saved in tsize)
 * is restored in both the nfsnode and the pager.
 *
 * Returns EOPNOTSUPP for any attempt to set va_flags, EROFS on a read-only
 * mount, EISDIR for a size change on a directory, otherwise the result of
 * the flush or of the rpc.
 */
static int
nfs_setattr(ap)
	struct vop_setattr_args /* {
		struct vnodeop_desc *a_desc;
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);
	register struct vattr *vap = ap->a_vap;
	int error = 0;
	u_quad_t tsize;

#ifndef nolint
	tsize = (u_quad_t)0;
#endif

	/*
	 * Setting of flags is not supported.
	 */
	if (vap->va_flags != VNOVAL)
		return (EOPNOTSUPP);

	/*
	 * Disallow write attempts if the filesystem is mounted read-only.
	 * (The va_flags term below can no longer be true here because of the
	 * early return just above.)
	 */
	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	if (vap->va_size != VNOVAL) {
		switch (vp->v_type) {
		case VDIR:
			return (EISDIR);
		case VCHR:
		case VBLK:
		case VSOCK:
		case VFIFO:
			/*
			 * Size is meaningless for these types: succeed at once
			 * if nothing else was requested, otherwise drop the
			 * size and carry on with the remaining attributes.
			 */
			if (vap->va_mtime.tv_sec == VNOVAL &&
			    vap->va_atime.tv_sec == VNOVAL &&
			    vap->va_mode == (mode_t)VNOVAL &&
			    vap->va_uid == (uid_t)VNOVAL &&
			    vap->va_gid == (gid_t)VNOVAL)
				return (0);
			vap->va_size = VNOVAL;
			break;
		default:
			/*
			 * Disallow write attempts if the filesystem is
			 * mounted read-only.
			 */
			if (vp->v_mount->mnt_flag & MNT_RDONLY)
				return (EROFS);
			vnode_pager_setsize(vp, vap->va_size);
			if (np->n_flag & NMODIFIED) {
				/* Truncation to 0 discards; otherwise save first. */
				if (vap->va_size == 0)
					error = nfs_vinvalbuf(vp, 0,
					    ap->a_cred, ap->a_p, 1);
				else
					error = nfs_vinvalbuf(vp, V_SAVE,
					    ap->a_cred, ap->a_p, 1);
				if (error) {
					vnode_pager_setsize(vp, np->n_size);
					return (error);
				}
			}
			/* Remember the old size so a failed rpc can be undone. */
			tsize = np->n_size;
			np->n_size = np->n_vattr.va_size = vap->va_size;
		};
	} else if ((vap->va_mtime.tv_sec != VNOVAL ||
	    vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
	    vp->v_type == VREG &&
	    (error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred,
	    ap->a_p, 1)) == EINTR)
		return (error);
	error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_p);
	if (error && vap->va_size != VNOVAL) {
		np->n_size = np->n_vattr.va_size = tsize;
		vnode_pager_setsize(vp, np->n_size);
	}
	return (error);
}

/*
 * Do an nfs setattr rpc.
*/
/*
 * Builds and sends a SETATTR request for vp carrying the attributes in vap.
 *
 * NFSv3: the attributes are marshalled by nfsm_v3attrbuild() and followed
 * by one word set to nfs_false; the reply is consumed with nfsm_wcc_data().
 * NFSv2: an nfsv2_sattr record is filled in by hand, using nfs_xdrneg1 for
 * every mode/uid/gid field left at VNOVAL, and the reply attributes are
 * loaded with nfsm_loadattr().
 *
 * Returns the local `error', which is only ever changed by the macros.
 */
static int
nfs_setattrrpc(vp, vap, cred, procp)
	register struct vnode *vp;
	register struct vattr *vap;
	struct ucred *cred;
	struct proc *procp;
{
	/*
	 * Several of these are never named explicitly below; presumably they
	 * are referenced from inside the nfsm_* macros, so their names must
	 * stay as they are.
	 */
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	caddr_t bpos, dpos, cp2;
	register caddr_t cp;
	register int32_t t1, t2;
	u_int32_t *tl;
	register struct nfsv2_sattr *sattr;
	int v3 = NFS_ISV3(vp);
	int wccflag = NFSV3_WCCRATTR;
	int error = 0;

	nfsstats.rpccnt[NFSPROC_SETATTR]++;
	nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3));
	nfsm_fhtom(vp, v3);

	if (v3) {
		nfsm_v3attrbuild(vap, TRUE);
		nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
		*tl = nfs_false;
	} else {
		nfsm_build(sattr, struct nfsv2_sattr *, NFSX_V2SATTR);

		/* Fields the caller left unset go out as nfs_xdrneg1. */
		sattr->sa_mode = (vap->va_mode == (mode_t)VNOVAL) ?
		    nfs_xdrneg1 : vtonfsv2_mode(vp->v_type, vap->va_mode);
		sattr->sa_uid = (vap->va_uid == (uid_t)VNOVAL) ?
		    nfs_xdrneg1 : txdr_unsigned(vap->va_uid);
		sattr->sa_gid = (vap->va_gid == (gid_t)VNOVAL) ?
		    nfs_xdrneg1 : txdr_unsigned(vap->va_gid);

		sattr->sa_size = txdr_unsigned(vap->va_size);
		txdr_nfsv2time(&vap->va_atime, &sattr->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sattr->sa_mtime);
	}

	nfsm_request(vp, NFSPROC_SETATTR, procp, cred);
	if (v3) {
		nfsm_wcc_data(vp, wccflag);
	} else {
		nfsm_loadattr(vp, (struct vattr *)0);
	}
	nfsm_reqdone;
	return (error);
}

/*
 * nfs lookup call, one step at a time...
* First look in cache * If not found, unlock the directory nfsnode and do the rpc */ static int nfs_lookup(ap) struct vop_lookup_args /* { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; int flags = cnp->cn_flags; struct vnode *newvp; u_int32_t *tl; caddr_t cp; int32_t t1, t2; struct nfsmount *nmp; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; long len; nfsfh_t *fhp; struct nfsnode *np; int lockparent, wantparent, error = 0, attrflag, fhsize; int v3 = NFS_ISV3(dvp); struct proc *p = cnp->cn_proc; *vpp = NULLVP; cnp->cn_flags &= ~PDIRUNLOCK; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if (dvp->v_type != VDIR) return (ENOTDIR); lockparent = flags & LOCKPARENT; wantparent = flags & (LOCKPARENT|WANTPARENT); nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { struct vattr vattr; int vpid; if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p)) != 0) { *vpp = NULLVP; return (error); } newvp = *vpp; vpid = newvp->v_id; /* * See the comment starting `Step through' in ufs/ufs_lookup.c * for an explanation of the locking protocol */ if (dvp == newvp) { VREF(newvp); error = 0; } else if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); cnp->cn_flags |= PDIRUNLOCK; error = vget(newvp, LK_EXCLUSIVE, p); if (!error && lockparent && (flags & ISLASTCN)) { error = vn_lock(dvp, LK_EXCLUSIVE, p); if (error == 0) cnp->cn_flags &= ~PDIRUNLOCK; } } else { error = vget(newvp, LK_EXCLUSIVE, p); if (!lockparent || error || !(flags & ISLASTCN)) { VOP_UNLOCK(dvp, 0, p); cnp->cn_flags |= PDIRUNLOCK; } } if (!error) { if (vpid == newvp->v_id) { if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p) && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { 
nfsstats.lookupcache_hits++; if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; return (0); } cache_purge(newvp); } vput(newvp); if (lockparent && dvp != newvp && (flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, p); } error = vn_lock(dvp, LK_EXCLUSIVE, p); *vpp = NULLVP; if (error) { cnp->cn_flags |= PDIRUNLOCK; return (error); } cnp->cn_flags &= ~PDIRUNLOCK; } error = 0; newvp = NULLVP; nfsstats.lookupcache_misses++; nfsstats.rpccnt[NFSPROC_LOOKUP]++; len = cnp->cn_namelen; nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred); if (error) { nfsm_postop_attr(dvp, attrflag); m_freem(mrep); goto nfsmout; } nfsm_getfh(fhp, fhsize, v3); /* * Handle RENAME case... */ if (cnp->cn_nameiop == RENAME && wantparent && (flags & ISLASTCN)) { if (NFS_CMPFH(np, fhp, fhsize)) { m_freem(mrep); return (EISDIR); } error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, (struct vattr *)0); *vpp = newvp; m_freem(mrep); cnp->cn_flags |= SAVENAME; if (!lockparent) { VOP_UNLOCK(dvp, 0, p); cnp->cn_flags |= PDIRUNLOCK; } return (0); } if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); return (error); } newvp = NFSTOV(np); if (lockparent && (flags & ISLASTCN)) { error = vn_lock(dvp, LK_EXCLUSIVE, p); if (error) { cnp->cn_flags |= PDIRUNLOCK; vput(newvp); return (error); } } else cnp->cn_flags |= PDIRUNLOCK; } else if (NFS_CMPFH(np, fhp, fhsize)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { m_freem(mrep); return (error); } if (!lockparent || !(flags & ISLASTCN)) { cnp->cn_flags |= PDIRUNLOCK; 
VOP_UNLOCK(dvp, 0, p); } newvp = NFSTOV(np); } if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, (struct vattr *)0); if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; if ((cnp->cn_flags & MAKEENTRY) && (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) { np->n_ctime = np->n_vattr.va_ctime.tv_sec; cache_enter(dvp, newvp, cnp); } *vpp = newvp; nfsm_reqdone; if (error) { if (newvp != NULLVP) { vrele(newvp); *vpp = NULLVP; } if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN) && error == ENOENT) { if (!lockparent) { VOP_UNLOCK(dvp, 0, p); cnp->cn_flags |= PDIRUNLOCK; } if (dvp->v_mount->mnt_flag & MNT_RDONLY) error = EROFS; else error = EJUSTRETURN; } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; } return (error); } /* * nfs read call. * Just call nfs_bioread() to do the work. */ static int nfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; if (vp->v_type != VREG) return (EPERM); return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); } /* * nfs readlink call */ static int nfs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; if (vp->v_type != VLNK) return (EINVAL); return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* * Do a readlink rpc. * Called by nfs_doio() from below the buffer cache. 
*/
/*
 * The link text from the reply is copied into uiop.  If the server reports
 * a length of exactly NFS_MAXPATHLEN and the nfsnode has a smaller,
 * non-zero cached size, the cached size is used as the length instead.
 */
int
nfs_readlinkrpc(vp, uiop, cred)
	register struct vnode *vp;
	struct uio *uiop;
	struct ucred *cred;
{
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	int error = 0, len, attrflag;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	int v3 = NFS_ISV3(vp);

	nfsstats.rpccnt[NFSPROC_READLINK]++;
	nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3));
	nfsm_fhtom(vp, v3);
	nfsm_request(vp, NFSPROC_READLINK, uiop->uio_procp, cred);
	if (v3)
		nfsm_postop_attr(vp, attrflag);
	if (!error) {
		nfsm_strsiz(len, NFS_MAXPATHLEN);
		if (len == NFS_MAXPATHLEN) {
			struct nfsnode *np = VTONFS(vp);
			if (np->n_size && np->n_size < NFS_MAXPATHLEN)
				len = np->n_size;
		}
		nfsm_mtouio(uiop, len);
	}
	nfsm_reqdone;
	return (error);
}

/*
 * nfs read rpc call
 * Ditto above
 *
 * Reads uio_resid bytes starting at uio_offset into uiop, issuing one READ
 * rpc per chunk of at most nm_rsize bytes.  The loop ends when the request
 * is satisfied, when a v3 reply flags eof or carries no data, or when a v2
 * reply is shorter than what was asked for.  Returns EFBIG without any rpc
 * if the range extends beyond nm_maxfilesize.
 */
int
nfs_readrpc(vp, uiop, cred)
	register struct vnode *vp;
	struct uio *uiop;
	struct ucred *cred;
{
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	struct nfsmount *nmp;
	int error = 0, len, retlen, tsiz, eof, attrflag;
	int v3 = NFS_ISV3(vp);

#ifndef nolint
	eof = 0;
#endif
	nmp = VFSTONFS(vp->v_mount);
	tsiz = uiop->uio_resid;
	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
		return (EFBIG);
	while (tsiz > 0) {
		nfsstats.rpccnt[NFSPROC_READ]++;
		len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz;
		nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3);
		nfsm_fhtom(vp, v3);
		nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED * 3);
		if (v3) {
			/* Two words of offset (txdr_hyper), then the count. */
			txdr_hyper(uiop->uio_offset, tl);
			*(tl + 2) = txdr_unsigned(len);
		} else {
			/* One word of offset, the count, and a zero third word. */
			*tl++ = txdr_unsigned(uiop->uio_offset);
			*tl++ = txdr_unsigned(len);
			*tl = 0;
		}
		nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred);
		if (v3) {
			nfsm_postop_attr(vp, attrflag);
			if (error) {
				m_freem(mrep);
				goto nfsmout;
			}
			nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
			/* Second dissected word is the eof indication. */
			eof = fxdr_unsigned(int, *(tl + 1));
		} else
			nfsm_loadattr(vp, (struct vattr *)0);
		nfsm_strsiz(retlen, nmp->nm_rsize);
		nfsm_mtouio(uiop, retlen);
		m_freem(mrep);
		tsiz -= retlen;
		if (v3) {
			if (eof || retlen == 0)
				tsiz = 0;
		} else if (retlen < len)
			tsiz = 0;
	}
nfsmout:
	return (error);
}

/*
 * nfs write call
 *
 * Writes uio_resid bytes from uiop at uio_offset, one WRITE rpc per chunk
 * of at most nm_wsize bytes.
 *
 * iomode      in:  stability level sent with each v3 WRITE.
 *             out: the weakest commitment level any reply reported, forced
 *                  to NFSV3WRITE_FILESYNC on MNT_ASYNC mounts.  It is not
 *                  updated on the early EFBIG return.
 * must_commit out: set to 1 when a v3 reply carries a write verifier that
 *                  differs from the one saved in the mount (nm_verf).
 *
 * On error uio_resid is set to the number of bytes not yet accounted for.
 */
int
nfs_writerpc(vp, uiop, cred, iomode, must_commit)
	register struct vnode *vp;
	register struct uio *uiop;
	struct ucred *cred;
	int *iomode, *must_commit;
{
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2, backup;
	caddr_t bpos, dpos, cp2;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
	int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;

	/*
	 * NOTE(review): this check is compiled only when DIAGNOSTIC is NOT
	 * defined, the opposite of the `#ifdef DIAGNOSTIC' used in nfs_open()
	 * -- confirm whether that is intended.  The short-write handling
	 * below rewinds uio_iov[0] only, so it does rely on a single iovec.
	 */
#ifndef DIAGNOSTIC
	if (uiop->uio_iovcnt != 1)
		panic("nfs: writerpc iovcnt > 1");
#endif
	*must_commit = 0;
	tsiz = uiop->uio_resid;
	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
		return (EFBIG);
	while (tsiz > 0) {
		nfsstats.rpccnt[NFSPROC_WRITE]++;
		len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz;
		nfsm_reqhead(vp, NFSPROC_WRITE,
			NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
		nfsm_fhtom(vp, v3);
		if (v3) {
			/* offset (2 words), count, stability mode, data length */
			nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
			txdr_hyper(uiop->uio_offset, tl);
			tl += 2;
			*tl++ = txdr_unsigned(len);
			*tl++ = txdr_unsigned(*iomode);
			*tl = txdr_unsigned(len);
		} else {
			register u_int32_t x;

			nfsm_build(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
			/* Set both "begin" and "current" to non-garbage. */
			x = txdr_unsigned((u_int32_t)uiop->uio_offset);
			*tl++ = x;	/* "begin offset" */
			*tl++ = x;	/* "current offset" */
			x = txdr_unsigned(len);
			*tl++ = x;	/* total to this offset */
			*tl = x;	/* size of this write */
		}
		nfsm_uiotom(uiop, len);
		nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred);
		if (v3) {
			wccflag = NFSV3_WCCCHK;
			nfsm_wcc_data(vp, wccflag);
			if (!error) {
				nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED
					+ NFSX_V3WRITEVERF);
				rlen = fxdr_unsigned(int, *tl++);
				if (rlen == 0) {
					/* Server accepted nothing: give up. */
					error = NFSERR_IO;
					m_freem(mrep);
					break;
				} else if (rlen < len) {
					/*
					 * Short write: give the unwritten tail
					 * back to the uio so the next pass
					 * resends it.
					 */
					backup = len - rlen;
					uiop->uio_iov->iov_base -= backup;
					uiop->uio_iov->iov_len += backup;
					uiop->uio_offset -= backup;
					uiop->uio_resid += backup;
					len = rlen;
				}
				commit = fxdr_unsigned(int, *tl++);

				/*
				 * Return the lowest commitment level
				 * obtained by any of the RPCs.
				 */
				if (committed == NFSV3WRITE_FILESYNC)
					committed = commit;
				else if (committed == NFSV3WRITE_DATASYNC &&
					commit == NFSV3WRITE_UNSTABLE)
					committed = commit;
				/* Save the first verifier seen; flag any change. */
				if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
					NFSX_V3WRITEVERF);
				    nmp->nm_state |= NFSSTA_HASWRITEVERF;
				} else if (bcmp((caddr_t)tl,
				    (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) {
				    *must_commit = 1;
				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
					NFSX_V3WRITEVERF);
				}
			}
		} else
		    nfsm_loadattr(vp, (struct vattr *)0);
		if (wccflag)
		    VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.tv_sec;
		m_freem(mrep);
		if (error)
			break;
		tsiz -= len;
	}
nfsmout:
	if (vp->v_mount->mnt_flag & MNT_ASYNC)
		committed = NFSV3WRITE_FILESYNC;
	*iomode = committed;
	if (error)
		uiop->uio_resid = tsiz;
	return (error);
}

/*
 * nfs mknod rpc
 * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
 * mode set to specify the file type and the size field for rdev.
 *
 * Only VCHR, VBLK, VFIFO and VSOCK are accepted; any other va_type returns
 * EOPNOTSUPP.  On success the new vnode is stored in *vpp and, if MAKEENTRY
 * is set, entered into the name cache.  The directory is marked NMODIFIED
 * whether or not the rpc succeeded.
 */
static int
nfs_mknodrpc(dvp, vpp, cnp, vap)
	register struct vnode *dvp;
	register struct vnode **vpp;
	register struct componentname *cnp;
	register struct vattr *vap;
{
	register struct nfsv2_sattr *sp;
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	struct vnode *newvp = (struct vnode *)0;
	struct nfsnode *np = (struct nfsnode *)0;
	struct vattr vattr;
	char *cp2;
	caddr_t bpos, dpos;
	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	u_int32_t rdev;
	int v3 = NFS_ISV3(dvp);

	/* rdev is only consumed by the v2 branch below (as sa_size). */
	if (vap->va_type == VCHR || vap->va_type == VBLK)
		rdev = txdr_unsigned(vap->va_rdev);
	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
		rdev = nfs_xdrneg1;
	else {
		return (EOPNOTSUPP);
	}
	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) {
		return (error);
	}
	nfsstats.rpccnt[NFSPROC_MKNOD]++;
	nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED +
		+ nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
	if (v3) {
		nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
		*tl++ = vtonfsv3_type(vap->va_type);
		nfsm_v3attrbuild(vap, FALSE);
		if (vap->va_type == VCHR || vap->va_type == VBLK) {
			/* Device nodes also carry their major/minor numbers. */
			nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(umajor(vap->va_rdev));
			*tl = txdr_unsigned(uminor(vap->va_rdev));
		}
	} else {
		nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
		sp->sa_uid = nfs_xdrneg1;
		sp->sa_gid = nfs_xdrneg1;
		sp->sa_size = rdev;
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	}
	nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_proc, cnp->cn_cred);
	if (!error) {
		nfsm_mtofh(dvp, newvp, v3, gotvp);
		if (!gotvp) {
			/* No usable vnode from the reply: look the name up. */
			if (newvp) {
				vput(newvp);
				newvp = (struct vnode *)0;
			}
			error = nfs_lookitup(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np);
			if (!error)
				newvp = NFSTOV(np);
		}
	}
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	nfsm_reqdone;
	if (error) {
		if (newvp)
			vput(newvp);
	} else {
		if (cnp->cn_flags & MAKEENTRY)
			cache_enter(dvp, newvp, cnp);
		*vpp = newvp;
	}
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag)
		VTONFS(dvp)->n_attrstamp = 0;
	return (error);
}

/*
 * nfs mknod vop
 * just call nfs_mknodrpc() to do the work.
*/ /* ARGSUSED */ static int nfs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { return nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap); } static u_long create_verf; /* * nfs file create call */ static int nfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; struct nfsnode *np = (struct nfsnode *)0; struct vnode *newvp = (struct vnode *)0; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vattr vattr; int v3 = NFS_ISV3(dvp); /* * Oops, not for me.. */ if (vap->va_type == VSOCK) return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap)); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) { return (error); } if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= O_EXCL; again: nfsstats.rpccnt[NFSPROC_CREATE]++; nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); if (fmode & O_EXCL) { *tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE); nfsm_build(tl, u_int32_t *, NFSX_V3CREATEVERF); #ifdef INET if (!TAILQ_EMPTY(&in_ifaddrhead)) *tl++ = IA_SIN(in_ifaddrhead.tqh_first)->sin_addr.s_addr; else #endif *tl++ = create_verf; *tl = ++create_verf; } else { *tl = txdr_unsigned(NFSV3CREATE_UNCHECKED); nfsm_v3attrbuild(vap, FALSE); } } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid 
= nfs_xdrneg1; sp->sa_size = 0; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred); if (!error) { nfsm_mtofh(dvp, newvp, v3, gotvp); if (!gotvp) { if (newvp) { vput(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np); if (!error) newvp = NFSTOV(np); } } if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; if (error) { if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) { fmode &= ~O_EXCL; goto again; } if (newvp) vput(newvp); } else if (v3 && (fmode & O_EXCL)) { /* * We are normally called with only a partially initialized * VAP. Since the NFSv3 spec says that server may use the * file attributes to store the verifier, the spec requires * us to do a SETATTR RPC. FreeBSD servers store the verifier * in atime, but we can't really assume that all servers will * so we ensure that our SETATTR sets both atime and mtime. */ if (vap->va_mtime.tv_sec == VNOVAL) vfs_timestamp(&vap->va_mtime); if (vap->va_atime.tv_sec == VNOVAL) vap->va_atime = vap->va_mtime; error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_proc); } if (!error) { if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; } VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs file remove call * To try and make nfs semantics closer to ufs semantics, a file that has * other processes using the vnode is renamed instead of removed and then * removed later on the last close. 
* - If v_usecount > 1
 *	  If a rename is not already in the works
 *	     call nfs_sillyrename() to set it up
 *     else
 *	  do the remove rpc
 *
 * Directories are refused with EPERM.  The node's n_attrstamp is cleared
 * on every path so cached attributes are not trusted afterwards.
 */
static int
nfs_remove(ap)
	struct vop_remove_args /* {
		struct vnodeop_desc *a_desc;
		struct vnode * a_dvp;
		struct vnode * a_vp;
		struct componentname * a_cnp;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct vnode *dvp = ap->a_dvp;
	register struct componentname *cnp = ap->a_cnp;
	register struct nfsnode *np = VTONFS(vp);
	int error = 0;
	struct vattr vattr;

	/*
	 * NOTE(review): these sanity checks are compiled in when DIAGNOSTIC
	 * is *not* defined -- confirm whether #ifdef was intended.
	 */
#ifndef DIAGNOSTIC
	if ((cnp->cn_flags & HASBUF) == 0)
		panic("nfs_remove: no name");
	if (vp->v_usecount < 1)
		panic("nfs_remove: bad v_usecount");
#endif
	if (vp->v_type == VDIR)
		error = EPERM;
	/*
	 * Do a real remove when we hold the only reference, or when the
	 * file is already silly-renamed and still has more than one link.
	 */
	else if (vp->v_usecount == 1 || (np->n_sillyrename &&
	    VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_proc) == 0 &&
	    vattr.va_nlink > 1)) {
		/*
		 * Purge the name cache so that the chance of a lookup for
		 * the name succeeding while the remove is in progress is
		 * minimized. Without node locking it can still happen, such
		 * that an I/O op returns ESTALE, but since you get this if
		 * another host removes the file..
		 */
		cache_purge(vp);
		/*
		 * throw away biocache buffers, mainly to avoid
		 * unnecessary delayed writes later.
		 */
		error = nfs_vinvalbuf(vp, 0, cnp->cn_cred, cnp->cn_proc, 1);
		/* Do the rpc */
		if (error != EINTR)
			error = nfs_removerpc(dvp, cnp->cn_nameptr,
				cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc);
		/*
		 * Kludge City: If the first reply to the remove rpc is lost..
		 *   the reply to the retransmitted request will be ENOENT
		 *   since the file was in fact removed
		 *   Therefore, we cheat and return success.
		 */
		if (error == ENOENT)
			error = 0;
	} else if (!np->n_sillyrename)
		error = nfs_sillyrename(dvp, vp, cnp);
	np->n_attrstamp = 0;
	return (error);
}

/*
 * nfs file remove rpc called from nfs_inactive
 *
 * Removes the silly-renamed name recorded in sp (directory, name, length
 * and credentials all come from sp); no process is passed to the rpc.
 */
int
nfs_removeit(sp)
	register struct sillyrename *sp;
{

	return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred,
		(struct proc *)0));
}

/*
 * Nfs remove rpc, called from nfs_remove() and nfs_removeit().
 *
 * Builds and sends a REMOVE request for `name' (length `namelen') in
 * directory dvp.  The directory is always marked NMODIFIED afterwards,
 * and its cached attributes are invalidated when wccflag ends up zero.
 * Returns the rpc status.
 */
static int
nfs_removerpc(dvp, name, namelen, cred, proc)
	register struct vnode *dvp;
	const char *name;
	int namelen;
	struct ucred *cred;
	struct proc *proc;
{
	/* Several of these locals are used implicitly by the nfsm_* macros. */
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	int error = 0, wccflag = NFSV3_WCCRATTR;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	int v3 = NFS_ISV3(dvp);

	nfsstats.rpccnt[NFSPROC_REMOVE]++;
	nfsm_reqhead(dvp, NFSPROC_REMOVE,
		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen));
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(name, namelen, NFS_MAXNAMLEN);
	nfsm_request(dvp, NFSPROC_REMOVE, proc, cred);
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	nfsm_reqdone;
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag)
		VTONFS(dvp)->n_attrstamp = 0;
	return (error);
}

/*
 * nfs file rename call
 *
 * Renames fvp (in fdvp, name fcnp) to the name tcnp in tdvp, replacing
 * tvp if it exists.  Cross-mount renames fail with EXDEV.  On every path
 * through `out' the references/locks on tdvp, tvp, fdvp and fvp are
 * dropped here.
 */
static int
nfs_rename(ap)
	struct vop_rename_args  /* {
		struct vnode *a_fdvp;
		struct vnode *a_fvp;
		struct componentname *a_fcnp;
		struct vnode *a_tdvp;
		struct vnode *a_tvp;
		struct componentname *a_tcnp;
	} */ *ap;
{
	register struct vnode *fvp = ap->a_fvp;
	register struct vnode *tvp = ap->a_tvp;
	register struct vnode *fdvp = ap->a_fdvp;
	register struct vnode *tdvp = ap->a_tdvp;
	register struct componentname *tcnp = ap->a_tcnp;
	register struct componentname *fcnp = ap->a_fcnp;
	int error;

	/*
	 * NOTE(review): compiled in when DIAGNOSTIC is *not* defined --
	 * confirm whether #ifdef was intended.
	 */
#ifndef DIAGNOSTIC
	if ((tcnp->cn_flags & HASBUF) == 0 ||
	    (fcnp->cn_flags & HASBUF) == 0)
		panic("nfs_rename: no name");
#endif
	/* Check for cross-device rename */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out;
	}

	/*
	 * We have to flush B_DELWRI data prior to renaming
	 * the file.  If we don't, the delayed-write buffers
	 * can be flushed out later after the file has gone stale
	 * under NFSV3.  NFSV2 does not have this problem because
	 * ( as far as I can tell ) it flushes dirty buffers more
	 * often.
	 *
	 * The VOP_FSYNC results are not checked.
	 */

	VOP_FSYNC(fvp, fcnp->cn_cred, MNT_WAIT, fcnp->cn_proc);
	if (tvp)
	    VOP_FSYNC(tvp, tcnp->cn_cred, MNT_WAIT, tcnp->cn_proc);

	/*
	 * If the tvp exists and is in use, sillyrename it before doing the
	 * rename of the new file over it.
	 * XXX Can't sillyrename a directory.
	 *
	 * When the sillyrename succeeds (returns 0) tvp is released here and
	 * cleared so it is not released a second time at `out'.
	 */
	if (tvp && tvp->v_usecount > 1 && !VTONFS(tvp)->n_sillyrename &&
		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
		vput(tvp);
		tvp = NULL;
	}

	error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
		tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
		tcnp->cn_proc);

	if (fvp->v_type == VDIR) {
		if (tvp != NULL && tvp->v_type == VDIR)
			cache_purge(tdvp);
		cache_purge(fdvp);
	}

out:
	if (tdvp == tvp)
		vrele(tdvp);
	else
		vput(tdvp);
	if (tvp)
		vput(tvp);
	vrele(fdvp);
	vrele(fvp);
	/*
	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
	 */
	if (error == ENOENT)
		error = 0;
	return (error);
}

/*
 * nfs file rename rpc called from nfs_remove() above
 *
 * Renames the entry named by scnp to the silly name stored in sp, with
 * sdvp used as both the source and the target directory.
 */
static int
nfs_renameit(sdvp, scnp, sp)
	struct vnode *sdvp;
	struct componentname *scnp;
	register struct sillyrename *sp;
{
	return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen,
		sdvp, sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_proc));
}

/*
 * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
*/ static int nfs_renamerpc(fdvp, fnameptr, fnamelen, tdvp, tnameptr, tnamelen, cred, proc) register struct vnode *fdvp; const char *fnameptr; int fnamelen; register struct vnode *tdvp; const char *tnameptr; int tnamelen; struct ucred *cred; struct proc *proc; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(fdvp); nfsstats.rpccnt[NFSPROC_RENAME]++; nfsm_reqhead(fdvp, NFSPROC_RENAME, (NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) + nfsm_rndup(tnamelen)); nfsm_fhtom(fdvp, v3); nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN); nfsm_fhtom(tdvp, v3); nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN); nfsm_request(fdvp, NFSPROC_RENAME, proc, cred); if (v3) { nfsm_wcc_data(fdvp, fwccflag); nfsm_wcc_data(tdvp, twccflag); } nfsm_reqdone; VTONFS(fdvp)->n_flag |= NMODIFIED; VTONFS(tdvp)->n_flag |= NMODIFIED; if (!fwccflag) VTONFS(fdvp)->n_attrstamp = 0; if (!twccflag) VTONFS(tdvp)->n_attrstamp = 0; return (error); } /* * nfs hard link create call */ static int nfs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *tdvp = ap->a_tdvp; register struct componentname *cnp = ap->a_cnp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3; if (vp->v_mount != tdvp->v_mount) { return (EXDEV); } /* * Push all writes to the server, so that the attribute cache * doesn't get "out of sync" with the server. * XXX There should be a better way! 
*/ VOP_FSYNC(vp, cnp->cn_cred, MNT_WAIT, cnp->cn_proc); v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_LINK]++; nfsm_reqhead(vp, NFSPROC_LINK, NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_fhtom(vp, v3); nfsm_fhtom(tdvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); nfsm_request(vp, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred); if (v3) { nfsm_postop_attr(vp, attrflag); nfsm_wcc_data(tdvp, wccflag); } nfsm_reqdone; VTONFS(tdvp)->n_flag |= NMODIFIED; if (!attrflag) VTONFS(vp)->n_attrstamp = 0; if (!wccflag) VTONFS(tdvp)->n_attrstamp = 0; /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. */ if (error == EEXIST) error = 0; return (error); } /* * nfs symbolic link create call */ static int nfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vnode *newvp = (struct vnode *)0; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_SYMLINK]++; slen = strlen(ap->a_target); nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_v3attrbuild(vap, FALSE); } nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN); if (!v3) { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = nfs_xdrneg1; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } /* * 
Issue the NFS request and get the rpc response. * * Only NFSv3 responses returning an error of 0 actually return * a file handle that can be converted into newvp without having * to do an extra lookup rpc. */ nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_proc, cnp->cn_cred); if (v3) { if (error == 0) nfsm_mtofh(dvp, newvp, v3, gotvp); nfsm_wcc_data(dvp, wccflag); } /* * out code jumps -> here, mrep is also freed. */ nfsm_reqdone; /* * If we get an EEXIST error, silently convert it to no-error * in case of an NFS retry. */ if (error == EEXIST) error = 0; /* * If we do not have (or no longer have) an error, and we could * not extract the newvp from the response due to the request being * NFSv2 or the error being EEXIST. We have to do a lookup in order * to obtain a newvp to return. */ if (error == 0 && newvp == NULL) { struct nfsnode *np = NULL; error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np); if (!error) newvp = NFSTOV(np); } if (error) { if (newvp) vput(newvp); } else { *ap->a_vpp = newvp; } VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs make dir call */ static int nfs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; register int len; struct nfsnode *np = (struct nfsnode *)0; struct vnode *newvp = (struct vnode *)0; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; int gotvp = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vattr vattr; int v3 = NFS_ISV3(dvp); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) { return (error); } len = cnp->cn_namelen; nfsstats.rpccnt[NFSPROC_MKDIR]++; nfsm_reqhead(dvp, 
NFSPROC_MKDIR, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); if (v3) { nfsm_v3attrbuild(vap, FALSE); } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = nfs_xdrneg1; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_proc, cnp->cn_cred); if (!error) nfsm_mtofh(dvp, newvp, v3, gotvp); if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; /* * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry * if we can succeed in looking up the directory. */ if (error == EEXIST || (!error && !gotvp)) { if (newvp) { vrele(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred, cnp->cn_proc, &np); if (!error) { newvp = NFSTOV(np); if (newvp->v_type != VDIR) error = EEXIST; } } if (error) { if (newvp) vrele(newvp); } else *ap->a_vpp = newvp; return (error); } /* * nfs remove directory call */ static int nfs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *dvp = ap->a_dvp; register struct componentname *cnp = ap->a_cnp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(dvp); if (dvp == vp) return (EINVAL); nfsstats.rpccnt[NFSPROC_RMDIR]++; nfsm_reqhead(dvp, NFSPROC_RMDIR, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_proc, cnp->cn_cred); if (v3) 
nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; cache_purge(dvp); cache_purge(vp); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nfs readdir call */ static int nfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); register struct uio *uio = ap->a_uio; int tresid, error; struct vattr vattr; if (vp->v_type != VDIR) return (EPERM); /* * First, check for hit on the EOF offset cache */ if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset && (np->n_flag & NMODIFIED) == 0) { if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) { if (NQNFS_CKCACHABLE(vp, ND_READ)) { nfsstats.direofcache_hits++; return (0); } } else if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp) == 0 && np->n_mtime == vattr.va_mtime.tv_sec) { nfsstats.direofcache_hits++; return (0); } } /* * Call nfs_bioread() to do the real work. */ tresid = uio->uio_resid; error = nfs_bioread(vp, uio, 0, ap->a_cred); if (!error && uio->uio_resid == tresid) nfsstats.direofcache_misses++; return (error); } /* * Readdir rpc call. * Called from below the buffer cache by nfs_doio(). 
*/
/*
 * Fills the single iovec of uiop with struct dirent records built from
 * READDIR replies for directory vp, starting from the cookie stored for
 * uiop->uio_offset.  Records are padded so they never straddle a
 * DIRBLKSIZ boundary.  Returns NFSERR_BAD_COOKIE when no cookie is stored
 * for the starting offset, EBADRPC for an entry with a bad name length,
 * or the rpc status.
 */
int
nfs_readdirrpc(vp, uiop, cred)
	struct vnode *vp;
	register struct uio *uiop;
	struct ucred *cred;

{
	register int len, left;
	register struct dirent *dp = NULL;	/* last record written, if any */
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	register nfsuint64 *cookiep;
	caddr_t bpos, dpos, cp2;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	nfsuint64 cookie;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsnode *dnp = VTONFS(vp);
	u_quad_t fileno;
	/*
	 * blksiz: bytes used so far in the current DIRBLKSIZ block.
	 * bigenough: cleared once the next entry no longer fits in uiop.
	 */
	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
	int attrflag;
	int v3 = NFS_ISV3(vp);

	/*
	 * NOTE(review): compiled in when DIAGNOSTIC is *not* defined --
	 * confirm whether #ifdef was intended.
	 */
#ifndef DIAGNOSTIC
	if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
		(uiop->uio_resid & (DIRBLKSIZ - 1)))
		panic("nfs readdirrpc bad uio");
#endif

	/*
	 * If there is no cookie, assume directory was stale.
	 */
	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
	if (cookiep)
		cookie = *cookiep;
	else
		return (NFSERR_BAD_COOKIE);
	/*
	 * Loop around doing readdir rpc's of size nm_readdirsize
	 * truncated to a multiple of DIRBLKSIZ.
	 * The stopping criteria is EOF or buffer full.
	 */
	while (more_dirs && bigenough) {
		nfsstats.rpccnt[NFSPROC_READDIR]++;
		nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) +
			NFSX_READDIR(v3));
		nfsm_fhtom(vp, v3);
		/* v3: two cookie words + two verifier words; v2: one cookie word. */
		if (v3) {
			nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
			*tl++ = cookie.nfsuquad[0];
			*tl++ = cookie.nfsuquad[1];
			*tl++ = dnp->n_cookieverf.nfsuquad[0];
			*tl++ = dnp->n_cookieverf.nfsuquad[1];
		} else {
			nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
			*tl++ = cookie.nfsuquad[0];
		}
		*tl = txdr_unsigned(nmp->nm_readdirsize);
		nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred);
		if (v3) {
			nfsm_postop_attr(vp, attrflag);
			if (!error) {
				/* Remember the cookie verifier from the reply. */
				nfsm_dissect(tl, u_int32_t *,
				    2 * NFSX_UNSIGNED);
				dnp->n_cookieverf.nfsuquad[0] = *tl++;
				dnp->n_cookieverf.nfsuquad[1] = *tl;
			} else {
				m_freem(mrep);
				goto nfsmout;
			}
		}
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		more_dirs = fxdr_unsigned(int, *tl);

		/* loop thru the dir entries, doctoring them to 4bsd form */
		while (more_dirs && bigenough) {
			if (v3) {
				nfsm_dissect(tl, u_int32_t *,
				    3 * NFSX_UNSIGNED);
				fileno = fxdr_hyper(tl);
				len = fxdr_unsigned(int, *(tl + 2));
			} else {
				nfsm_dissect(tl, u_int32_t *,
				    2 * NFSX_UNSIGNED);
				fileno = fxdr_unsigned(u_quad_t, *tl++);
				len = fxdr_unsigned(int, *tl);
			}
			if (len <= 0 || len > NFS_MAXNAMLEN) {
				error = EBADRPC;
				m_freem(mrep);
				goto nfsmout;
			}
			tlen = nfsm_rndup(len);
			if (tlen == len)
				tlen += 4;	/* To ensure null termination */
			left = DIRBLKSIZ - blksiz;
			/*
			 * Entry will not fit in the current block: pad the
			 * previous record out to the block boundary.
			 */
			if ((tlen + DIRHDSIZ) > left) {
				dp->d_reclen += left;
				uiop->uio_iov->iov_base += left;
				uiop->uio_iov->iov_len -= left;
				uiop->uio_offset += left;
				uiop->uio_resid -= left;
				blksiz = 0;
			}
			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
				bigenough = 0;
			if (bigenough) {
				/* Build the dirent header in place... */
				dp = (struct dirent *)uiop->uio_iov->iov_base;
				dp->d_fileno = (int)fileno;
				dp->d_namlen = len;
				dp->d_reclen = tlen + DIRHDSIZ;
				dp->d_type = DT_UNKNOWN;
				blksiz += dp->d_reclen;
				if (blksiz == DIRBLKSIZ)
					blksiz = 0;
				uiop->uio_offset += DIRHDSIZ;
				uiop->uio_resid -= DIRHDSIZ;
				uiop->uio_iov->iov_base += DIRHDSIZ;
				uiop->uio_iov->iov_len -= DIRHDSIZ;
				/* ...then copy the name and skip the padding. */
				nfsm_mtouio(uiop, len);
				cp = uiop->uio_iov->iov_base;
				tlen -= len;
				*cp = '\0';	/* null terminate */
				uiop->uio_iov->iov_base += tlen;
				uiop->uio_iov->iov_len -= tlen;
				uiop->uio_offset += tlen;
				uiop->uio_resid -= tlen;
			} else
				nfsm_adv(nfsm_rndup(len));
			if (v3) {
				nfsm_dissect(tl, u_int32_t *,
				    3 * NFSX_UNSIGNED);
			} else {
				nfsm_dissect(tl, u_int32_t *,
				    2 * NFSX_UNSIGNED);
			}
			/* Only advance our cookie past entries we kept. */
			if (bigenough) {
				cookie.nfsuquad[0] = *tl++;
				if (v3)
					cookie.nfsuquad[1] = *tl++;
			} else if (v3)
				tl += 2;
			else
				tl++;
			more_dirs = fxdr_unsigned(int, *tl);
		}
		/*
		 * If at end of rpc data, get the eof boolean
		 */
		if (!more_dirs) {
			nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
			more_dirs = (fxdr_unsigned(int, *tl) == 0);
		}
		m_freem(mrep);
	}
	/*
	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
	 * by increasing d_reclen for the last record.
	 */
	if (blksiz > 0) {
		left = DIRBLKSIZ - blksiz;
		dp->d_reclen += left;
		uiop->uio_iov->iov_base += left;
		uiop->uio_iov->iov_len -= left;
		uiop->uio_offset += left;
		uiop->uio_resid -= left;
	}

	/*
	 * We are now either at the end of the directory or have filled the
	 * block.
	 */
	if (bigenough)
		dnp->n_direofoffset = uiop->uio_offset;
	else {
		if (uiop->uio_resid > 0)
			printf("EEK! readdirrpc resid > 0\n");
		/* Save the cookie for the offset where the next read starts. */
		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
		*cookiep = cookie;
	}
nfsmout:
	return (error);
}

/*
 * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc().
*/
/*
 * Like nfs_readdirrpc(), but uses READDIRPLUS: for every entry that
 * carries both attributes and a file handle, the node is obtained, its
 * attributes are loaded, d_type is filled in from them and a name cache
 * entry is added.  Returns NFSERR_BAD_COOKIE when no cookie is stored for
 * the starting offset, EBADRPC for a bad name length, or the rpc status.
 */
int
nfs_readdirplusrpc(vp, uiop, cred)
	struct vnode *vp;
	register struct uio *uiop;
	struct ucred *cred;
{
	register int len, left;
	register struct dirent *dp;
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	register struct vnode *newvp;
	register nfsuint64 *cookiep;
	/* dpossav*/ /* mdsav* save the mbuf parse position (see below). */
	caddr_t bpos, dpos, cp2, dpossav1, dpossav2;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2, *mdsav1, *mdsav2;
	struct nameidata nami, *ndp = &nami;
	struct componentname *cnp = &ndp->ni_cnd;
	nfsuint64 cookie;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsnode *dnp = VTONFS(vp), *np;
	nfsfh_t *fhp;
	u_quad_t fileno;
	int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
	int attrflag, fhsize;

#ifndef nolint
	dp = (struct dirent *)0;
#endif
	/*
	 * NOTE(review): compiled in when DIAGNOSTIC is *not* defined --
	 * confirm whether #ifdef was intended.
	 */
#ifndef DIAGNOSTIC
	if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
		(uiop->uio_resid & (DIRBLKSIZ - 1)))
		panic("nfs readdirplusrpc bad uio");
#endif
	ndp->ni_dvp = vp;
	newvp = NULLVP;

	/*
	 * If there is no cookie, assume directory was stale.
	 */
	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
	if (cookiep)
		cookie = *cookiep;
	else
		return (NFSERR_BAD_COOKIE);
	/*
	 * Loop around doing readdir rpc's of size nm_readdirsize
	 * truncated to a multiple of DIRBLKSIZ.
	 * The stopping criteria is EOF or buffer full.
	 */
	while (more_dirs && bigenough) {
		nfsstats.rpccnt[NFSPROC_READDIRPLUS]++;
		nfsm_reqhead(vp, NFSPROC_READDIRPLUS,
			NFSX_FH(1) + 6 * NFSX_UNSIGNED);
		nfsm_fhtom(vp, 1);
		/* cookie (2), cookie verifier (2), nm_readdirsize, nm_rsize */
 		nfsm_build(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
		*tl++ = cookie.nfsuquad[0];
		*tl++ = cookie.nfsuquad[1];
		*tl++ = dnp->n_cookieverf.nfsuquad[0];
		*tl++ = dnp->n_cookieverf.nfsuquad[1];
		*tl++ = txdr_unsigned(nmp->nm_readdirsize);
		*tl = txdr_unsigned(nmp->nm_rsize);
		nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_procp, cred);
		nfsm_postop_attr(vp, attrflag);
		if (error) {
			m_freem(mrep);
			goto nfsmout;
		}
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		dnp->n_cookieverf.nfsuquad[0] = *tl++;
		dnp->n_cookieverf.nfsuquad[1] = *tl++;
		more_dirs = fxdr_unsigned(int, *tl);

		/* loop thru the dir entries, doctoring them to 4bsd form */
		while (more_dirs && bigenough) {
			nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
			fileno = fxdr_hyper(tl);
			len = fxdr_unsigned(int, *(tl + 2));
			if (len <= 0 || len > NFS_MAXNAMLEN) {
				error = EBADRPC;
				m_freem(mrep);
				goto nfsmout;
			}
			tlen = nfsm_rndup(len);
			if (tlen == len)
				tlen += 4;	/* To ensure null termination */
			left = DIRBLKSIZ - blksiz;
			/*
			 * Entry will not fit in the current block: pad the
			 * previous record out to the block boundary.
			 */
			if ((tlen + DIRHDSIZ) > left) {
				dp->d_reclen += left;
				uiop->uio_iov->iov_base += left;
				uiop->uio_iov->iov_len -= left;
				uiop->uio_offset += left;
				uiop->uio_resid -= left;
				blksiz = 0;
			}
			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
				bigenough = 0;
			if (bigenough) {
				dp = (struct dirent *)uiop->uio_iov->iov_base;
				dp->d_fileno = (int)fileno;
				dp->d_namlen = len;
				dp->d_reclen = tlen + DIRHDSIZ;
				dp->d_type = DT_UNKNOWN;
				blksiz += dp->d_reclen;
				if (blksiz == DIRBLKSIZ)
					blksiz = 0;
				uiop->uio_offset += DIRHDSIZ;
				uiop->uio_resid -= DIRHDSIZ;
				uiop->uio_iov->iov_base += DIRHDSIZ;
				uiop->uio_iov->iov_len -= DIRHDSIZ;
				/*
				 * Point the componentname at the name as it
				 * lands in the caller's buffer, for the
				 * cache_enter() below.
				 */
				cnp->cn_nameptr = uiop->uio_iov->iov_base;
				cnp->cn_namelen = len;
				nfsm_mtouio(uiop, len);
				cp = uiop->uio_iov->iov_base;
				tlen -= len;
				*cp = '\0';
				uiop->uio_iov->iov_base += tlen;
				uiop->uio_iov->iov_len -= tlen;
				uiop->uio_offset += tlen;
				uiop->uio_resid -= tlen;
			} else
				nfsm_adv(nfsm_rndup(len));
			nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
			/* Only advance our cookie past entries we kept. */
			if (bigenough) {
				cookie.nfsuquad[0] = *tl++;
				cookie.nfsuquad[1] = *tl++;
			} else
				tl += 2;

			/*
			 * Since the attributes are before the file handle
			 * (sigh), we must skip over the attributes and then
			 * come back and get them.
			 */
			attrflag = fxdr_unsigned(int, *tl);
			if (attrflag) {
			    /* Remember where the attributes start. */
			    dpossav1 = dpos;
			    mdsav1 = md;
			    nfsm_adv(NFSX_V3FATTR);
			    nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
			    doit = fxdr_unsigned(int, *tl);
			    if (doit) {
				nfsm_getfh(fhp, fhsize, 1);
				if (NFS_CMPFH(dnp, fhp, fhsize)) {
				    /* Entry is the directory itself. */
				    VREF(vp);
				    newvp = vp;
				    np = dnp;
				} else {
				    error = nfs_nget(vp->v_mount, fhp,
					fhsize, &np);
				    if (error)
					doit = 0;
				    else
					newvp = NFSTOV(np);
				}
			    }
			    if (doit && bigenough) {
				/*
				 * Rewind to the saved attribute position,
				 * load them, then restore the position just
				 * past the file handle.
				 */
				dpossav2 = dpos;
				dpos = dpossav1;
				mdsav2 = md;
				md = mdsav1;
				nfsm_loadattr(newvp, (struct vattr *)0);
				dpos = dpossav2;
				md = mdsav2;
				dp->d_type =
				    IFTODT(VTTOIF(np->n_vattr.va_type));
				ndp->ni_vp = newvp;
			        cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp);
			    }
			} else {
			    /* Just skip over the file handle */
			    nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
			    i = fxdr_unsigned(int, *tl);
			    nfsm_adv(nfsm_rndup(i));
			}
			/* Drop the per-entry vnode before the next entry. */
			if (newvp != NULLVP) {
			    if (newvp == vp)
				vrele(newvp);
			    else
				vput(newvp);
			    newvp = NULLVP;
			}
			nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
			more_dirs = fxdr_unsigned(int, *tl);
		}
		/*
		 * If at end of rpc data, get the eof boolean
		 */
		if (!more_dirs) {
			nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
			more_dirs = (fxdr_unsigned(int, *tl) == 0);
		}
		m_freem(mrep);
	}
	/*
	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
	 * by increasing d_reclen for the last record.
	 */
	if (blksiz > 0) {
		left = DIRBLKSIZ - blksiz;
		dp->d_reclen += left;
		uiop->uio_iov->iov_base += left;
		uiop->uio_iov->iov_len -= left;
		uiop->uio_offset += left;
		uiop->uio_resid -= left;
	}

	/*
	 * We are now either at the end of the directory or have filled the
	 * block.
	 */
	if (bigenough)
		dnp->n_direofoffset = uiop->uio_offset;
	else {
		if (uiop->uio_resid > 0)
			printf("EEK! readdirplusrpc resid > 0\n");
		/* Save the cookie for the offset where the next read starts. */
		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
		*cookiep = cookie;
	}
nfsmout:
	/* Error exits can arrive here with a per-entry vnode still held. */
	if (newvp != NULLVP) {
	        if (newvp == vp)
			vrele(newvp);
		else
			vput(newvp);
		newvp = NULLVP;
	}
	return (error);
}

/*
 * Silly rename. To make the NFS filesystem that is stateless look a little
 * more like the "ufs" a remove of an active vnode is translated to a rename
 * to a funny looking filename that is removed by nfs_inactive on the
 * nfsnode. There is the potential for another process on a different client
 * to create the same funny name between the nfs_lookitup() fails and the
 * nfs_rename() completes, but...
 *
 * On success the sillyrename record is attached to vp's nfsnode and 0 is
 * returned; on failure the record is torn down and the error returned.
 */
static int
nfs_sillyrename(dvp, vp, cnp)
	struct vnode *dvp, *vp;
	struct componentname *cnp;
{
	register struct sillyrename *sp;
	struct nfsnode *np;
	int error;
	short pid;

	cache_purge(dvp);
	np = VTONFS(vp);
	/*
	 * NOTE(review): compiled in when DIAGNOSTIC is *not* defined --
	 * confirm whether #ifdef was intended.
	 */
#ifndef DIAGNOSTIC
	if (vp->v_type == VDIR)
		panic("nfs: sillyrename dir");
#endif
	MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename),
		M_NFSREQ, M_WAITOK);
	/* The record keeps its own credential copy and directory reference. */
	sp->s_cred = crdup(cnp->cn_cred);
	sp->s_dvp = dvp;
	VREF(dvp);

	/* Fudge together a funny name */
	pid = cnp->cn_proc->p_pid;
	sp->s_namlen = sprintf(sp->s_name, ".nfsA%04x4.4", pid);

	/*
	 * Try lookitups until we get one that isn't there; the fifth
	 * character is stepped from 'A' and we give up after 'z'.
	 */
	while (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
		cnp->cn_proc, (struct nfsnode **)0) == 0) {
		sp->s_name[4]++;
		if (sp->s_name[4] > 'z') {
			error = EINVAL;
			goto bad;
		}
	}
	error = nfs_renameit(dvp, cnp, sp);
	if (error)
		goto bad;
	/*
	 * np is non-NULL here, so this lookup updates the file handle held
	 * in the existing nfsnode.  Its result is not checked.
	 */
	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
		cnp->cn_proc, &np);
	np->n_sillyrename = sp;
	return (0);
bad:
	vrele(sp->s_dvp);
	crfree(sp->s_cred);
	free((caddr_t)sp, M_NFSREQ);
	return (error);
}

/*
 * Look up a file name and optionally either update the file handle or
 * allocate an nfsnode, depending on the value of npp.
* npp == NULL --> just do the lookup * *npp == NULL --> allocate a new nfsnode and make sure attributes are * handled too * *npp != NULL --> update the file handle in the vnode */ static int nfs_lookitup(dvp, name, len, cred, procp, npp) register struct vnode *dvp; const char *name; int len; struct ucred *cred; struct proc *procp; struct nfsnode **npp; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; struct vnode *newvp = (struct vnode *)0; struct nfsnode *np, *dnp = VTONFS(dvp); caddr_t bpos, dpos, cp2; int error = 0, fhlen, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; nfsfh_t *nfhp; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_LOOKUP]++; nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(dvp, v3); nfsm_strtom(name, len, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_LOOKUP, procp, cred); if (npp && !error) { nfsm_getfh(nfhp, fhlen, v3); if (*npp) { np = *npp; if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) { free((caddr_t)np->n_fhp, M_NFSBIGFH); np->n_fhp = &np->n_fh; } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH) np->n_fhp =(nfsfh_t *)malloc(fhlen,M_NFSBIGFH,M_WAITOK); bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen); np->n_fhsize = fhlen; newvp = NFSTOV(np); } else if (NFS_CMPFH(dnp, nfhp, fhlen)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); } if (v3) { nfsm_postop_attr(newvp, attrflag); if (!attrflag && *npp == NULL) { m_freem(mrep); if (newvp == dvp) vrele(newvp); else vput(newvp); return (ENOENT); } } else nfsm_loadattr(newvp, (struct vattr *)0); } nfsm_reqdone; if (npp && *npp == NULL) { if (error) { if (newvp) { if (newvp == dvp) vrele(newvp); else vput(newvp); } } else *npp = np; } return (error); } /* * Nfs Version 3 commit rpc */ int nfs_commit(vp, offset, cnt, cred, procp) struct vnode *vp; u_quad_t offset; int cnt; struct ucred *cred; struct proc *procp; { 
register caddr_t cp; register u_int32_t *tl; register int32_t t1, t2; register struct nfsmount *nmp = VFSTONFS(vp->v_mount); caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) return (0); nfsstats.rpccnt[NFSPROC_COMMIT]++; nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1)); nfsm_fhtom(vp, 1); nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); nfsm_request(vp, NFSPROC_COMMIT, procp, cred); nfsm_wcc_data(vp, wccflag); if (!error) { nfsm_dissect(tl, u_int32_t *, NFSX_V3WRITEVERF); if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl, NFSX_V3WRITEVERF)) { bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); error = NFSERR_STALEWRITEVERF; } } nfsm_reqdone; return (error); } /* * Kludge City.. * - make nfs_bmap() essentially a no-op that does no translation * - do nfs_strategy() by doing I/O with nfs_readrpc/nfs_writerpc * (Maybe I could use the process's page mapping, but I was concerned that * Kernel Write might not be enabled and also figured copyout() would do * a lot more work than bcopy() and also it currently happens in the * context of the swapper process (2). */ static int nfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { register struct vnode *vp = ap->a_vp; if (ap->a_vpp != NULL) *ap->a_vpp = vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } /* * Strategy routine. * For async requests when nfsiod(s) are running, queue the request by * calling nfs_asyncio(), otherwise just all nfs_doio() to do the * request. 
*/ static int nfs_strategy(ap) struct vop_strategy_args *ap; { register struct buf *bp = ap->a_bp; struct ucred *cr; struct proc *p; int error = 0; KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp)); KASSERT(BUF_REFCNT(bp) > 0, ("nfs_strategy: buffer %p not locked", bp)); if (bp->b_flags & B_PHYS) panic("nfs physio"); if (bp->b_flags & B_ASYNC) p = (struct proc *)0; else p = curproc; /* XXX */ if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. */ if ((bp->b_flags & B_ASYNC) == 0 || nfs_asyncio(bp, NOCRED, p)) error = nfs_doio(bp, cr, p); return (error); } /* * fsync vnode op. Just call nfs_flush() with commit == 1. */ /* ARGSUSED */ static int nfs_fsync(ap) struct vop_fsync_args /* { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct proc * a_p; } */ *ap; { return (nfs_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_p, 1)); } /* * Flush all the blocks associated with a vnode. * Walk through the buffer pool and push any dirty pages * associated with the vnode. 
*/ static int nfs_flush(vp, cred, waitfor, p, commit) register struct vnode *vp; struct ucred *cred; int waitfor; struct proc *p; int commit; { register struct nfsnode *np = VTONFS(vp); register struct buf *bp; register int i; struct buf *nbp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos; int passone = 1; u_quad_t off, endoff, toff; struct ucred* wcred = NULL; struct buf **bvec = NULL; #ifndef NFS_COMMITBVECSIZ #define NFS_COMMITBVECSIZ 20 #endif struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; int bvecsize = 0, bveccount; if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; if (!commit) passone = 0; /* * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the * server, but nas not been committed to stable storage on the server * yet. On the first pass, the byte range is worked out and the commit * rpc is done. On the second pass, nfs_writebp() is called to do the * job. */ again: off = (u_quad_t)-1; endoff = 0; bvecpos = 0; if (NFS_ISV3(vp) && commit) { s = splbio(); /* * Count up how many buffers waiting for a commit. */ bveccount = 0; for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bveccount++; } /* * Allocate space to remember the list of bufs to commit. It is * important to use M_NOWAIT here to avoid a race with nfs_write. * If we can't get memory (for whatever reason), we will end up * committing the buffers one-by-one in the loop below. 
*/ if (bveccount > NFS_COMMITBVECSIZ) { if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); bvec = (struct buf **) malloc(bveccount * sizeof(struct buf *), M_TEMP, M_NOWAIT); if (bvec == NULL) { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } else bvecsize = bveccount; } else { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bvecpos >= bvecsize) break; if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_NEEDCOMMIT) || BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; bremfree(bp); /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. * * NOTE: we are not clearing B_DONE here, so we have * to do it later on in this routine if we intend to * initiate I/O on the bp. * * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ if (wcred == NULL) wcred = bp->b_wcred; else if (wcred != bp->b_wcred) wcred = NOCRED; bp->b_flags |= B_WRITEINPROG; vfs_busy_pages(bp, 1); /* * bp is protected by being locked, but nbp is not * and vfs_busy_pages() may sleep. We have to * recalculate nbp. */ nbp = TAILQ_NEXT(bp, b_vnbufs); /* * A list of these buffers is kept so that the * second loop knows which buffers have actually * been committed. This is necessary, since there * may be a race between the commit rpc and new * uncommitted writes on the file. */ bvec[bvecpos++] = bp; toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; if (toff < off) off = toff; toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); if (toff > endoff) endoff = toff; } splx(s); } if (bvecpos > 0) { /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with * one call for all of them, otherwise commit each one * separately. 
*/ if (wcred != NOCRED) retv = nfs_commit(vp, off, (int)(endoff - off), wcred, p); else { retv = 0; for (i = 0; i < bvecpos; i++) { off_t off, size; bp = bvec[i]; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; size = (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); retv = nfs_commit(vp, off, (int)size, bp->b_wcred, p); if (retv) break; } } if (retv == NFSERR_STALEWRITEVERF) nfs_clearcommit(vp->v_mount); /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit * succeeded. */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK); if (retv) { /* * Error, leave B_DELWRI intact */ vfs_unbusy_pages(bp); brelse(bp); } else { /* * Success, remove B_DELWRI ( bundirty() ). * * b_dirtyoff/b_dirtyend seem to be NFS * specific. We should probably move that * into bundirty(). XXX */ s = splbio(); vp->v_numoutput++; bp->b_flags |= B_ASYNC; bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_dirtyoff = bp->b_dirtyend = 0; splx(s); bufdone(bp); } } } /* * Start/do any write(s) that are required. 
*/ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { if (waitfor != MNT_WAIT || passone) continue; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "nfsfsync", slpflag, slptimeo); splx(s); if (error == 0) panic("nfs_fsync: inconsistent lock"); if (error == ENOLCK) goto loop; if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } goto loop; } if ((bp->b_flags & B_DELWRI) == 0) panic("nfs_fsync: not dirty"); if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) { BUF_UNLOCK(bp); continue; } bremfree(bp); if (passone || !commit) bp->b_flags |= B_ASYNC; else bp->b_flags |= B_ASYNC | B_WRITEINPROG; splx(s); BUF_WRITE(bp); goto loop; } splx(s); if (passone) { passone = 0; goto again; } if (waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); if (error) { if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } } if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) && commit) { goto loop; } } if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } done: if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); return (error); } /* * NFS advisory byte-level locks. * Currently unsupported. */ static int nfs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * The following kludge is to allow diskless support to work * until a real NFS lockd is implemented. Basically, just pretend * that this is a local lock. */ return (lf_advlock(ap, &(np->n_lockf), np->n_size)); } /* * Print out the contents of an nfsnode. 
*/
/*
 * nfs_print: print the vnode tag together with the file id and fsid taken
 * from the nfsnode's cached attributes (n_vattr), add fifo details for
 * VFIFO vnodes, and finish the line.  Always returns 0.
 *
 * NOTE(review): the format uses %ld and %x; confirm these match the
 * declared types of va_fileid and va_fsid.
 */
static int
nfs_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);

	printf("tag VT_NFS, fileid %ld fsid 0x%x",
		np->n_vattr.va_fileid, np->n_vattr.va_fsid);
	if (vp->v_type == VFIFO)
		fifo_printinfo(vp);
	printf("\n");
	return (0);
}

/*
- * Just call nfs_writebp() with the force argument set to 1.
- *
- * NOTE: B_DONE may or may not be set in a_bp on call.
- */
-static int
-nfs_bwrite(ap)
-	struct vop_bwrite_args /* {
-		struct vnode *a_bp;
-	} */ *ap;
-{
-	return (nfs_writebp(ap->a_bp, 1, curproc));
-}
-
-/*
- * This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
- * the force flag is one and it also handles the B_NEEDCOMMIT flag.  We set
- * B_CACHE if this is a VMIO buffer.
+ * This is the "real" nfs::bwrite(struct buf*).
+ * B_WRITEINPROG isn't set unless the force flag is one and it
+ * handles the B_NEEDCOMMIT flag.
+ * We set B_CACHE if this is a VMIO buffer.
 */
/*
 * nfs_writebp(bp, force, procp)
 *
 *	bp	locked buffer to write; the function panics if it is not
 *		locked.
 *	force	non-zero sets B_WRITEINPROG before the strategy call.
 *	procp	not referenced in this body; the block-output accounting
 *		below is charged to curproc instead.
 *
 * Returns 0 for an invalid (B_INVAL) buffer, which is simply released, and
 * for a buffer that was B_ASYNC on entry.  For a synchronous buffer the
 * value of bufwait() is returned after the buffer has been released.
 */
int
nfs_writebp(bp, force, procp)
	register struct buf *bp;
	int force;
	struct proc *procp;
{
	int s;
	/* Snapshot of the flags on entry; consulted after the I/O is started. */
	int oldflags = bp->b_flags;
#if 0
	int retv = 1;
	off_t off;
#endif

	if (BUF_REFCNT(bp) == 0)
		panic("bwrite: buffer is not locked???");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return(0);
	}

	bp->b_flags |= B_CACHE;

	/*
	 * Undirty the bp.  We will redirty it later if the I/O fails.
	 */

	s = splbio();
	bundirty(bp);
	bp->b_flags &= ~B_DONE;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_iocmd = BIO_WRITE;

	/* Account for the write on the vnode and in the process statistics. */
	bp->b_vp->v_numoutput++;
	curproc->p_stats->p_ru.ru_oublock++;
	splx(s);

	/*
	 * Note: to avoid loopback deadlocks, we do not
	 * assign b_runningbufspace.
	 */
	vfs_busy_pages(bp, 1);

	if (force)
		bp->b_flags |= B_WRITEINPROG;
	BUF_KERNPROC(bp);
	BUF_STRATEGY(bp);

	/* Synchronous write: wait for completion and release the buffer. */
	if( (oldflags & B_ASYNC) == 0) {
		int rtval = bufwait(bp);

		if (oldflags & B_DELWRI) {
			s = splbio();
			reassignbuf(bp, bp->b_vp);
			splx(s);
		}

		brelse(bp);
		return (rtval);
	}

	return (0);
}

/*
 * nfs special file access vnode op.
* Essentially just get vattr and then imitate iaccess() since the device is * local to the client. */ static int nfsspec_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vattr *vap; register gid_t *gp; register struct ucred *cred = ap->a_cred; struct vnode *vp = ap->a_vp; mode_t mode = ap->a_mode; struct vattr vattr; register int i; int error; /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * If you're the super-user, * you always get access. */ if (cred->cr_uid == 0) return (0); vap = &vattr; error = VOP_GETATTR(vp, vap, cred, ap->a_p); if (error) return (error); /* * Access check is based on only one of owner, group, public. * If not owner, then check group. If not a member of the * group, then check public access. */ if (cred->cr_uid != vap->va_uid) { mode >>= 3; gp = cred->cr_groups; for (i = 0; i < cred->cr_ngroups; i++, gp++) if (vap->va_gid == *gp) goto found; mode >>= 3; found: ; } error = (vap->va_mode & mode) == mode ? 0 : EACCES; return (error); } /* * Read wrapper for special devices. */ static int nfsspec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set access flag. */ np->n_flag |= NACC; getnanotime(&np->n_atim); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for special devices. */ static int nfsspec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. 
*/ np->n_flag |= NUPD; getnanotime(&np->n_mtim); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for special devices. * * Update the times on the nfsnode then do device close. */ static int nfsspec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); struct vattr vattr; if (np->n_flag & (NACC | NUPD)) { np->n_flag |= NCHG; if (vp->v_usecount == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); } } return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifos. */ static int nfsfifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set access flag. */ np->n_flag |= NACC; getnanotime(&np->n_atim); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for fifos. */ static int nfsfifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. */ np->n_flag |= NUPD; getnanotime(&np->n_mtim); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for fifos. * * Update the times on the nfsnode then do fifo close. 
*/
/*
 * nfsfifo_close: when the nfsnode carries NACC and/or NUPD, stamp the
 * corresponding time(s) with the current time and mark the node NCHG.  If
 * this is the only use of the vnode and the mount is not read-only, the
 * recorded times are also sent with VOP_SETATTR (its result is ignored).
 * The fifo close operation is then invoked and its result returned.
 */
static int
nfsfifo_close(ap)
	struct vop_close_args /* {
		struct vnode *a_vp;
		int  a_fflag;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);
	struct vattr vattr;
	struct timespec ts;

	if (np->n_flag & (NACC | NUPD)) {
		/* One timestamp is taken and shared by both times. */
		getnanotime(&ts);
		if (np->n_flag & NACC)
			np->n_atim = ts;
		if (np->n_flag & NUPD)
			np->n_mtim = ts;
		np->n_flag |= NCHG;
		if (vp->v_usecount == 1 &&
		    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
			/* Start from an all-unset vattr; fill in only the times. */
			VATTR_NULL(&vattr);
			if (np->n_flag & NACC)
				vattr.va_atime = np->n_atim;
			if (np->n_flag & NUPD)
				vattr.va_mtime = np->n_mtim;
			(void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p);
		}
	}
	return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap));
}
Index: head/sys/nfsclient/nfs_bio.c
===================================================================
--- head/sys/nfsclient/nfs_bio.c	(revision 75579)
+++ head/sys/nfsclient/nfs_bio.c	(revision 75580)
@@ -1,1604 +1,1623 @@
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +/* + * Just call nfs_writebp() with the force argument set to 1. + * + * NOTE: B_DONE may or may not be set in a_bp on call. + */ +static int +nfs_bwrite(struct buf *bp) +{ + return (nfs_writebp(bp, 1, curproc)); +} + +struct buf_ops buf_ops_nfs = { + "buf_ops_nfs", + nfs_bwrite +}; + + static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, struct proc *p)); extern int nfs_numasync; extern int nfs_pbuf_freecnt; extern struct nfsstats nfsstats; /* * Vnode op for VM getpages. 
*/
/*
 * nfs_getpages: fill the a_count bytes covered by the page array a_m by
 * reading from the server with nfs_readrpc(), starting at the file offset
 * of the first page.
 *
 * Returns VM_PAGER_ERROR when the vnode has no VM object, or when the read
 * failed without transferring anything.  Returns 0 otherwise, including
 * the early exit taken when the requested page (a_reqpage) already has
 * valid bits.  In every path the requested page itself is left to the
 * caller; only the other pages are freed, queued or woken here.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_reqpage;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_page_t *pages;

	vp = ap->a_vp;
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if (vp->v_object == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	/* On a v3 mount, fetch the fsinfo once if it has not been seen yet. */
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		if (m->valid != 0) {
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vnode_pager_freepage(pages[i]);
			}
			return(0);
		}
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	/* Map the pages at the pbuf's address and describe them with a uio. */
	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = p;

	error = nfs_readrpc(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	/* Total failure: nothing was read at all. */
	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vnode_pager_freepage(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;

	/* toff/nextoff are the byte offsets of page i within the transfer. */
	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
		}

		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Result:
			 * It appears that empirical results show that
			 * deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vnode_pager_freepage(m);
			}
		}
	}
	return 0;
}

/*
 * Vnode op for VM putpages.
*/ int nfs_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; } */ *ap; { struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; int iomode, must_commit, i, error, npages, count; off_t offset; int *rtvals; struct vnode *vp; struct proc *p; struct ucred *cred; struct nfsmount *nmp; struct nfsnode *np; vm_page_t *pages; vp = ap->a_vp; np = VTONFS(vp); p = curproc; /* XXX */ cred = curproc->p_ucred; /* XXX */ nmp = VFSTONFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); offset = IDX_TO_OFF(pages[0]->pindex); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, p); for (i = 0; i < npages; i++) { rtvals[i] = VM_PAGER_AGAIN; } /* * When putting pages, do not extend file past EOF. */ if (offset + count > np->n_size) { count = np->n_size - offset; if (count < 0) count = 0; } /* * We use only the kva address for the buffer, but this is extremely * convienient and fast. 
*/ bp = getpbuf(&nfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_procp = p; if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit); pmap_qremove(kva, npages); relpbuf(bp, &nfs_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; for (i = 0; i < nwritten; i++) { rtvals[i] = VM_PAGER_OK; vm_page_undirty(pages[i]); } if (must_commit) nfs_clearcommit(vp->v_mount); } return rtvals[0]; } /* * Vnode op for read using bio */ int nfs_bioread(vp, uio, ioflag, cred) register struct vnode *vp; register struct uio *uio; int ioflag; struct ucred *cred; { register struct nfsnode *np = VTONFS(vp); register int biosize, i; struct buf *bp = 0, *rabp; struct vattr vattr; struct proc *p; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn, rabn; int bcount; int seqcount; int nra, error = 0, n = 0, on = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("nfs_read mode"); #endif if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ return (EINVAL); p = uio->uio_procp; if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, p); if (vp->v_type != VDIR && (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) return (EFBIG); biosize = vp->v_mount->mnt_stat.f_iosize; seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE); /* * For nfs, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is * believed to be compatible with the reference port. * For nqnfs, full cache consistency is maintained within the loop. 
* For nfs: * If the file's modify time on the server has changed since the * last read rpc or you have written to the file, * you may have lost data cache consistency with the * server, so flush all of the file's data out of the cache. * Then force a getattr rpc to ensure that you have up to date * attributes. * NB: This implies that cache data can be read when up to * NFS_ATTRTIMEO seconds out of date. If you find that you need current * attributes this could be forced by setting n_attrstamp to 0 before * the VOP_GETATTR() call. */ if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { if (np->n_flag & NMODIFIED) { if (vp->v_type != VREG) { if (vp->v_type != VDIR) panic("nfs: bioread, not dir"); nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); } np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, p); if (error) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, cred, p); if (error) return (error); if (np->n_mtime != vattr.va_mtime.tv_sec) { if (vp->v_type == VDIR) nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } } } do { /* * Get a valid lease. If cached data is stale, flush it. 
*/ if (nmp->nm_flag & NFSMNT_NQNFS) { if (NQNFS_CKINVALID(vp, np, ND_READ)) { do { error = nqnfs_getlease(vp, ND_READ, cred, p); } while (error == NQNFS_EXPIRED); if (error) return (error); if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE) || ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { if (vp->v_type == VDIR) nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); np->n_brev = np->n_lrev; } } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); } } if (np->n_flag & NQNFSNONCACHE) { switch (vp->v_type) { case VREG: return (nfs_readrpc(vp, uio, cred)); case VLNK: return (nfs_readlinkrpc(vp, uio, cred)); case VDIR: break; default: printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type); }; } switch (vp->v_type) { case VREG: nfsstats.biocache_reads++; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); /* * Start the read ahead(s), as required. */ if (nfs_numasync > 0 && nmp->nm_readahead > 0) { for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { rabn = lbn + 1 + nra; if (!incore(vp, rabn)) { rabp = nfs_getcacheblk(vp, rabn, biosize, p); if (!rabp) return (EINTR); if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred, p)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); break; } } else { brelse(rabp); } } } } /* * Obtain the buffer cache block. Figure out the buffer size * when we are at EOF. If we are modifying the size of the * buffer based on an EOF condition we need to hold * nfs_rslock() through obtaining the buffer to prevent * a potential writer-appender from messing with n_size. * Otherwise we may accidently truncate the buffer and * lose dirty data. 
* * Note that bcount is *not* DEV_BSIZE aligned. */ again: bcount = biosize; if ((off_t)lbn * biosize >= np->n_size) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > np->n_size) { bcount = np->n_size - (off_t)lbn * biosize; } if (bcount != biosize) { switch(nfs_rslock(np, p)) { case ENOLCK: goto again; /* not reached */ case EINTR: case ERESTART: return(EINTR); /* not reached */ default: break; } } bp = nfs_getcacheblk(vp, lbn, bcount, p); if (bcount != biosize) nfs_rsunlock(np, p); if (!bp) return (EINTR); /* * If B_CACHE is not set, we must issue the read. If this * fails, we return an error. */ if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { brelse(bp); return (error); } } /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount) n = min((unsigned)(bcount - on), uio->uio_resid); break; case VLNK: nfsstats.biocache_readlinks++; bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); if (!bp) return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { bp->b_ioflags |= BIO_ERROR; brelse(bp); return (error); } } n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); on = 0; break; case VDIR: nfsstats.biocache_readdirs++; if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { return (0); } lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p); if (!bp) return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { brelse(bp); } while (error == NFSERR_BAD_COOKIE) { printf("got bad cookie vp %p bp %p\n", vp, bp); nfs_invaldir(vp); error = nfs_vinvalbuf(vp, 0, 
cred, p, 1); /* * Yuck! The directory has been modified on the * server. The only way to get the block is by * reading from the beginning to get all the * offset cookies. * * Leave the last bp intact unless there is an error. * Loop back up to the while if the error is another * NFSERR_BAD_COOKIE (double yuch!). */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) return (0); bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p); if (!bp) return (EINTR); if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); /* * no error + B_INVAL == directory EOF, * use the block. */ if (error == 0 && (bp->b_flags & B_INVAL)) break; } /* * An error will throw away the block and the * for loop will break out. If no error and this * is not the block we want, we throw away the * block and go for the next one via the for loop. */ if (error || i < lbn) brelse(bp); } } /* * The above while is repeated if we hit another cookie * error. If we hit an error and it wasn't a cookie error, * we give up. */ if (error) return (error); } /* * If not eof and read aheads are enabled, start one. * (You need the current block first, so that you have the * directory offset cookie of the next block.) 
*/ if (nfs_numasync > 0 && nmp->nm_readahead > 0 && (bp->b_flags & B_INVAL) == 0 && (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && !(np->n_flag & NQNFSNONCACHE) && !incore(vp, lbn + 1)) { rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p); if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (nfs_asyncio(rabp, cred, p)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); } } else { brelse(rabp); } } } /* * Unlike VREG files, whos buffer size ( bp->b_bcount ) is * chopped for the EOF condition, we cannot tell how large * NFS directories are going to be until we hit EOF. So * an NFS directory buffer is *not* chopped to its EOF. Now, * it just so happens that b_resid will effectively chop it * to EOF. *BUT* this information is lost if the buffer goes * away and is reconstituted into a B_CACHE state ( due to * being VMIO ) later. So we keep track of the directory eof * in np->n_direofoffset and chop it off as an extra step * right here. */ n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) n = np->n_direofoffset - uio->uio_offset; break; default: printf(" nfs_bioread: type %x unexpected\n",vp->v_type); break; }; if (n > 0) { error = uiomove(bp->b_data + on, (int)n, uio); } switch (vp->v_type) { case VREG: break; case VLNK: n = 0; break; case VDIR: /* * Invalidate buffer if caching is disabled, forcing a * re-read from the remote later. 
*/ if (np->n_flag & NQNFSNONCACHE) bp->b_flags |= B_INVAL; break; default: printf(" nfs_bioread: type %x unexpected\n",vp->v_type); } brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n > 0); return (error); } /* * Vnode op for write using bio */ int nfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int biosize; struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; int bcount; int n, on, error = 0, iomode, must_commit; int haverslock = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("nfs_write mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("nfs_write proc"); #endif if (vp->v_type != VREG) return (EIO); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; return (np->n_error); } if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, p); /* * Synchronously flush pending buffers if we are in synchronous * mode or if we are appending. */ if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { np->n_attrstamp = 0; error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) return (error); } } /* * If IO_APPEND then load uio_offset. We restart here if we cannot * get the append lock. 
*/ restart: if (ioflag & IO_APPEND) { np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, p); if (error) return (error); uio->uio_offset = np->n_size; } if (uio->uio_offset < 0) return (EINVAL); if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) return (EFBIG); if (uio->uio_resid == 0) return (0); /* * We need to obtain the rslock if we intend to modify np->n_size * in order to guarentee the append point with multiple contending * writers, to guarentee that no other appenders modify n_size * while we are trying to obtain a truncated buffer (i.e. to avoid * accidently truncating data written by another appender due to * the race), and to ensure that the buffer is populated prior to * our extending of the file. We hold rslock through the entire * operation. * * Note that we do not synchronize the case where someone truncates * the file while we are appending to it because attempting to lock * this case may deadlock other parts of the system unexpectedly. */ if ((ioflag & IO_APPEND) || uio->uio_offset + uio->uio_resid > np->n_size) { switch(nfs_rslock(np, p)) { case ENOLCK: goto restart; /* not reached */ case EINTR: case ERESTART: return(EINTR); /* not reached */ default: break; } haverslock = 1; } /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, i don't think it matters */ if (p && uio->uio_offset + uio->uio_resid > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { PROC_LOCK(p); psignal(p, SIGXFSZ); PROC_UNLOCK(p); if (haverslock) nfs_rsunlock(np, p); return (EFBIG); } biosize = vp->v_mount->mnt_stat.f_iosize; do { /* * Check for a valid write lease. 
*/ if ((nmp->nm_flag & NFSMNT_NQNFS) && NQNFS_CKINVALID(vp, np, ND_WRITE)) { do { error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); if (error) break; if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) break; np->n_brev = np->n_lrev; } } if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { iomode = NFSV3WRITE_FILESYNC; error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); if (must_commit) nfs_clearcommit(vp->v_mount); break; } nfsstats.biocache_writes++; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize-1); n = min((unsigned)(biosize - on), uio->uio_resid); again: /* * Handle direct append and file extension cases, calculate * unaligned buffer size. */ if (uio->uio_offset == np->n_size && n) { /* * Get the buffer (in its pre-append state to maintain * B_CACHE if it was previously set). Resize the * nfsnode after we have locked the buffer to prevent * readers from reading garbage. */ bcount = on; bp = nfs_getcacheblk(vp, lbn, bcount, p); if (bp != NULL) { long save; np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); save = bp->b_flags & B_CACHE; bcount += n; allocbuf(bp, bcount); bp->b_flags |= save; + bp->b_magic = B_MAGIC_NFS; + bp->b_op = &buf_ops_nfs; } } else { /* * Obtain the locked cache block first, and then * adjust the file's size as appropriate. */ bcount = on + n; if ((off_t)lbn * biosize + bcount < np->n_size) { if ((off_t)(lbn + 1) * biosize < np->n_size) bcount = biosize; else bcount = np->n_size - (off_t)lbn * biosize; } bp = nfs_getcacheblk(vp, lbn, bcount, p); if (uio->uio_offset + n > np->n_size) { np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); } } if (!bp) { error = EINTR; break; } /* * Issue a READ if B_CACHE is not set. 
In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ if (on == 0 && n == bcount) { bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(bp, cred, p); if (error) { brelse(bp); break; } } if (!bp) { error = EINTR; break; } if (bp->b_wcred == NOCRED) { crhold(cred); bp->b_wcred = cred; } np->n_flag |= NMODIFIED; /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { printf("NFS append race @%lx:%d\n", (long)bp->b_blkno * DEV_BSIZE, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. * * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. 
* * as an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { if (BUF_WRITE(bp) == EINTR) return (EINTR); goto again; } /* * Check for valid write lease and get one as required. * In case getblk() and/or bwrite() delayed us. */ if ((nmp->nm_flag & NFSMNT_NQNFS) && NQNFS_CKINVALID(vp, np, ND_WRITE)) { do { error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); if (error) { brelse(bp); break; } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { brelse(bp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); if (error) break; np->n_brev = np->n_lrev; goto again; } } error = uiomove((char *)bp->b_data + on, n, uio); /* * Since this block is being modified, it must be written * again and not just committed. Since write clustering does * not work for the stage 1 data write, only the stage 2 * commit rpc, we have to clear B_CLUSTEROK as well. */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (error) { bp->b_ioflags |= BIO_ERROR; brelse(bp); break; } /* * Only update dirtyoff/dirtyend if not a degenerate * condition. */ if (n) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = min(on, bp->b_dirtyoff); bp->b_dirtyend = max((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_validclean(bp, on, n); } /* * If the lease is non-cachable or IO_SYNC do bwrite(). * * IO_INVAL appears to be unused. The idea appears to be * to turn off caching in this case. Very odd. 
XXX */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = BUF_WRITE(bp);
			if (error)
				break;
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
			}
		} else if ((n + on) == biosize &&
		    (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np, p);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 *
 * Arguments:
 *	vp	vnode handed to getblk(); its v_mount selects the nfsmount.
 *	bn	block number handed to getblk().
 *	size	buffer size handed to getblk().
 *	p	process handed to nfs_sigintr() on NFSMNT_INT mounts.
 *
 * Returns the buffer obtained from getblk(), or a null pointer when
 * nfs_sigintr() reports non-zero on an NFSMNT_INT mount.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		/*
		 * The first attempt passes PCATCH with no timeout.  Each
		 * retry is made only after nfs_sigintr() returned zero, and
		 * passes no flag but a 2 * hz timeout.
		 */
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else {
		/*
		 * No flag and no timeout; the result is used below without
		 * a null check.
		 * NOTE(review): presumably getblk() cannot return a null
		 * pointer with these arguments -- confirm.
		 */
		bp = getblk(vp, bn, size, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		/*
		 * For regular files, express the block number in units of
		 * DEV_BSIZE: bn counts blocks of mnt_stat.f_iosize bytes.
		 */
		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
*/
/*
 * Arguments:
 *	vp, flags, cred, p	handed through to vinvalbuf(); p is also
 *				passed to nfs_sigintr().
 *	intrflg			non-zero requests interruptible behaviour;
 *				it is forced to 0 unless the mount has
 *				NFSMNT_INT set.
 *
 * Returns 0 when vinvalbuf() has succeeded, or immediately when VXLOCK is
 * set on the vnode.  Returns EINTR when intrflg is in effect and
 * nfs_sigintr() reports non-zero while waiting for another flusher or
 * after a failed vinvalbuf().
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	/* Nothing is flushed when VXLOCK is set; success is reported. */
	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	/*
	 * Interruptible: PCATCH for the first vinvalbuf() call and a
	 * 2 * hz timeout for the sleeps below.  Otherwise no flag and no
	 * timeout.
	 */
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		/* Ask the current flusher to wakeup() us when it is done. */
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg &&
		    nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	/*
	 * Retry until vinvalbuf() succeeds.  Retries pass no sleep flag but
	 * use slptimeo.  Before each retry an interruptible caller checks
	 * nfs_sigintr() and, if it reports non-zero, drops NFLUSHINPROG,
	 * wakes any waiter and gives up with EINTR.
	 */
	while (error) {
		if (intrflg &&
		    nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	/* Flush done: clear NMODIFIED with NFLUSHINPROG and wake waiters. */
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(bp, cred, procp)
	register struct buf *bp;
	struct ucred *cred;
	struct proc *procp;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	/*
	 * If no async daemons then return EIO to force caller to run the rpc
	 * synchronously.
*/ if (nfs_numasync == 0) return (EIO); nmp = VFSTONFS(bp->b_vp->v_mount); /* * Commits are usually short and sweet so lets save some cpu and * leave the async daemons for more important rpc's (such as reads * and writes). */ if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) && (nmp->nm_bufqiods > nfs_numasync / 2)) { return(EIO); } again: if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; gotiod = FALSE; /* * Find a free iod to process this request. */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) if (nfs_iodwant[i]) { /* * Found one, so wake it up and tell it which * mount to process. */ NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n", i, nmp)); nfs_iodwant[i] = (struct proc *)0; nfs_iodmount[i] = nmp; nmp->nm_bufqiods++; wakeup((caddr_t)&nfs_iodwant[i]); gotiod = TRUE; break; } /* * If none are free, we may already have an iod working on this mount * point. If so, it will process our request. */ if (!gotiod) { if (nmp->nm_bufqiods > 0) { NFS_DPF(ASYNCIO, ("nfs_asyncio: %d iods are already processing mount %p\n", nmp->nm_bufqiods, nmp)); gotiod = TRUE; } } /* * If we have an iod which can process the request, then queue * the buffer. */ if (gotiod) { /* * Ensure that the queue never grows too large. We still want * to asynchronize so we block rather then return EIO. */ while (nmp->nm_bufqlen >= 2*nfs_numasync) { NFS_DPF(ASYNCIO, ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); nmp->nm_bufqwant = TRUE; error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, "nfsaio", slptimeo); if (error) { if (nfs_sigintr(nmp, NULL, procp)) return (EINTR); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } /* * We might have lost our iod while sleeping, * so check and loop if nescessary. 
*/ if (nmp->nm_bufqiods == 0) { NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); goto again; } } if (bp->b_iocmd == BIO_READ) { if (bp->b_rcred == NOCRED && cred != NOCRED) { crhold(cred); bp->b_rcred = cred; } } else { bp->b_flags |= B_WRITEINPROG; if (bp->b_wcred == NOCRED && cred != NOCRED) { crhold(cred); bp->b_wcred = cred; } } BUF_KERNPROC(bp); TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); nmp->nm_bufqlen++; return (0); } /* * All the iods are busy on other mounts, so return EIO to * force the caller to process the i/o synchronously. */ NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); return (EIO); } /* * Do an I/O operation to/from a cache block. This may be called * synchronously or from an nfsiod. */ int nfs_doio(bp, cr, p) struct buf *bp; struct ucred *cr; struct proc *p; { struct uio *uiop; struct vnode *vp; struct nfsnode *np; struct nfsmount *nmp; int error = 0, iomode, must_commit = 0; struct uio uio; struct iovec io; vp = bp->b_vp; np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = p; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. */ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); /* * Historically, paging was done with physio, but no more. */ if (bp->b_flags & B_PHYS) { /* * ...though reading /dev/drum still gets us here. 
*/ io.iov_len = uiop->uio_resid = bp->b_bcount; /* mapping was done by vmapbuf() */ io.iov_base = bp->b_data; uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; if (bp->b_iocmd == BIO_READ) { uiop->uio_rw = UIO_READ; nfsstats.read_physios++; error = nfs_readrpc(vp, uiop, cr); } else { int com; iomode = NFSV3WRITE_DATASYNC; uiop->uio_rw = UIO_WRITE; nfsstats.write_physios++; error = nfs_writerpc(vp, uiop, cr, &iomode, &com); } if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else if (bp->b_iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; nfsstats.read_bios++; error = nfs_readrpc(vp, uiop, cr); if (!error) { if (uiop->uio_resid) { /* * If we had a short read with no error, we must have * hit a file hole. We should zero-fill the remainder. * This can also occur if the server hits the file EOF. * * Holes used to be able to occur due to pending * writes, but that is not possible any longer. 
*/ int nread = bp->b_bcount - uiop->uio_resid; int left = bp->b_bcount - nread; if (left > 0) bzero((char *)bp->b_data + nread, left); uiop->uio_resid = 0; } } if (p && (vp->v_flag & VTEXT) && (((nmp->nm_flag & NFSMNT_NQNFS) && NQNFS_CKINVALID(vp, np, ND_READ) && np->n_lrev != np->n_brev) || (!(nmp->nm_flag & NFSMNT_NQNFS) && np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { uprintf("Process killed due to text file modification\n"); PROC_LOCK(p); psignal(p, SIGKILL); _PHOLD(p); PROC_UNLOCK(p); } break; case VLNK: uiop->uio_offset = (off_t)0; nfsstats.readlink_bios++; error = nfs_readlinkrpc(vp, uiop, cr); break; case VDIR: nfsstats.readdir_bios++; uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; if (nmp->nm_flag & NFSMNT_RDIRPLUS) { error = nfs_readdirplusrpc(vp, uiop, cr); if (error == NFSERR_NOTSUPP) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = nfs_readdirrpc(vp, uiop, cr); /* * end-of-directory sets B_INVAL but does not generate an * error. 
*/ if (error == 0 && uiop->uio_resid == bp->b_bcount) bp->b_flags |= B_INVAL; break; default: printf("nfs_doio: type %x unexpected\n",vp->v_type); break; }; if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * If we only need to commit, try to commit */ if (bp->b_flags & B_NEEDCOMMIT) { int retv; off_t off; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; bp->b_flags |= B_WRITEINPROG; retv = nfs_commit( bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, bp->b_wcred, p); bp->b_flags &= ~B_WRITEINPROG; if (retv == 0) { bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); bp->b_resid = 0; bufdone(bp); return (0); } if (retv == NFSERR_STALEWRITEVERF) { nfs_clearcommit(bp->b_vp->v_mount); } } /* * Setup for actual write */ if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; nfsstats.write_bios++; if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; bp->b_flags |= B_WRITEINPROG; error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); /* * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try * to cluster the buffers needing commit. This will allow * the system to submit a single commit rpc for the whole * cluster. We can do this even if the buffer is not 100% * dirty (relative to the NFS blocksize), so we optimize the * append-to-file-case. * * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be * cleared because write clustering only works for commit * rpc's, not for the data portion of the write). 
*/ if (!error && iomode == NFSV3WRITE_UNSTABLE) { bp->b_flags |= B_NEEDCOMMIT; if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; } else { bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } bp->b_flags &= ~B_WRITEINPROG; /* * For an interrupted write, the buffer is still valid * and the write hasn't been pushed to the server yet, * so we can't set BIO_ERROR and report the interruption * by setting B_EINTR. For the B_ASYNC case, B_EINTR * is not relevant, so the rpc attempt is essentially * a noop. For the case of a V3 write rpc not being * committed to stable storage, the block is still * dirty and requires either a commit rpc or another * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. * * If the buffer is marked B_PAGING, it does not reside on * the vp's paging queues so we cannot call bdirty(). The * bp in this case is not an NFS cache block so we should * be safe. XXX */ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; s = splbio(); bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if (error && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; splx(s); } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = np->n_error = error; np->n_flag |= NWRITEERR; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; if (must_commit) nfs_clearcommit(vp->v_mount); bufdone(bp); return (error); } Index: head/sys/nfsclient/nfs_vnops.c =================================================================== --- head/sys/nfsclient/nfs_vnops.c (revision 75579) +++ head/sys/nfsclient/nfs_vnops.c (revision 75580) @@ -1,3400 +1,3385 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 * $FreeBSD$ */ /* * vnode op calls for Sun NFS version 2 and 3 */ #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Defs */ #define TRUE 1 #define FALSE 0 /* * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these * calls are not in getblk() and brelse() so that they would not be necessary * here. */ #ifndef B_VMIO #define vfs_busy_pages(bp, f) #endif static int nfsspec_read __P((struct vop_read_args *)); static int nfsspec_write __P((struct vop_write_args *)); static int nfsfifo_read __P((struct vop_read_args *)); static int nfsfifo_write __P((struct vop_write_args *)); static int nfsspec_close __P((struct vop_close_args *)); static int nfsfifo_close __P((struct vop_close_args *)); #define nfs_poll vop_nopoll static int nfs_flush __P((struct vnode *,struct ucred *,int,struct proc *,int)); static int nfs_setattrrpc __P((struct vnode *,struct vattr *,struct ucred *,struct proc *)); static int nfs_lookup __P((struct vop_lookup_args *)); static int nfs_create __P((struct vop_create_args *)); static int nfs_mknod __P((struct vop_mknod_args *)); static int nfs_open __P((struct vop_open_args *)); static int nfs_close __P((struct vop_close_args *)); static int nfs_access __P((struct vop_access_args *)); static int nfs_getattr __P((struct vop_getattr_args *)); static int nfs_setattr __P((struct vop_setattr_args *)); static int nfs_read __P((struct vop_read_args *)); static int nfs_fsync __P((struct vop_fsync_args *)); static int nfs_remove __P((struct vop_remove_args *)); static int nfs_link __P((struct vop_link_args *)); static int nfs_rename __P((struct vop_rename_args *)); static int nfs_mkdir __P((struct vop_mkdir_args *)); static int nfs_rmdir 
__P((struct vop_rmdir_args *)); static int nfs_symlink __P((struct vop_symlink_args *)); static int nfs_readdir __P((struct vop_readdir_args *)); static int nfs_bmap __P((struct vop_bmap_args *)); static int nfs_strategy __P((struct vop_strategy_args *)); static int nfs_lookitup __P((struct vnode *, const char *, int, struct ucred *, struct proc *, struct nfsnode **)); static int nfs_sillyrename __P((struct vnode *,struct vnode *,struct componentname *)); static int nfsspec_access __P((struct vop_access_args *)); static int nfs_readlink __P((struct vop_readlink_args *)); static int nfs_print __P((struct vop_print_args *)); static int nfs_advlock __P((struct vop_advlock_args *)); -static int nfs_bwrite __P((struct vop_bwrite_args *)); /* * Global vfs data structures for nfs */ vop_t **nfsv2_vnodeop_p; static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) nfs_access }, { &vop_advlock_desc, (vop_t *) nfs_advlock }, { &vop_bmap_desc, (vop_t *) nfs_bmap }, - { &vop_bwrite_desc, (vop_t *) nfs_bwrite }, { &vop_close_desc, (vop_t *) nfs_close }, { &vop_create_desc, (vop_t *) nfs_create }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_getpages_desc, (vop_t *) nfs_getpages }, { &vop_putpages_desc, (vop_t *) nfs_putpages }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) nfs_link }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_lookup_desc, (vop_t *) nfs_lookup }, { &vop_mkdir_desc, (vop_t *) nfs_mkdir }, { &vop_mknod_desc, (vop_t *) nfs_mknod }, { &vop_open_desc, (vop_t *) nfs_open }, { &vop_poll_desc, (vop_t *) nfs_poll }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfs_read }, { &vop_readdir_desc, (vop_t *) nfs_readdir }, { &vop_readlink_desc, (vop_t *) nfs_readlink }, { &vop_reclaim_desc, 
(vop_t *) nfs_reclaim }, { &vop_remove_desc, (vop_t *) nfs_remove }, { &vop_rename_desc, (vop_t *) nfs_rename }, { &vop_rmdir_desc, (vop_t *) nfs_rmdir }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_strategy_desc, (vop_t *) nfs_strategy }, { &vop_symlink_desc, (vop_t *) nfs_symlink }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) nfs_write }, { NULL, NULL } }; static struct vnodeopv_desc nfsv2_vnodeop_opv_desc = { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries }; VNODEOP_SET(nfsv2_vnodeop_opv_desc); /* * Special device vnode ops */ vop_t **spec_nfsv2nodeop_p; static struct vnodeopv_entry_desc nfsv2_specop_entries[] = { { &vop_default_desc, (vop_t *) spec_vnoperate }, { &vop_access_desc, (vop_t *) nfsspec_access }, { &vop_close_desc, (vop_t *) nfsspec_close }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfsspec_read }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) nfsspec_write }, { NULL, NULL } }; static struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = { &spec_nfsv2nodeop_p, nfsv2_specop_entries }; VNODEOP_SET(spec_nfsv2nodeop_opv_desc); vop_t **fifo_nfsv2nodeop_p; static struct vnodeopv_entry_desc nfsv2_fifoop_entries[] = { { &vop_default_desc, (vop_t *) fifo_vnoperate }, { &vop_access_desc, (vop_t *) nfsspec_access }, { &vop_close_desc, (vop_t *) nfsfifo_close }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t 
*) nfsfifo_read }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) nfsfifo_write }, { NULL, NULL } }; static struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = { &fifo_nfsv2nodeop_p, nfsv2_fifoop_entries }; VNODEOP_SET(fifo_nfsv2nodeop_opv_desc); static int nfs_mknodrpc __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap)); static int nfs_removerpc __P((struct vnode *dvp, const char *name, int namelen, struct ucred *cred, struct proc *proc)); static int nfs_renamerpc __P((struct vnode *fdvp, const char *fnameptr, int fnamelen, struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred, struct proc *proc)); static int nfs_renameit __P((struct vnode *sdvp, struct componentname *scnp, struct sillyrename *sp)); /* * Global variables */ extern u_int32_t nfs_true, nfs_false; extern u_int32_t nfs_xdrneg1; extern struct nfsstats nfsstats; extern nfstype nfsv3_type[9]; struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; int nfs_numasync = 0; #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) SYSCTL_DECL(_vfs_nfs); static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout"); static int nfsv3_commit_on_close = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW, &nfsv3_commit_on_close, 0, "write+commit on close, else only write"); #if 0 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD, &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count"); SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD, &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count"); #endif #define NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY \ | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE \ | NFSV3ACCESS_DELETE | 
NFSV3ACCESS_LOOKUP)
/*
 * Issue an NFSPROC_ACCESS request for vp carrying the access bits in wmode,
 * and cache the reply in the nfsnode.
 *
 * When error is still zero after the post-op attributes have been handled,
 * the mode word from the reply is stored in np->n_mode, together with the
 * requesting uid (np->n_modeuid) and the current time_second
 * (np->n_modestamp).
 *
 * Arguments:
 *	vp	vnode the request is made for.
 *	wmode	access bits placed in the request.
 *	p, cred	process and credentials handed to nfsm_request().
 *
 * Returns the value of error.
 *
 * NOTE(review): the nfsm_* macros are defined elsewhere.  They presumably
 * use the locals declared below (mreq, mrep, md, mb, mb2, bpos, dpos, cp,
 * cp2, t1, t2) and assign error, since nothing else in this function
 * does -- confirm against their definitions.
 */
static int
nfs3_access_otw(struct vnode *vp,
		int wmode,
		struct proc *p,
		struct ucred *cred)
{
	const int v3 = 1;
	u_int32_t *tl;
	int error = 0, attrflag;

	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	caddr_t bpos, dpos, cp2;
	register int32_t t1, t2;
	register caddr_t cp;
	u_int32_t rmode;
	struct nfsnode *np = VTONFS(vp);

	nfsstats.rpccnt[NFSPROC_ACCESS]++;
	/* Request: file handle followed by one unsigned word (wmode). */
	nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED);
	nfsm_fhtom(vp, v3);
	nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
	*tl = txdr_unsigned(wmode);
	nfsm_request(vp, NFSPROC_ACCESS, p, cred);
	nfsm_postop_attr(vp, attrflag);
	if (!error) {
		/* Reply: one unsigned word, cached with uid and timestamp. */
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		rmode = fxdr_unsigned(u_int32_t, *tl);
		np->n_mode = rmode;
		np->n_modeuid = cred->cr_uid;
		np->n_modestamp = time_second;
	}
	nfsm_reqdone;
	return error;
}

/*
 * nfs access vnode op.
 * For nfs version 2, just return ok. File accesses may fail later.
 * For nfs version 3, use the access rpc to check accessibility. If file modes
 * are changed on the server, accesses might still fail later.
 */
static int
nfs_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	int error = 0;
	u_int32_t mode, wmode;
	int v3 = NFS_ISV3(vp);
	struct nfsnode *np = VTONFS(vp);

	/*
	 * Disallow write attempts on filesystems mounted read-only;
	 * unless the file is a socket, fifo, or a block or character
	 * device resident on the filesystem.
	 */
	if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
		switch (vp->v_type) {
		case VREG:
		case VDIR:
		case VLNK:
			return (EROFS);
		default:
			break;
		}
	}
	/*
	 * For nfs v3, check to see if we have done this recently, and if
	 * so return our cached result instead of making an ACCESS call.
	 * If not, do an access rpc, otherwise you are stuck emulating
	 * ufs_access() locally using the vattr.
This may not be correct, * since the server may apply other access criteria such as * client uid-->server uid mapping that we do not know about. */ if (v3) { if (ap->a_mode & VREAD) mode = NFSV3ACCESS_READ; else mode = 0; if (vp->v_type != VDIR) { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_EXECUTE; } else { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_DELETE); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_LOOKUP; } /* XXX safety belt, only make blanket request if caching */ if (nfsaccess_cache_timeout > 0) { wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP; } else { wmode = mode; } /* * Does our cached result allow us to give a definite yes to * this request? */ if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) && (ap->a_cred->cr_uid == np->n_modeuid) && ((np->n_mode & mode) == mode)) { nfsstats.accesscache_hits++; } else { /* * Either a no, or a don't know. Go to the wire. */ nfsstats.accesscache_misses++; error = nfs3_access_otw(vp, wmode, ap->a_p,ap->a_cred); if (!error) { if ((np->n_mode & mode) != mode) { error = EACCES; } } } return (error); } else { if ((error = nfsspec_access(ap)) != 0) return (error); /* * Attempt to prevent a mapped root from accessing a file * which it shouldn't. We try to read a byte from the file * if the user is root and the file is not zero length. * After calling nfsspec_access, we should have the correct * file size cached. 
*/ if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD) && VTONFS(vp)->n_size > 0) { struct iovec aiov; struct uio auio; char buf[1]; aiov.iov_base = buf; aiov.iov_len = 1; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_resid = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_procp = ap->a_p; if (vp->v_type == VREG) error = nfs_readrpc(vp, &auio, ap->a_cred); else if (vp->v_type == VDIR) { char* bp; bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK); aiov.iov_base = bp; aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ; error = nfs_readdirrpc(vp, &auio, ap->a_cred); free(bp, M_TEMP); } else if (vp->v_type == VLNK) error = nfs_readlinkrpc(vp, &auio, ap->a_cred); else error = EACCES; } return (error); } } /* * nfs open vnode op * Check to see if the type is ok * and that deletion is not in progress. * For paged in text files, you will need to flush the page cache * if consistency is lost. */ /* ARGSUSED */ static int nfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct vattr vattr; int error; if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { #ifdef DIAGNOSTIC printf("open eacces vtyp=%d\n",vp->v_type); #endif return (EACCES); } /* * Get a valid lease. If cached data is stale, flush it. 
*/ if (nmp->nm_flag & NFSMNT_NQNFS) { if (NQNFS_CKINVALID(vp, np, ND_READ)) { do { error = nqnfs_getlease(vp, ND_READ, ap->a_cred, ap->a_p); } while (error == NQNFS_EXPIRED); if (error) return (error); if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); np->n_brev = np->n_lrev; } } } else { if (np->n_flag & NMODIFIED) { if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); np->n_attrstamp = 0; if (vp->v_type == VDIR) np->n_direofoffset = 0; error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); if (np->n_mtime != vattr.va_mtime.tv_sec) { if (vp->v_type == VDIR) np->n_direofoffset = 0; if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } } } if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) np->n_attrstamp = 0; /* For Open/Close consistency */ return (0); } /* * nfs close vnode op * What an NFS client should do upon close after writing is a debatable issue. * Most NFS clients push delayed writes to the server upon close, basically for * two reasons: * 1 - So that any write errors may be reported back to the client process * doing the close system call. By far the two most likely errors are * NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure. * 2 - To put a worst case upper bound on cache inconsistency between * multiple clients for the file. * There is also a consistency problem for Version 2 of the protocol w.r.t. * not being able to tell if other clients are writing a file concurrently, * since there is no way of knowing if the changed modify time in the reply * is only due to the write for this client. 
* (NFS Version 3 provides weak cache consistency data in the reply that
 * should be sufficient to detect and handle this case.)
 *
 * The current code does the following:
 * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
 * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
 *                     or commit them (this satisfies 1 and 2 except for the
 *                     case where the server crashes after this close but
 *                     before the commit RPC, which is felt to be "good
 *                     enough". Changing the last argument to nfs_flush() to
 *                     a 1 would force a commit operation, if it is felt a
 *                     commit is necessary now.
 * for NQNFS         - do nothing now, since 2 is dealt with via leases and
 *                     1 should be dealt with via an fsync() system call for
 *                     cases where write errors are important.
 */
/*
 * Only regular files are processed.  Returns 0, the result of the
 * flush/invalidate, or a previously recorded write error (np->n_error)
 * when NWRITEERR is set on the node.
 */
/* ARGSUSED */
static int
nfs_close(ap)
	struct vop_close_args /* {
		struct vnodeop_desc *a_desc;
		struct vnode *a_vp;
		int  a_fflag;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);
	int error = 0;

	if (vp->v_type == VREG) {
	    /* Non-NQNFS mounts with locally modified data push it out now. */
	    if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 &&
		(np->n_flag & NMODIFIED)) {
		if (NFS_ISV3(vp)) {
		    /*
		     * Under NFSv3 we have dirty buffers to dispose of.  We
		     * must flush them to the NFS server.  We have the option
		     * of waiting all the way through the commit rpc or just
		     * waiting for the initial write.  The default is to only
		     * wait through the initial write so the data is in the
		     * server's cache, which is roughly similar to the state
		     * a standard disk subsystem leaves the file in on close().
		     *
		     * We cannot clear the NMODIFIED bit in np->n_flag due to
		     * potential races with other processes, and certainly
		     * cannot clear it if we don't commit.
		     */
		    /* cm is normalized to 0/1 from the nfsv3_commit_on_close knob. */
		    int cm = nfsv3_commit_on_close ? 1 : 0;
		    error = nfs_flush(vp, ap->a_cred, MNT_WAIT, ap->a_p, cm);
		    /* np->n_flag &= ~NMODIFIED; */
		} else {
		    error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1);
		}
		/* Zero the attribute timestamp after pushing data. */
		np->n_attrstamp = 0;
	    }
	    /*
	     * A recorded write error overrides whatever the flush returned
	     * and is reported once: the flag is cleared here.
	     */
	    if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		error = np->n_error;
	    }
	}
	return (error);
}

/*
 * nfs getattr call from vfs.
 */
/*
 * Tries the attribute cache first; for v3 with access caching enabled it
 * issues an ACCESS rpc and re-checks the cache before falling back to a
 * GETATTR rpc.  Returns 0 or an error number.
 */
static int
nfs_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);
	register caddr_t cp;
	register u_int32_t *tl;
	register int32_t t1, t2;
	caddr_t bpos, dpos;
	int error = 0;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	int v3 = NFS_ISV3(vp);

	/*
	 * Update local times for special files.
	 */
	if (np->n_flag & (NACC | NUPD))
		np->n_flag |= NCHG;
	/*
	 * First look in the cache.
	 */
	if (nfs_getattrcache(vp, ap->a_vap) == 0)
		return (0);

	if (v3 && nfsaccess_cache_timeout > 0) {
		nfsstats.accesscache_misses++;
		/*
		 * The return value of the ACCESS rpc is not checked here;
		 * success is detected by the cache lookup that follows.
		 */
		nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_p, ap->a_cred);
		if (nfs_getattrcache(vp, ap->a_vap) == 0)
			return (0);
	}

	/* Cache miss: ask the server directly. */
	nfsstats.rpccnt[NFSPROC_GETATTR]++;
	nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3));
	nfsm_fhtom(vp, v3);
	nfsm_request(vp, NFSPROC_GETATTR, ap->a_p, ap->a_cred);
	if (!error) {
		nfsm_loadattr(vp, ap->a_vap);
	}
	nfsm_reqdone;
	return (error);
}

/*
 * nfs setattr call.
 */
/*
 * Validates the request against the vnode type and a read-only mount,
 * adjusts the locally cached size for truncation, then issues the
 * SETATTR rpc.  If the rpc fails after a size change, the previous size
 * is restored.
 */
static int
nfs_setattr(ap)
	struct vop_setattr_args /* {
		struct vnodeop_desc *a_desc;
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);
	register struct vattr *vap = ap->a_vap;
	int error = 0;
	u_quad_t tsize;

#ifndef nolint
	tsize = (u_quad_t)0;
#endif

	/*
	 * Setting of flags is not supported.
	 */
	if (vap->va_flags != VNOVAL)
		return (EOPNOTSUPP);

	/*
	 * Disallow write attempts if the filesystem is mounted read-only.
	 */
	/* NOTE(review): the va_flags term below cannot be true after the check above. */
  	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	if (vap->va_size != VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VCHR:
 		case VBLK:
 		case VSOCK:
 		case VFIFO:
			/*
			 * A pure size change on these types is a no-op;
			 * otherwise drop the size and set the rest.
			 */
			if (vap->va_mtime.tv_sec == VNOVAL &&
			    vap->va_atime.tv_sec == VNOVAL &&
			    vap->va_mode == (mode_t)VNOVAL &&
			    vap->va_uid == (uid_t)VNOVAL &&
			    vap->va_gid == (gid_t)VNOVAL)
				return (0);
 			vap->va_size = VNOVAL;
 			break;
 		default:
			/*
			 * Disallow write attempts if the filesystem is
			 * mounted read-only.
			 */
			if (vp->v_mount->mnt_flag & MNT_RDONLY)
				return (EROFS);
			vnode_pager_setsize(vp, vap->va_size);
 			if (np->n_flag & NMODIFIED) {
				/* Truncating to zero passes 0 instead of V_SAVE. */
 			    if (vap->va_size == 0)
 				error = nfs_vinvalbuf(vp, 0,
 					ap->a_cred, ap->a_p, 1);
 			    else
 				error = nfs_vinvalbuf(vp, V_SAVE,
 					ap->a_cred, ap->a_p, 1);
 			    if (error) {
				/* Undo the pager resize done above. */
				vnode_pager_setsize(vp, np->n_size);
 				return (error);
			    }
 			}
			/* Remember the old size so a failed rpc can restore it. */
 			tsize = np->n_size;
 			np->n_size = np->n_vattr.va_size = vap->va_size;
  		};
  	} else if ((vap->va_mtime.tv_sec != VNOVAL ||
		vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
		vp->v_type == VREG &&
  		(error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred,
		 ap->a_p, 1)) == EINTR)
		return (error);
	error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_p);
	if (error && vap->va_size != VNOVAL) {
		np->n_size = np->n_vattr.va_size = tsize;
		vnode_pager_setsize(vp, np->n_size);
	}
	return (error);
}

/*
 * Do an nfs setattr rpc.
*/
/*
 * Marshal the attributes in vap into a SETATTR request for vp, send it
 * using the given credentials and process, and consume the attribute
 * data carried in the reply.  The wire format is chosen by the mount's
 * protocol version.  Returns 0 or an error number.
 *
 * The nfsm_* macros are defined elsewhere and rely on the local names
 * declared below, so those names must not change.
 */
static int
nfs_setattrrpc(vp, vap, cred, procp)
	register struct vnode *vp;
	register struct vattr *vap;
	struct ucred *cred;
	struct proc *procp;
{
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	caddr_t bpos, dpos, cp2;
	register caddr_t cp;
	register int32_t t1, t2;
	register struct nfsv2_sattr *sp;
	u_int32_t *tl;
	int v3 = NFS_ISV3(vp);
	int wccflag = NFSV3_WCCRATTR;
	int error = 0;

	nfsstats.rpccnt[NFSPROC_SETATTR]++;
	nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3));
	nfsm_fhtom(vp, v3);

	if (!v3) {
		/*
		 * Version 2: fixed-layout sattr; mode/uid/gid fields left
		 * unset by the caller go out as nfs_xdrneg1.
		 */
		nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
		sp->sa_mode = (vap->va_mode == (mode_t)VNOVAL) ?
		    nfs_xdrneg1 : vtonfsv2_mode(vp->v_type, vap->va_mode);
		sp->sa_uid = (vap->va_uid == (uid_t)VNOVAL) ?
		    nfs_xdrneg1 : txdr_unsigned(vap->va_uid);
		sp->sa_gid = (vap->va_gid == (gid_t)VNOVAL) ?
		    nfs_xdrneg1 : txdr_unsigned(vap->va_gid);
		sp->sa_size = txdr_unsigned(vap->va_size);
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	} else {
		/* Version 3: attribute block followed by one nfs_false word. */
		nfsm_v3attrbuild(vap, TRUE);
		nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
		*tl = nfs_false;
	}

	nfsm_request(vp, NFSPROC_SETATTR, procp, cred);

	if (!v3) {
		nfsm_loadattr(vp, (struct vattr *)0);
	} else {
		nfsm_wcc_data(vp, wccflag);
	}
	nfsm_reqdone;
	return (error);
}

/*
 * nfs lookup call, one step at a time...
* First look in cache * If not found, unlock the directory nfsnode and do the rpc */ static int nfs_lookup(ap) struct vop_lookup_args /* { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; int flags = cnp->cn_flags; struct vnode *newvp; u_int32_t *tl; caddr_t cp; int32_t t1, t2; struct nfsmount *nmp; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; long len; nfsfh_t *fhp; struct nfsnode *np; int lockparent, wantparent, error = 0, attrflag, fhsize; int v3 = NFS_ISV3(dvp); struct proc *p = cnp->cn_proc; *vpp = NULLVP; cnp->cn_flags &= ~PDIRUNLOCK; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if (dvp->v_type != VDIR) return (ENOTDIR); lockparent = flags & LOCKPARENT; wantparent = flags & (LOCKPARENT|WANTPARENT); nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { struct vattr vattr; int vpid; if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p)) != 0) { *vpp = NULLVP; return (error); } newvp = *vpp; vpid = newvp->v_id; /* * See the comment starting `Step through' in ufs/ufs_lookup.c * for an explanation of the locking protocol */ if (dvp == newvp) { VREF(newvp); error = 0; } else if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); cnp->cn_flags |= PDIRUNLOCK; error = vget(newvp, LK_EXCLUSIVE, p); if (!error && lockparent && (flags & ISLASTCN)) { error = vn_lock(dvp, LK_EXCLUSIVE, p); if (error == 0) cnp->cn_flags &= ~PDIRUNLOCK; } } else { error = vget(newvp, LK_EXCLUSIVE, p); if (!lockparent || error || !(flags & ISLASTCN)) { VOP_UNLOCK(dvp, 0, p); cnp->cn_flags |= PDIRUNLOCK; } } if (!error) { if (vpid == newvp->v_id) { if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p) && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { 
nfsstats.lookupcache_hits++; if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; return (0); } cache_purge(newvp); } vput(newvp); if (lockparent && dvp != newvp && (flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, p); } error = vn_lock(dvp, LK_EXCLUSIVE, p); *vpp = NULLVP; if (error) { cnp->cn_flags |= PDIRUNLOCK; return (error); } cnp->cn_flags &= ~PDIRUNLOCK; } error = 0; newvp = NULLVP; nfsstats.lookupcache_misses++; nfsstats.rpccnt[NFSPROC_LOOKUP]++; len = cnp->cn_namelen; nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred); if (error) { nfsm_postop_attr(dvp, attrflag); m_freem(mrep); goto nfsmout; } nfsm_getfh(fhp, fhsize, v3); /* * Handle RENAME case... */ if (cnp->cn_nameiop == RENAME && wantparent && (flags & ISLASTCN)) { if (NFS_CMPFH(np, fhp, fhsize)) { m_freem(mrep); return (EISDIR); } error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, (struct vattr *)0); *vpp = newvp; m_freem(mrep); cnp->cn_flags |= SAVENAME; if (!lockparent) { VOP_UNLOCK(dvp, 0, p); cnp->cn_flags |= PDIRUNLOCK; } return (0); } if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); return (error); } newvp = NFSTOV(np); if (lockparent && (flags & ISLASTCN)) { error = vn_lock(dvp, LK_EXCLUSIVE, p); if (error) { cnp->cn_flags |= PDIRUNLOCK; vput(newvp); return (error); } } else cnp->cn_flags |= PDIRUNLOCK; } else if (NFS_CMPFH(np, fhp, fhsize)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { m_freem(mrep); return (error); } if (!lockparent || !(flags & ISLASTCN)) { cnp->cn_flags |= PDIRUNLOCK; 
VOP_UNLOCK(dvp, 0, p); } newvp = NFSTOV(np); } if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, (struct vattr *)0); if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; if ((cnp->cn_flags & MAKEENTRY) && (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) { np->n_ctime = np->n_vattr.va_ctime.tv_sec; cache_enter(dvp, newvp, cnp); } *vpp = newvp; nfsm_reqdone; if (error) { if (newvp != NULLVP) { vrele(newvp); *vpp = NULLVP; } if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN) && error == ENOENT) { if (!lockparent) { VOP_UNLOCK(dvp, 0, p); cnp->cn_flags |= PDIRUNLOCK; } if (dvp->v_mount->mnt_flag & MNT_RDONLY) error = EROFS; else error = EJUSTRETURN; } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; } return (error); } /* * nfs read call. * Just call nfs_bioread() to do the work. */ static int nfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; if (vp->v_type != VREG) return (EPERM); return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); } /* * nfs readlink call */ static int nfs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; if (vp->v_type != VLNK) return (EINVAL); return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* * Do a readlink rpc. * Called by nfs_doio() from below the buffer cache. 
*/
/*
 * Sends READLINK for vp and copies the returned link text into uiop.
 * Returns 0 or an error number.
 */
int
nfs_readlinkrpc(vp, uiop, cred)
	register struct vnode *vp;
	struct uio *uiop;
	struct ucred *cred;
{
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	int error = 0, len, attrflag;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	int v3 = NFS_ISV3(vp);

	nfsstats.rpccnt[NFSPROC_READLINK]++;
	nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3));
	nfsm_fhtom(vp, v3);
	nfsm_request(vp, NFSPROC_READLINK, uiop->uio_procp, cred);
	if (v3)
		nfsm_postop_attr(vp, attrflag);
	if (!error) {
		nfsm_strsiz(len, NFS_MAXPATHLEN);
		/*
		 * When the reply length is exactly the maximum, prefer the
		 * cached node size if it is non-zero and smaller.
		 */
		if (len == NFS_MAXPATHLEN) {
			struct nfsnode *np = VTONFS(vp);
			if (np->n_size && np->n_size < NFS_MAXPATHLEN)
				len = np->n_size;
		}
		nfsm_mtouio(uiop, len);
	}
	nfsm_reqdone;
	return (error);
}

/*
 * nfs read rpc call
 * Ditto above
 */
/*
 * Reads uiop->uio_resid bytes starting at uiop->uio_offset, issuing one
 * READ rpc per chunk of at most nm_rsize bytes.  Returns EFBIG if the
 * range would exceed nm_maxfilesize, otherwise 0 or an rpc error.
 */
int
nfs_readrpc(vp, uiop, cred)
	register struct vnode *vp;
	struct uio *uiop;
	struct ucred *cred;
{
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	struct nfsmount *nmp;
	int error = 0, len, retlen, tsiz, eof, attrflag;
	int v3 = NFS_ISV3(vp);

#ifndef nolint
	eof = 0;
#endif
	nmp = VFSTONFS(vp->v_mount);
	tsiz = uiop->uio_resid;
	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
		return (EFBIG);
	while (tsiz > 0) {
		nfsstats.rpccnt[NFSPROC_READ]++;
		len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz;
		nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3);
		nfsm_fhtom(vp, v3);
		nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED * 3);
		/* v3 sends a 64-bit offset; v2 a 32-bit offset plus a zero word. */
		if (v3) {
			txdr_hyper(uiop->uio_offset, tl);
			*(tl + 2) = txdr_unsigned(len);
		} else {
			*tl++ = txdr_unsigned(uiop->uio_offset);
			*tl++ = txdr_unsigned(len);
			*tl = 0;
		}
		nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred);
		if (v3) {
			nfsm_postop_attr(vp, attrflag);
			if (error) {
				m_freem(mrep);
				goto nfsmout;
			}
			nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
			eof = fxdr_unsigned(int, *(tl + 1));
		} else
			nfsm_loadattr(vp, (struct vattr *)0);
		nfsm_strsiz(retlen, nmp->nm_rsize);
		nfsm_mtouio(uiop, retlen);
		m_freem(mrep);
		tsiz -= retlen;
		/*
		 * Stop early: v3 on the eof flag or an empty reply, v2 on a
		 * short read.
		 */
		if (v3) {
			if (eof || retlen == 0)
				tsiz = 0;
		} else if (retlen < len)
			tsiz = 0;
	}
nfsmout:
	return (error);
}

/*
 * nfs write call
 */
/*
 * Writes the data described by uiop in chunks of at most nm_wsize bytes.
 * *iomode is read as the requested stability level and, on return, is
 * overwritten with the level actually obtained; *must_commit is set to 1
 * when the server's write verifier differs from the one stored in the
 * mount.  On error uio_resid is set to the amount not yet written.
 */
int
nfs_writerpc(vp, uiop, cred, iomode, must_commit)
	register struct vnode *vp;
	register struct uio *uiop;
	struct ucred *cred;
	int *iomode, *must_commit;
{
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2, backup;
	caddr_t bpos, dpos, cp2;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
	int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;

/* NOTE(review): guard is #ifndef, so this check exists only when DIAGNOSTIC is NOT defined -- confirm intent. */
#ifndef DIAGNOSTIC
	if (uiop->uio_iovcnt != 1)
		panic("nfs: writerpc iovcnt > 1");
#endif
	*must_commit = 0;
	tsiz = uiop->uio_resid;
	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
		return (EFBIG);
	while (tsiz > 0) {
		nfsstats.rpccnt[NFSPROC_WRITE]++;
		len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz;
		nfsm_reqhead(vp, NFSPROC_WRITE,
			NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
		nfsm_fhtom(vp, v3);
		if (v3) {
			nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
			txdr_hyper(uiop->uio_offset, tl);
			tl += 2;
			*tl++ = txdr_unsigned(len);
			*tl++ = txdr_unsigned(*iomode);
			*tl = txdr_unsigned(len);
		} else {
			register u_int32_t x;

			nfsm_build(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
			/* Set both "begin" and "current" to non-garbage. */
			x = txdr_unsigned((u_int32_t)uiop->uio_offset);
			*tl++ = x;	/* "begin offset" */
			*tl++ = x;	/* "current offset" */
			x = txdr_unsigned(len);
			*tl++ = x;	/* total to this offset */
			*tl = x;	/* size of this write */
		}
		nfsm_uiotom(uiop, len);
		nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred);
		if (v3) {
			wccflag = NFSV3_WCCCHK;
			nfsm_wcc_data(vp, wccflag);
			if (!error) {
				nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED
					+ NFSX_V3WRITEVERF);
				rlen = fxdr_unsigned(int, *tl++);
				if (rlen == 0) {
					error = NFSERR_IO;
					m_freem(mrep);
					break;
				} else if (rlen < len) {
					/*
					 * Short write: rewind the uio by the
					 * unwritten amount so the next pass
					 * resumes from there.
					 */
					backup = len - rlen;
					uiop->uio_iov->iov_base -= backup;
					uiop->uio_iov->iov_len += backup;
					uiop->uio_offset -= backup;
					uiop->uio_resid += backup;
					len = rlen;
				}
				commit = fxdr_unsigned(int, *tl++);

				/*
				 * Return the lowest commitment level
				 * obtained by any of the RPCs.
				 */
				if (committed == NFSV3WRITE_FILESYNC)
					committed = commit;
				else if (committed == NFSV3WRITE_DATASYNC &&
					commit == NFSV3WRITE_UNSTABLE)
					committed = commit;
				/*
				 * First verifier seen is stored; a later
				 * mismatch replaces it and flags must_commit.
				 */
				if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
					NFSX_V3WRITEVERF);
				    nmp->nm_state |= NFSSTA_HASWRITEVERF;
				} else if (bcmp((caddr_t)tl,
				    (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) {
				    *must_commit = 1;
				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
					NFSX_V3WRITEVERF);
				}
			}
		} else
		    nfsm_loadattr(vp, (struct vattr *)0);
		if (wccflag)
		    VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.tv_sec;
		m_freem(mrep);
		if (error)
			break;
		tsiz -= len;
	}
nfsmout:
	/* On MNT_ASYNC mounts the result is always reported as FILESYNC. */
	if (vp->v_mount->mnt_flag & MNT_ASYNC)
		committed = NFSV3WRITE_FILESYNC;
	*iomode = committed;
	if (error)
		uiop->uio_resid = tsiz;
	return (error);
}

/*
 * nfs mknod rpc
 * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
 * mode set to specify the file type and the size field for rdev.
 */
/*
 * Accepts VCHR, VBLK, VFIFO and VSOCK; any other type yields EOPNOTSUPP.
 * On success *vpp receives the new vnode.
 */
static int
nfs_mknodrpc(dvp, vpp, cnp, vap)
	register struct vnode *dvp;
	register struct vnode **vpp;
	register struct componentname *cnp;
	register struct vattr *vap;
{
	register struct nfsv2_sattr *sp;
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	struct vnode *newvp = (struct vnode *)0;
	struct nfsnode *np = (struct nfsnode *)0;
	struct vattr vattr;
	char *cp2;
	caddr_t bpos, dpos;
	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	u_int32_t rdev;
	int v3 = NFS_ISV3(dvp);

	if (vap->va_type == VCHR || vap->va_type == VBLK)
		rdev = txdr_unsigned(vap->va_rdev);
	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
		rdev = nfs_xdrneg1;
	else {
		return (EOPNOTSUPP);
	}
	/* vattr is filled in here but not referenced again in this function. */
	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) {
		return (error);
	}
	nfsstats.rpccnt[NFSPROC_MKNOD]++;
	/* NOTE(review): the doubled '+' below parses as a unary plus; harmless. */
	nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED +
		+ nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
	if (v3) {
		nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
		*tl++ = vtonfsv3_type(vap->va_type);
		nfsm_v3attrbuild(vap, FALSE);
		/* Device nodes carry major/minor numbers as two extra words. */
		if (vap->va_type == VCHR || vap->va_type == VBLK) {
			nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(umajor(vap->va_rdev));
			*tl = txdr_unsigned(uminor(vap->va_rdev));
		}
	} else {
		nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
		sp->sa_uid = nfs_xdrneg1;
		sp->sa_gid = nfs_xdrneg1;
		sp->sa_size = rdev;
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	}
	nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_proc, cnp->cn_cred);
	if (!error) {
		nfsm_mtofh(dvp, newvp, v3, gotvp);
		/* No vnode came back with the reply: look the name up. */
		if (!gotvp) {
			if (newvp) {
				vput(newvp);
				newvp = (struct vnode *)0;
			}
			error = nfs_lookitup(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np);
			if (!error)
				newvp = NFSTOV(np);
		}
	}
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	nfsm_reqdone;
	if (error) {
		if (newvp)
			vput(newvp);
	} else {
		if (cnp->cn_flags & MAKEENTRY)
			cache_enter(dvp, newvp, cnp);
		*vpp = newvp;
	}
	/* The directory is marked modified regardless of the outcome. */
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag)
		VTONFS(dvp)->n_attrstamp = 0;
	return (error);
}

/*
 * nfs mknod vop
 * just call nfs_mknodrpc() to do the work.
*/ /* ARGSUSED */ static int nfs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { return nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap); } static u_long create_verf; /* * nfs file create call */ static int nfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; struct nfsnode *np = (struct nfsnode *)0; struct vnode *newvp = (struct vnode *)0; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vattr vattr; int v3 = NFS_ISV3(dvp); /* * Oops, not for me.. */ if (vap->va_type == VSOCK) return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap)); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) { return (error); } if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= O_EXCL; again: nfsstats.rpccnt[NFSPROC_CREATE]++; nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); if (fmode & O_EXCL) { *tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE); nfsm_build(tl, u_int32_t *, NFSX_V3CREATEVERF); #ifdef INET if (!TAILQ_EMPTY(&in_ifaddrhead)) *tl++ = IA_SIN(in_ifaddrhead.tqh_first)->sin_addr.s_addr; else #endif *tl++ = create_verf; *tl = ++create_verf; } else { *tl = txdr_unsigned(NFSV3CREATE_UNCHECKED); nfsm_v3attrbuild(vap, FALSE); } } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid 
= nfs_xdrneg1; sp->sa_size = 0; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred); if (!error) { nfsm_mtofh(dvp, newvp, v3, gotvp); if (!gotvp) { if (newvp) { vput(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np); if (!error) newvp = NFSTOV(np); } } if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; if (error) { if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) { fmode &= ~O_EXCL; goto again; } if (newvp) vput(newvp); } else if (v3 && (fmode & O_EXCL)) { /* * We are normally called with only a partially initialized * VAP. Since the NFSv3 spec says that server may use the * file attributes to store the verifier, the spec requires * us to do a SETATTR RPC. FreeBSD servers store the verifier * in atime, but we can't really assume that all servers will * so we ensure that our SETATTR sets both atime and mtime. */ if (vap->va_mtime.tv_sec == VNOVAL) vfs_timestamp(&vap->va_mtime); if (vap->va_atime.tv_sec == VNOVAL) vap->va_atime = vap->va_mtime; error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_proc); } if (!error) { if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; } VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs file remove call * To try and make nfs semantics closer to ufs semantics, a file that has * other processes using the vnode is renamed instead of removed and then * removed later on the last close. 
* - If v_usecount > 1
 *	  If a rename is not already in the works
 *	     call nfs_sillyrename() to set it up
 *     else
 *	  do the remove rpc
 */
/*
 * Directories are rejected with EPERM.  The attribute timestamp of the
 * node is zeroed on every path before returning.
 */
static int
nfs_remove(ap)
	struct vop_remove_args /* {
		struct vnodeop_desc *a_desc;
		struct vnode * a_dvp;
		struct vnode * a_vp;
		struct componentname * a_cnp;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct vnode *dvp = ap->a_dvp;
	register struct componentname *cnp = ap->a_cnp;
	register struct nfsnode *np = VTONFS(vp);
	int error = 0;
	struct vattr vattr;

/* NOTE(review): guard is #ifndef, so these checks exist only when DIAGNOSTIC is NOT defined -- confirm intent. */
#ifndef DIAGNOSTIC
	if ((cnp->cn_flags & HASBUF) == 0)
		panic("nfs_remove: no name");
	if (vp->v_usecount < 1)
		panic("nfs_remove: bad v_usecount");
#endif
	if (vp->v_type == VDIR)
		error = EPERM;
	/*
	 * Remove for real when we hold the only reference, or when the
	 * file is already silly-renamed and still has more than one link.
	 */
	else if (vp->v_usecount == 1 || (np->n_sillyrename &&
	    VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_proc) == 0 &&
	    vattr.va_nlink > 1)) {
		/*
		 * Purge the name cache so that the chance of a lookup for
		 * the name succeeding while the remove is in progress is
		 * minimized. Without node locking it can still happen, such
		 * that an I/O op returns ESTALE, but since you get this if
		 * another host removes the file..
		 */
		cache_purge(vp);
		/*
		 * throw away biocache buffers, mainly to avoid
		 * unnecessary delayed writes later.
		 */
		error = nfs_vinvalbuf(vp, 0, cnp->cn_cred, cnp->cn_proc, 1);
		/* Do the rpc */
		if (error != EINTR)
			error = nfs_removerpc(dvp, cnp->cn_nameptr,
				cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc);
		/*
		 * Kludge City: If the first reply to the remove rpc is lost..
		 *   the reply to the retransmitted request will be ENOENT
		 *   since the file was in fact removed
		 *   Therefore, we cheat and return success.
		 */
		if (error == ENOENT)
			error = 0;
	} else if (!np->n_sillyrename)
		error = nfs_sillyrename(dvp, vp, cnp);
	np->n_attrstamp = 0;
	return (error);
}

/*
 * nfs file remove rpc called from nfs_inactive
 */
/* Removes the name recorded in sp, passing a null process pointer. */
int
nfs_removeit(sp)
	register struct sillyrename *sp;
{

	return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred,
		(struct proc *)0));
}

/*
 * Nfs remove rpc, called from nfs_remove() and nfs_removeit().
 */
/*
 * Sends REMOVE for the given name in dvp.  The directory is marked
 * modified whether or not the rpc succeeded.  Returns 0 or an error
 * number.
 */
static int
nfs_removerpc(dvp, name, namelen, cred, proc)
	register struct vnode *dvp;
	const char *name;
	int namelen;
	struct ucred *cred;
	struct proc *proc;
{
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	int error = 0, wccflag = NFSV3_WCCRATTR;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	int v3 = NFS_ISV3(dvp);

	nfsstats.rpccnt[NFSPROC_REMOVE]++;
	nfsm_reqhead(dvp, NFSPROC_REMOVE,
		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen));
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(name, namelen, NFS_MAXNAMLEN);
	nfsm_request(dvp, NFSPROC_REMOVE, proc, cred);
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	nfsm_reqdone;
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag)
		VTONFS(dvp)->n_attrstamp = 0;
	return (error);
}

/*
 * nfs file rename call
 */
/*
 * Rejects cross-mount renames with EXDEV.  All four vnodes passed in are
 * released at the "out" label on every path, including errors.
 */
static int
nfs_rename(ap)
	struct vop_rename_args  /* {
		struct vnode *a_fdvp;
		struct vnode *a_fvp;
		struct componentname *a_fcnp;
		struct vnode *a_tdvp;
		struct vnode *a_tvp;
		struct componentname *a_tcnp;
	} */ *ap;
{
	register struct vnode *fvp = ap->a_fvp;
	register struct vnode *tvp = ap->a_tvp;
	register struct vnode *fdvp = ap->a_fdvp;
	register struct vnode *tdvp = ap->a_tdvp;
	register struct componentname *tcnp = ap->a_tcnp;
	register struct componentname *fcnp = ap->a_fcnp;
	int error;

/* NOTE(review): guard is #ifndef, so this check exists only when DIAGNOSTIC is NOT defined -- confirm intent. */
#ifndef DIAGNOSTIC
	if ((tcnp->cn_flags & HASBUF) == 0 ||
	    (fcnp->cn_flags & HASBUF) == 0)
		panic("nfs_rename: no name");
#endif
	/* Check for cross-device rename */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out;
	}

	/*
	 * We have to flush B_DELWRI data prior to renaming
	 * the file.  If we don't, the delayed-write buffers
	 * can be flushed out later after the file has gone stale
	 * under NFSV3.  NFSV2 does not have this problem because
	 * ( as far as I can tell ) it flushes dirty buffers more
	 * often.
	 */
	/* The results of these flushes are not checked. */
	VOP_FSYNC(fvp, fcnp->cn_cred, MNT_WAIT, fcnp->cn_proc);
	if (tvp)
	    VOP_FSYNC(tvp, tcnp->cn_cred, MNT_WAIT, tcnp->cn_proc);

	/*
	 * If the tvp exists and is in use, sillyrename it before doing the
	 * rename of the new file over it.
	 * XXX Can't sillyrename a directory.
	 */
	/* On a successful sillyrename tvp is released and forgotten here. */
	if (tvp && tvp->v_usecount > 1 && !VTONFS(tvp)->n_sillyrename &&
		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
		vput(tvp);
		tvp = NULL;
	}

	error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
		tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
		tcnp->cn_proc);

	if (fvp->v_type == VDIR) {
		if (tvp != NULL && tvp->v_type == VDIR)
			cache_purge(tdvp);
		cache_purge(fdvp);
	}

out:
	/* tdvp gets vrele() when it is the same vnode as tvp, vput() otherwise. */
	if (tdvp == tvp)
		vrele(tdvp);
	else
		vput(tdvp);
	if (tvp)
		vput(tvp);
	vrele(fdvp);
	vrele(fvp);
	/*
	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
	 */
	if (error == ENOENT)
		error = 0;
	return (error);
}

/*
 * nfs file rename rpc called from nfs_remove() above
 */
/* Renames scnp's name to the name stored in sp, within directory sdvp. */
static int
nfs_renameit(sdvp, scnp, sp)
	struct vnode *sdvp;
	struct componentname *scnp;
	register struct sillyrename *sp;
{
	return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen,
		sdvp, sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_proc));
}

/*
 * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
*/
/*
 * Arguments:
 *	fdvp, fnameptr, fnamelen  - source directory vnode and entry name
 *	tdvp, tnameptr, tnamelen  - target directory vnode and entry name
 *	cred, proc                - passed through to nfsm_request()
 *
 * After the reply has been processed both directories are flagged
 * NMODIFIED, and a directory whose wcc flag came back zero has its
 * n_attrstamp cleared.  Returns the value left in "error".
 *
 * NOTE(review): the nfsm_* macros are defined outside this chunk.  They
 * appear to work on the locals declared below (tl, cp, t1, t2, bpos, dpos,
 * cp2, mreq, mrep, md, mb, mb2, error) by name -- confirm before renaming
 * or removing any of them.
 */
static int
nfs_renamerpc(fdvp, fnameptr, fnamelen, tdvp, tnameptr, tnamelen, cred, proc)
	register struct vnode *fdvp;
	const char *fnameptr;
	int fnamelen;
	register struct vnode *tdvp;
	const char *tnameptr;
	int tnamelen;
	struct ucred *cred;
	struct proc *proc;
{
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	int v3 = NFS_ISV3(fdvp);

	nfsstats.rpccnt[NFSPROC_RENAME]++;
	/* Request: (file handle + name) for the source, then for the target. */
	nfsm_reqhead(fdvp, NFSPROC_RENAME,
		(NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) +
		nfsm_rndup(tnamelen));
	nfsm_fhtom(fdvp, v3);
	nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN);
	nfsm_fhtom(tdvp, v3);
	nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN);
	nfsm_request(fdvp, NFSPROC_RENAME, proc, cred);
	if (v3) {
		nfsm_wcc_data(fdvp, fwccflag);
		nfsm_wcc_data(tdvp, twccflag);
	}
	nfsm_reqdone;
	VTONFS(fdvp)->n_flag |= NMODIFIED;
	VTONFS(tdvp)->n_flag |= NMODIFIED;
	/* A zero wcc flag invalidates the cached attribute timestamp. */
	if (!fwccflag)
		VTONFS(fdvp)->n_attrstamp = 0;
	if (!twccflag)
		VTONFS(tdvp)->n_attrstamp = 0;
	return (error);
}

/*
 * nfs hard link create call
 *
 * Creates the name described by a_cnp in directory a_tdvp as a link to
 * a_vp.  Returns EXDEV when the two vnodes are on different mounts.
 * An EEXIST result from the rpc is mapped to 0 (see the kludge below).
 */
static int
nfs_link(ap)
	struct vop_link_args /* {
		struct vnode *a_tdvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct vnode *tdvp = ap->a_tdvp;
	register struct componentname *cnp = ap->a_cnp;
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	int v3;

	if (vp->v_mount != tdvp->v_mount) {
		return (EXDEV);
	}

	/*
	 * Push all writes to the server, so that the attribute cache
	 * doesn't get "out of sync" with the server.
	 * XXX There should be a better way!
	 */
	VOP_FSYNC(vp, cnp->cn_cred, MNT_WAIT, cnp->cn_proc);

	v3 = NFS_ISV3(vp);
	nfsstats.rpccnt[NFSPROC_LINK]++;
	/* Request: handle of vp, handle of tdvp, then the new name. */
	nfsm_reqhead(vp, NFSPROC_LINK,
		NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
	nfsm_fhtom(vp, v3);
	nfsm_fhtom(tdvp, v3);
	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
	nfsm_request(vp, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred);
	if (v3) {
		nfsm_postop_attr(vp, attrflag);
		nfsm_wcc_data(tdvp, wccflag);
	}
	nfsm_reqdone;
	VTONFS(tdvp)->n_flag |= NMODIFIED;
	if (!attrflag)
		VTONFS(vp)->n_attrstamp = 0;
	if (!wccflag)
		VTONFS(tdvp)->n_attrstamp = 0;
	/*
	 * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
	 */
	if (error == EEXIST)
		error = 0;
	return (error);
}

/*
 * nfs symbolic link create call
 *
 * Creates a symbolic link named by a_cnp in directory a_dvp whose contents
 * are the NUL-terminated string a_target.  On success the new vnode is
 * stored through a_vpp; on failure any vnode obtained along the way is
 * released with vput().
 */
static int
nfs_symlink(ap)
	struct vop_symlink_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
		char *a_target;
	} */ *ap;
{
	register struct vnode *dvp = ap->a_dvp;
	register struct vattr *vap = ap->a_vap;
	register struct componentname *cnp = ap->a_cnp;
	register struct nfsv2_sattr *sp;
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	struct vnode *newvp = (struct vnode *)0;
	int v3 = NFS_ISV3(dvp);

	nfsstats.rpccnt[NFSPROC_SYMLINK]++;
	slen = strlen(ap->a_target);
	nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED +
	    nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3));
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
	/* For v3 the attributes are built before the target, for v2 after. */
	if (v3) {
		nfsm_v3attrbuild(vap, FALSE);
	}
	nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN);
	if (!v3) {
		nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
		sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode);
		sp->sa_uid = nfs_xdrneg1;
		sp->sa_gid = nfs_xdrneg1;
		sp->sa_size = nfs_xdrneg1;
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	}

	/*
	 * Issue the NFS request and get the rpc response.
	 *
	 * Only NFSv3 responses returning an error of 0 actually return
	 * a file handle that can be converted into newvp without having
	 * to do an extra lookup rpc.
	 */
	nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_proc, cnp->cn_cred);
	if (v3) {
		if (error == 0)
			nfsm_mtofh(dvp, newvp, v3, gotvp);
		nfsm_wcc_data(dvp, wccflag);
	}

	/*
	 * out code jumps -> here, mrep is also freed.
	 */

	nfsm_reqdone;

	/*
	 * If we get an EEXIST error, silently convert it to no-error
	 * in case of an NFS retry.
	 */
	if (error == EEXIST)
		error = 0;

	/*
	 * If we do not have (or no longer have) an error, and we could
	 * not extract the newvp from the response due to the request being
	 * NFSv2 or the error being EEXIST.  We have to do a lookup in order
	 * to obtain a newvp to return.
	 */
	if (error == 0 && newvp == NULL) {
		struct nfsnode *np = NULL;

		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
		    cnp->cn_cred, cnp->cn_proc, &np);
		if (!error)
			newvp = NFSTOV(np);
	}
	if (error) {
		if (newvp)
			vput(newvp);
	} else {
		*ap->a_vpp = newvp;
	}
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag)
		VTONFS(dvp)->n_attrstamp = 0;
	return (error);
}

/*
 * nfs make dir call
 *
 * Creates the directory named by a_cnp in a_dvp with the attributes in
 * a_vap.  On success the new vnode is stored through a_vpp.  When the rpc
 * reports EEXIST, or succeeds without yielding a vnode, the name is looked
 * up instead and accepted only if it is a directory.
 */
static int
nfs_mkdir(ap)
	struct vop_mkdir_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	register struct vnode *dvp = ap->a_dvp;
	register struct vattr *vap = ap->a_vap;
	register struct componentname *cnp = ap->a_cnp;
	register struct nfsv2_sattr *sp;
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	register int len;
	struct nfsnode *np = (struct nfsnode *)0;
	struct vnode *newvp = (struct vnode *)0;
	caddr_t bpos, dpos, cp2;
	int error = 0, wccflag = NFSV3_WCCRATTR;
	int gotvp = 0;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	struct vattr vattr;
	int v3 = NFS_ISV3(dvp);

	/*
	 * Only the error is checked here; the fetched vattr is not
	 * referenced again in this function.
	 */
	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) {
		return (error);
	}
	len = cnp->cn_namelen;
	nfsstats.rpccnt[NFSPROC_MKDIR]++;
	nfsm_reqhead(dvp, NFSPROC_MKDIR,
	  NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3));
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
	if (v3) {
		nfsm_v3attrbuild(vap, FALSE);
	} else {
		nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
		sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode);
		sp->sa_uid = nfs_xdrneg1;
		sp->sa_gid = nfs_xdrneg1;
		sp->sa_size = nfs_xdrneg1;
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	}
	nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_proc, cnp->cn_cred);
	if (!error)
		nfsm_mtofh(dvp, newvp, v3, gotvp);
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	nfsm_reqdone;
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag)
		VTONFS(dvp)->n_attrstamp = 0;
	/*
	 * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry
	 * if we can succeed in looking up the directory.
	 */
	if (error == EEXIST || (!error && !gotvp)) {
		if (newvp) {
			vrele(newvp);
			newvp = (struct vnode *)0;
		}
		error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred,
			cnp->cn_proc, &np);
		if (!error) {
			newvp = NFSTOV(np);
			/* Something else already owns the name. */
			if (newvp->v_type != VDIR)
				error = EEXIST;
		}
	}
	if (error) {
		if (newvp)
			vrele(newvp);
	} else
		*ap->a_vpp = newvp;
	return (error);
}

/*
 * nfs remove directory call
 *
 * Removes the directory a_vp, named by a_cnp, from a_dvp.  Returns EINVAL
 * when a_dvp and a_vp are the same vnode.  Both vnodes are purged from the
 * name cache regardless of the rpc result, and ENOENT is mapped to 0.
 */
static int
nfs_rmdir(ap)
	struct vop_rmdir_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct vnode *dvp = ap->a_dvp;
	register struct componentname *cnp = ap->a_cnp;
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	caddr_t bpos, dpos, cp2;
	int error = 0, wccflag = NFSV3_WCCRATTR;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	int v3 = NFS_ISV3(dvp);

	if (dvp == vp)
		return (EINVAL);
	nfsstats.rpccnt[NFSPROC_RMDIR]++;
	nfsm_reqhead(dvp, NFSPROC_RMDIR,
		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
	nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_proc, cnp->cn_cred);
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	nfsm_reqdone;
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag)
		VTONFS(dvp)->n_attrstamp = 0;
	cache_purge(dvp);
	cache_purge(vp);
	/*
	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
	 */
	if (error == ENOENT)
		error = 0;
	return (error);
}

/*
 * nfs readdir call
 *
 * Returns EPERM when a_vp is not a directory.  Otherwise the read is
 * short-circuited with 0 when the cached end-of-directory offset shows
 * that the caller is already at or past EOF and the cache is still
 * considered valid; in every other case nfs_bioread() does the work.
 */
static int
nfs_readdir(ap)
	struct vop_readdir_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);
	register struct uio *uio = ap->a_uio;
	int tresid, error;
	struct vattr vattr;

	if (vp->v_type != VDIR)
		return (EPERM);
	/*
	 * First, check for hit on the EOF offset cache
	 */
	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
	    (np->n_flag & NMODIFIED) == 0) {
		if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKCACHABLE(vp, ND_READ)) {
				nfsstats.direofcache_hits++;
				return (0);
			}
		} else if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp) == 0 &&
			np->n_mtime == vattr.va_mtime.tv_sec) {
			/* Directory mtime unchanged since it was cached. */
			nfsstats.direofcache_hits++;
			return (0);
		}
	}

	/*
	 * Call nfs_bioread() to do the real work.
	 */
	tresid = uio->uio_resid;
	error = nfs_bioread(vp, uio, 0, ap->a_cred);

	/* Success with nothing transferred is counted as an EOF-cache miss. */
	if (!error && uio->uio_resid == tresid)
		nfsstats.direofcache_misses++;
	return (error);
}

/*
 * Readdir rpc call.
 * Called from below the buffer cache by nfs_doio().
*/
/*
 * Arguments:
 *	vp    - directory vnode being read
 *	uiop  - destination; the check below requires a single iovec whose
 *		offset and residual are multiples of DIRBLKSIZ
 *	cred  - credential passed to nfsm_request()
 *
 * Returns NFSERR_BAD_COOKIE when no cookie is recorded for the starting
 * offset, EBADRPC when an entry carries a name length outside
 * 1..NFS_MAXNAMLEN, otherwise the value left in "error".
 *
 * NOTE(review): the nfsm_* macros are defined outside this chunk; they
 * appear to use the locals tl, cp, t1, t2, bpos, dpos, cp2, mreq, mrep,
 * md, mb, mb2 and error by name, and the nfsmout label at the end of the
 * function -- confirm before renaming any of them.
 */
int
nfs_readdirrpc(vp, uiop, cred)
	struct vnode *vp;
	register struct uio *uiop;
	struct ucred *cred;

{
	register int len, left;
	register struct dirent *dp = NULL;
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	register nfsuint64 *cookiep;
	caddr_t bpos, dpos, cp2;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	nfsuint64 cookie;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsnode *dnp = VTONFS(vp);
	u_quad_t fileno;
	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
	int attrflag;
	int v3 = NFS_ISV3(vp);

	/*
	 * NOTE(review): the guard is #ifndef, so this sanity check is
	 * compiled in when DIAGNOSTIC is NOT defined -- confirm that the
	 * polarity is intended.
	 */
#ifndef DIAGNOSTIC
	if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
		(uiop->uio_resid & (DIRBLKSIZ - 1)))
		panic("nfs readdirrpc bad uio");
#endif

	/*
	 * If there is no cookie, assume directory was stale.
	 */
	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
	if (cookiep)
		cookie = *cookiep;
	else
		return (NFSERR_BAD_COOKIE);
	/*
	 * Loop around doing readdir rpc's of size nm_readdirsize
	 * truncated to a multiple of DIRBLKSIZ.
	 * The stopping criteria is EOF or buffer full.
	 */
	while (more_dirs && bigenough) {
		nfsstats.rpccnt[NFSPROC_READDIR]++;
		nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) +
			NFSX_READDIR(v3));
		nfsm_fhtom(vp, v3);
		/* v3 sends a two-word cookie plus the cookie verifier. */
		if (v3) {
			nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
			*tl++ = cookie.nfsuquad[0];
			*tl++ = cookie.nfsuquad[1];
			*tl++ = dnp->n_cookieverf.nfsuquad[0];
			*tl++ = dnp->n_cookieverf.nfsuquad[1];
		} else {
			nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
			*tl++ = cookie.nfsuquad[0];
		}
		*tl = txdr_unsigned(nmp->nm_readdirsize);
		nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred);
		if (v3) {
			nfsm_postop_attr(vp, attrflag);
			if (!error) {
				/* Remember the verifier returned by the server. */
				nfsm_dissect(tl, u_int32_t *,
				    2 * NFSX_UNSIGNED);
				dnp->n_cookieverf.nfsuquad[0] = *tl++;
				dnp->n_cookieverf.nfsuquad[1] = *tl;
			} else {
				m_freem(mrep);
				goto nfsmout;
			}
		}
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		more_dirs = fxdr_unsigned(int, *tl);

		/* loop thru the dir entries, doctoring them to 4bsd form */
		while (more_dirs && bigenough) {
			if (v3) {
				nfsm_dissect(tl, u_int32_t *,
				    3 * NFSX_UNSIGNED);
				fileno = fxdr_hyper(tl);
				len = fxdr_unsigned(int, *(tl + 2));
			} else {
				nfsm_dissect(tl, u_int32_t *,
				    2 * NFSX_UNSIGNED);
				fileno = fxdr_unsigned(u_quad_t, *tl++);
				len = fxdr_unsigned(int, *tl);
			}
			if (len <= 0 || len > NFS_MAXNAMLEN) {
				error = EBADRPC;
				m_freem(mrep);
				goto nfsmout;
			}
			tlen = nfsm_rndup(len);
			if (tlen == len)
				tlen += 4;	/* To ensure null termination */
			/*
			 * If this record does not fit in what is left of the
			 * current DIRBLKSIZ block, grow the previous record
			 * (dp) to the end of the block and start a new one.
			 */
			left = DIRBLKSIZ - blksiz;
			if ((tlen + DIRHDSIZ) > left) {
				dp->d_reclen += left;
				uiop->uio_iov->iov_base += left;
				uiop->uio_iov->iov_len -= left;
				uiop->uio_offset += left;
				uiop->uio_resid -= left;
				blksiz = 0;
			}
			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
				bigenough = 0;
			if (bigenough) {
				/* Build the dirent header in place in the uio buffer. */
				dp = (struct dirent *)uiop->uio_iov->iov_base;
				dp->d_fileno = (int)fileno;
				dp->d_namlen = len;
				dp->d_reclen = tlen + DIRHDSIZ;
				dp->d_type = DT_UNKNOWN;
				blksiz += dp->d_reclen;
				if (blksiz == DIRBLKSIZ)
					blksiz = 0;
				uiop->uio_offset += DIRHDSIZ;
				uiop->uio_resid -= DIRHDSIZ;
				uiop->uio_iov->iov_base += DIRHDSIZ;
				uiop->uio_iov->iov_len -= DIRHDSIZ;
				nfsm_mtouio(uiop, len);
				cp = uiop->uio_iov->iov_base;
				tlen -= len;
				*cp = '\0';	/* null terminate */
				/* Skip the padding that follows the name. */
				uiop->uio_iov->iov_base += tlen;
				uiop->uio_iov->iov_len -= tlen;
				uiop->uio_offset += tlen;
				uiop->uio_resid -= tlen;
			} else
				nfsm_adv(nfsm_rndup(len));
			if (v3) {
				nfsm_dissect(tl, u_int32_t *,
				    3 * NFSX_UNSIGNED);
			} else {
				nfsm_dissect(tl, u_int32_t *,
				    2 * NFSX_UNSIGNED);
			}
			/*
			 * The cookie is only advanced for entries that were
			 * actually copied out; otherwise it is skipped.
			 */
			if (bigenough) {
				cookie.nfsuquad[0] = *tl++;
				if (v3)
					cookie.nfsuquad[1] = *tl++;
			} else if (v3)
				tl += 2;
			else
				tl++;
			more_dirs = fxdr_unsigned(int, *tl);
		}
		/*
		 * If at end of rpc data, get the eof boolean
		 */
		if (!more_dirs) {
			nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
			more_dirs = (fxdr_unsigned(int, *tl) == 0);
		}
		m_freem(mrep);
	}
	/*
	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
	 * by increasing d_reclen for the last record.
	 */
	if (blksiz > 0) {
		left = DIRBLKSIZ - blksiz;
		dp->d_reclen += left;
		uiop->uio_iov->iov_base += left;
		uiop->uio_iov->iov_len -= left;
		uiop->uio_offset += left;
		uiop->uio_resid -= left;
	}

	/*
	 * We are now either at the end of the directory or have filled the
	 * block.
	 */
	if (bigenough)
		dnp->n_direofoffset = uiop->uio_offset;
	else {
		if (uiop->uio_resid > 0)
			printf("EEK! readdirrpc resid > 0\n");
		/* Record the cookie to resume from at the new offset. */
		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
		*cookiep = cookie;
	}
nfsmout:
	return (error);
}

/*
 * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc().
*/
/*
 * Arguments and return values match nfs_readdirrpc(): vp is the directory,
 * uiop the destination (single iovec, DIRBLKSIZ aligned) and cred the
 * credential for the rpc.  In addition to filling in directory entries,
 * each entry that carries attributes and a file handle has its nfsnode
 * obtained, its attributes loaded, and is entered into the name cache.
 *
 * NOTE(review): the nfsm_* macros are defined outside this chunk; they
 * appear to use the locals tl, cp, t1, t2, bpos, dpos, cp2, mreq, mrep,
 * md, mb, mb2 and error by name, and the nfsmout label -- confirm before
 * renaming any of them.
 */
int
nfs_readdirplusrpc(vp, uiop, cred)
	struct vnode *vp;
	register struct uio *uiop;
	struct ucred *cred;
{
	register int len, left;
	register struct dirent *dp;
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	register struct vnode *newvp;
	register nfsuint64 *cookiep;
	caddr_t bpos, dpos, cp2, dpossav1, dpossav2;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2, *mdsav1, *mdsav2;
	struct nameidata nami, *ndp = &nami;
	struct componentname *cnp = &ndp->ni_cnd;
	nfsuint64 cookie;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsnode *dnp = VTONFS(vp), *np;
	nfsfh_t *fhp;
	u_quad_t fileno;
	int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
	int attrflag, fhsize;

#ifndef nolint
	dp = (struct dirent *)0;
#endif
	/*
	 * NOTE(review): the guard is #ifndef, so this sanity check is
	 * compiled in when DIAGNOSTIC is NOT defined -- confirm that the
	 * polarity is intended.
	 */
#ifndef DIAGNOSTIC
	if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
		(uiop->uio_resid & (DIRBLKSIZ - 1)))
		panic("nfs readdirplusrpc bad uio");
#endif
	ndp->ni_dvp = vp;
	newvp = NULLVP;

	/*
	 * If there is no cookie, assume directory was stale.
	 */
	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
	if (cookiep)
		cookie = *cookiep;
	else
		return (NFSERR_BAD_COOKIE);
	/*
	 * Loop around doing readdir rpc's of size nm_readdirsize
	 * truncated to a multiple of DIRBLKSIZ.
	 * The stopping criteria is EOF or buffer full.
	 */
	while (more_dirs && bigenough) {
		nfsstats.rpccnt[NFSPROC_READDIRPLUS]++;
		nfsm_reqhead(vp, NFSPROC_READDIRPLUS,
			NFSX_FH(1) + 6 * NFSX_UNSIGNED);
		nfsm_fhtom(vp, 1);
		/* cookie, cookie verifier, nm_readdirsize, nm_rsize */
		nfsm_build(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
		*tl++ = cookie.nfsuquad[0];
		*tl++ = cookie.nfsuquad[1];
		*tl++ = dnp->n_cookieverf.nfsuquad[0];
		*tl++ = dnp->n_cookieverf.nfsuquad[1];
		*tl++ = txdr_unsigned(nmp->nm_readdirsize);
		*tl = txdr_unsigned(nmp->nm_rsize);
		nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_procp, cred);
		nfsm_postop_attr(vp, attrflag);
		if (error) {
			m_freem(mrep);
			goto nfsmout;
		}
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		dnp->n_cookieverf.nfsuquad[0] = *tl++;
		dnp->n_cookieverf.nfsuquad[1] = *tl++;
		more_dirs = fxdr_unsigned(int, *tl);

		/* loop thru the dir entries, doctoring them to 4bsd form */
		while (more_dirs && bigenough) {
			nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
			fileno = fxdr_hyper(tl);
			len = fxdr_unsigned(int, *(tl + 2));
			if (len <= 0 || len > NFS_MAXNAMLEN) {
				error = EBADRPC;
				m_freem(mrep);
				goto nfsmout;
			}
			tlen = nfsm_rndup(len);
			if (tlen == len)
				tlen += 4;	/* To ensure null termination*/
			/*
			 * If this record does not fit in what is left of the
			 * current DIRBLKSIZ block, grow the previous record
			 * (dp) to the end of the block and start a new one.
			 */
			left = DIRBLKSIZ - blksiz;
			if ((tlen + DIRHDSIZ) > left) {
				dp->d_reclen += left;
				uiop->uio_iov->iov_base += left;
				uiop->uio_iov->iov_len -= left;
				uiop->uio_offset += left;
				uiop->uio_resid -= left;
				blksiz = 0;
			}
			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
				bigenough = 0;
			if (bigenough) {
				dp = (struct dirent *)uiop->uio_iov->iov_base;
				dp->d_fileno = (int)fileno;
				dp->d_namlen = len;
				dp->d_reclen = tlen + DIRHDSIZ;
				dp->d_type = DT_UNKNOWN;
				blksiz += dp->d_reclen;
				if (blksiz == DIRBLKSIZ)
					blksiz = 0;
				uiop->uio_offset += DIRHDSIZ;
				uiop->uio_resid -= DIRHDSIZ;
				uiop->uio_iov->iov_base += DIRHDSIZ;
				uiop->uio_iov->iov_len -= DIRHDSIZ;
				/*
				 * The componentname points at the name as it
				 * is about to be copied into the uio buffer.
				 */
				cnp->cn_nameptr = uiop->uio_iov->iov_base;
				cnp->cn_namelen = len;
				nfsm_mtouio(uiop, len);
				cp = uiop->uio_iov->iov_base;
				tlen -= len;
				*cp = '\0';
				uiop->uio_iov->iov_base += tlen;
				uiop->uio_iov->iov_len -= tlen;
				uiop->uio_offset += tlen;
				uiop->uio_resid -= tlen;
			} else
				nfsm_adv(nfsm_rndup(len));
			nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
			if (bigenough) {
				cookie.nfsuquad[0] = *tl++;
				cookie.nfsuquad[1] = *tl++;
			} else
				tl += 2;

			/*
			 * Since the attributes are before the file handle
			 * (sigh), we must skip over the attributes and then
			 * come back and get them.
			 */
			attrflag = fxdr_unsigned(int, *tl);
			if (attrflag) {
				/* Remember where the attributes start. */
				dpossav1 = dpos;
				mdsav1 = md;
				nfsm_adv(NFSX_V3FATTR);
				nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
				doit = fxdr_unsigned(int, *tl);
				if (doit) {
					nfsm_getfh(fhp, fhsize, 1);
					if (NFS_CMPFH(dnp, fhp, fhsize)) {
						/* The entry is the directory itself. */
						VREF(vp);
						newvp = vp;
						np = dnp;
					} else {
						error = nfs_nget(vp->v_mount, fhp,
						    fhsize, &np);
						if (error)
							doit = 0;
						else
							newvp = NFSTOV(np);
					}
				}
				if (doit && bigenough) {
					/*
					 * Rewind to the saved attribute
					 * position, load them, then restore
					 * the current parse position.
					 */
					dpossav2 = dpos;
					dpos = dpossav1;
					mdsav2 = md;
					md = mdsav1;
					nfsm_loadattr(newvp, (struct vattr *)0);
					dpos = dpossav2;
					md = mdsav2;
					dp->d_type =
					    IFTODT(VTTOIF(np->n_vattr.va_type));
					ndp->ni_vp = newvp;
					cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp);
				}
			} else {
				/* Just skip over the file handle */
				nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
				i = fxdr_unsigned(int, *tl);
				nfsm_adv(nfsm_rndup(i));
			}
			/* Drop the per-entry vnode reference, if one was taken. */
			if (newvp != NULLVP) {
				if (newvp == vp)
					vrele(newvp);
				else
					vput(newvp);
				newvp = NULLVP;
			}
			nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
			more_dirs = fxdr_unsigned(int, *tl);
		}
		/*
		 * If at end of rpc data, get the eof boolean
		 */
		if (!more_dirs) {
			nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
			more_dirs = (fxdr_unsigned(int, *tl) == 0);
		}
		m_freem(mrep);
	}
	/*
	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
	 * by increasing d_reclen for the last record.
	 */
	if (blksiz > 0) {
		left = DIRBLKSIZ - blksiz;
		dp->d_reclen += left;
		uiop->uio_iov->iov_base += left;
		uiop->uio_iov->iov_len -= left;
		uiop->uio_offset += left;
		uiop->uio_resid -= left;
	}

	/*
	 * We are now either at the end of the directory or have filled the
	 * block.
	 */
	if (bigenough)
		dnp->n_direofoffset = uiop->uio_offset;
	else {
		if (uiop->uio_resid > 0)
			printf("EEK! readdirplusrpc resid > 0\n");
		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
		*cookiep = cookie;
	}
nfsmout:
	/* Error exits can arrive here still holding a per-entry vnode. */
	if (newvp != NULLVP) {
		if (newvp == vp)
			vrele(newvp);
		else
			vput(newvp);
		newvp = NULLVP;
	}
	return (error);
}

/*
 * Silly rename. To make the NFS filesystem that is stateless look a little
 * more like the "ufs" a remove of an active vnode is translated to a rename
 * to a funny looking filename that is removed by nfs_inactive on the
 * nfsnode. There is the potential for another process on a different client
 * to create the same funny name between the nfs_lookitup() fails and the
 * nfs_rename() completes, but...
 *
 * Arguments: dvp is the directory, vp the vnode being renamed away and cnp
 * its current name.  On success the allocated sillyrename record is hung
 * off the nfsnode (n_sillyrename) and 0 is returned.  On failure the record,
 * its credential and its directory reference are released and the error is
 * returned; EINVAL means no unused name was found.
 */
static int
nfs_sillyrename(dvp, vp, cnp)
	struct vnode *dvp, *vp;
	struct componentname *cnp;
{
	register struct sillyrename *sp;
	struct nfsnode *np;
	int error;
	short pid;

	cache_purge(dvp);
	np = VTONFS(vp);
	/*
	 * NOTE(review): the guard is #ifndef, so this check is compiled in
	 * when DIAGNOSTIC is NOT defined -- confirm that the polarity is
	 * intended.
	 */
#ifndef DIAGNOSTIC
	if (vp->v_type == VDIR)
		panic("nfs: sillyrename dir");
#endif
	MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename),
		M_NFSREQ, M_WAITOK);
	sp->s_cred = crdup(cnp->cn_cred);
	sp->s_dvp = dvp;
	VREF(dvp);

	/* Fudge together a funny name */
	pid = cnp->cn_proc->p_pid;
	sp->s_namlen = sprintf(sp->s_name, ".nfsA%04x4.4", pid);

	/* Try lookitups until we get one that isn't there */
	while (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
		cnp->cn_proc, (struct nfsnode **)0) == 0) {
		/* Bump the letter at index 4 (initially 'A'); give up past 'z'. */
		sp->s_name[4]++;
		if (sp->s_name[4] > 'z') {
			error = EINVAL;
			goto bad;
		}
	}
	error = nfs_renameit(dvp, cnp, sp);
	if (error)
		goto bad;
	/*
	 * NOTE(review): the result of this lookup is stored in "error" but
	 * the function returns 0 regardless -- confirm this is deliberate.
	 */
	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
		cnp->cn_proc, &np);
	np->n_sillyrename = sp;
	return (0);
bad:
	vrele(sp->s_dvp);
	crfree(sp->s_cred);
	free((caddr_t)sp, M_NFSREQ);
	return (error);
}

/*
 * Look up a file name and optionally either update the file handle or
 * allocate an nfsnode, depending on the value of npp.
* npp == NULL	--> just do the lookup
 * *npp == NULL --> allocate a new nfsnode and make sure attributes are
 *			handled too
 * *npp != NULL --> update the file handle in the vnode
 */
/*
 * Arguments: dvp is the directory, name/len the component to look up, and
 * cred/procp are passed to nfsm_request().
 *
 * Returns the rpc error, the nfs_nget() error, or ENOENT when a v3 reply
 * carried no attributes and a new nfsnode was requested (*npp == NULL).
 *
 * NOTE(review): the nfsm_* macros are defined outside this chunk; they
 * appear to use the locals tl, cp, t1, t2, bpos, dpos, cp2, mreq, mrep,
 * md, mb, mb2 and error by name -- confirm before renaming any of them.
 */
static int
nfs_lookitup(dvp, name, len, cred, procp, npp)
	register struct vnode *dvp;
	const char *name;
	int len;
	struct ucred *cred;
	struct proc *procp;
	struct nfsnode **npp;
{
	register u_int32_t *tl;
	register caddr_t cp;
	register int32_t t1, t2;
	struct vnode *newvp = (struct vnode *)0;
	struct nfsnode *np, *dnp = VTONFS(dvp);
	caddr_t bpos, dpos, cp2;
	int error = 0, fhlen, attrflag;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
	nfsfh_t *nfhp;
	int v3 = NFS_ISV3(dvp);

	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
	nfsm_reqhead(dvp, NFSPROC_LOOKUP,
		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(name, len, NFS_MAXNAMLEN);
	nfsm_request(dvp, NFSPROC_LOOKUP, procp, cred);
	if (npp && !error) {
		nfsm_getfh(nfhp, fhlen, v3);
		if (*npp) {
		    /*
		     * Update the handle of the caller's nfsnode in place,
		     * switching between the embedded small buffer (n_fh) and
		     * a separately allocated big one when the size class
		     * changes.
		     *
		     * NOTE(review): when both the old and the new handle are
		     * larger than NFS_SMALLFH neither branch runs, so the
		     * bcopy reuses the existing big buffer -- confirm that
		     * buffer is always large enough for fhlen.
		     */
		    np = *npp;
		    if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) {
			free((caddr_t)np->n_fhp, M_NFSBIGFH);
			np->n_fhp = &np->n_fh;
		    } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH)
			np->n_fhp =(nfsfh_t *)malloc(fhlen,M_NFSBIGFH,M_WAITOK);
		    bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen);
		    np->n_fhsize = fhlen;
		    newvp = NFSTOV(np);
		} else if (NFS_CMPFH(dnp, nfhp, fhlen)) {
		    /* The name resolved to the directory itself. */
		    VREF(dvp);
		    newvp = dvp;
		} else {
		    error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np);
		    if (error) {
			m_freem(mrep);
			return (error);
		    }
		    newvp = NFSTOV(np);
		}
		if (v3) {
			nfsm_postop_attr(newvp, attrflag);
			if (!attrflag && *npp == NULL) {
				m_freem(mrep);
				if (newvp == dvp)
					vrele(newvp);
				else
					vput(newvp);
				return (ENOENT);
			}
		} else
			nfsm_loadattr(newvp, (struct vattr *)0);
	}
	nfsm_reqdone;
	/* Only the "allocate a new nfsnode" mode hands np back or cleans up. */
	if (npp && *npp == NULL) {
		if (error) {
			if (newvp) {
				if (newvp == dvp)
					vrele(newvp);
				else
					vput(newvp);
			}
		} else
			*npp = np;
	}
	return (error);
}

/*
 * Nfs Version 3 commit rpc
 *
 * Asks the server to commit cnt bytes starting at offset of the file vp.
 * Returns 0 without doing an rpc when the mount has no write verifier yet
 * (NFSSTA_HASWRITEVERF clear).  If the verifier in the reply differs from
 * the one stored in the mount, the stored copy is updated and
 * NFSERR_STALEWRITEVERF is returned.
 */
int
nfs_commit(vp, offset, cnt, cred, procp)
	struct vnode *vp;
	u_quad_t offset;
	int cnt;
	struct ucred *cred;
	struct proc *procp;
{
	register caddr_t cp;
	register u_int32_t *tl;
	register int32_t t1, t2;
	register struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	caddr_t bpos, dpos, cp2;
	int error = 0, wccflag = NFSV3_WCCRATTR;
	struct mbuf *mreq, *mrep, *md, *mb, *mb2;

	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0)
		return (0);
	nfsstats.rpccnt[NFSPROC_COMMIT]++;
	nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1));
	nfsm_fhtom(vp, 1);
	/* Two words of offset followed by one word of count. */
	nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
	txdr_hyper(offset, tl);
	tl += 2;
	*tl = txdr_unsigned(cnt);
	nfsm_request(vp, NFSPROC_COMMIT, procp, cred);
	nfsm_wcc_data(vp, wccflag);
	if (!error) {
		nfsm_dissect(tl, u_int32_t *, NFSX_V3WRITEVERF);
		if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl,
			NFSX_V3WRITEVERF)) {
			bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
				NFSX_V3WRITEVERF);
			error = NFSERR_STALEWRITEVERF;
		}
	}
	nfsm_reqdone;
	return (error);
}

/*
 * Kludge City..
 * - make nfs_bmap() essentially a no-op that does no translation
 * - do nfs_strategy() by doing I/O with nfs_readrpc/nfs_writerpc
 *   (Maybe I could use the process's page mapping, but I was concerned that
 *    Kernel Write might not be enabled and also figured copyout() would do
 *    a lot more work than bcopy() and also it currently happens in the
 *    context of the swapper process (2).
 *
 * Each output pointer is optional (may be NULL):
 *	*a_vpp  receives the vnode itself,
 *	*a_bnp  receives a_bn scaled by btodb() of the mount's f_iosize,
 *	*a_runp and *a_runb are set to 0.
 * Always returns 0.
 */
static int
nfs_bmap(ap)
	struct vop_bmap_args /* {
		struct vnode *a_vp;
		daddr_t  a_bn;
		struct vnode **a_vpp;
		daddr_t *a_bnp;
		int *a_runp;
		int *a_runb;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;

	if (ap->a_vpp != NULL)
		*ap->a_vpp = vp;
	if (ap->a_bnp != NULL)
		*ap->a_bnp = ap->a_bn * btodb(vp->v_mount->mnt_stat.f_iosize);
	if (ap->a_runp != NULL)
		*ap->a_runp = 0;
	if (ap->a_runb != NULL)
		*ap->a_runb = 0;
	return (0);
}

/*
 * Strategy routine.
 * For async requests when nfsiod(s) are running, queue the request by
 * calling nfs_asyncio(), otherwise just all nfs_doio() to do the
 * request.
*/ static int nfs_strategy(ap) struct vop_strategy_args *ap; { register struct buf *bp = ap->a_bp; struct ucred *cr; struct proc *p; int error = 0; KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp)); KASSERT(BUF_REFCNT(bp) > 0, ("nfs_strategy: buffer %p not locked", bp)); if (bp->b_flags & B_PHYS) panic("nfs physio"); if (bp->b_flags & B_ASYNC) p = (struct proc *)0; else p = curproc; /* XXX */ if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. */ if ((bp->b_flags & B_ASYNC) == 0 || nfs_asyncio(bp, NOCRED, p)) error = nfs_doio(bp, cr, p); return (error); } /* * fsync vnode op. Just call nfs_flush() with commit == 1. */ /* ARGSUSED */ static int nfs_fsync(ap) struct vop_fsync_args /* { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct proc * a_p; } */ *ap; { return (nfs_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_p, 1)); } /* * Flush all the blocks associated with a vnode. * Walk through the buffer pool and push any dirty pages * associated with the vnode. 
*/ static int nfs_flush(vp, cred, waitfor, p, commit) register struct vnode *vp; struct ucred *cred; int waitfor; struct proc *p; int commit; { register struct nfsnode *np = VTONFS(vp); register struct buf *bp; register int i; struct buf *nbp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos; int passone = 1; u_quad_t off, endoff, toff; struct ucred* wcred = NULL; struct buf **bvec = NULL; #ifndef NFS_COMMITBVECSIZ #define NFS_COMMITBVECSIZ 20 #endif struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; int bvecsize = 0, bveccount; if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; if (!commit) passone = 0; /* * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the * server, but nas not been committed to stable storage on the server * yet. On the first pass, the byte range is worked out and the commit * rpc is done. On the second pass, nfs_writebp() is called to do the * job. */ again: off = (u_quad_t)-1; endoff = 0; bvecpos = 0; if (NFS_ISV3(vp) && commit) { s = splbio(); /* * Count up how many buffers waiting for a commit. */ bveccount = 0; for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bveccount++; } /* * Allocate space to remember the list of bufs to commit. It is * important to use M_NOWAIT here to avoid a race with nfs_write. * If we can't get memory (for whatever reason), we will end up * committing the buffers one-by-one in the loop below. 
*/ if (bveccount > NFS_COMMITBVECSIZ) { if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); bvec = (struct buf **) malloc(bveccount * sizeof(struct buf *), M_TEMP, M_NOWAIT); if (bvec == NULL) { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } else bvecsize = bveccount; } else { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bvecpos >= bvecsize) break; if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_NEEDCOMMIT) || BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; bremfree(bp); /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. * * NOTE: we are not clearing B_DONE here, so we have * to do it later on in this routine if we intend to * initiate I/O on the bp. * * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ if (wcred == NULL) wcred = bp->b_wcred; else if (wcred != bp->b_wcred) wcred = NOCRED; bp->b_flags |= B_WRITEINPROG; vfs_busy_pages(bp, 1); /* * bp is protected by being locked, but nbp is not * and vfs_busy_pages() may sleep. We have to * recalculate nbp. */ nbp = TAILQ_NEXT(bp, b_vnbufs); /* * A list of these buffers is kept so that the * second loop knows which buffers have actually * been committed. This is necessary, since there * may be a race between the commit rpc and new * uncommitted writes on the file. */ bvec[bvecpos++] = bp; toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; if (toff < off) off = toff; toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); if (toff > endoff) endoff = toff; } splx(s); } if (bvecpos > 0) { /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with * one call for all of them, otherwise commit each one * separately. 
*/ if (wcred != NOCRED) retv = nfs_commit(vp, off, (int)(endoff - off), wcred, p); else { retv = 0; for (i = 0; i < bvecpos; i++) { off_t off, size; bp = bvec[i]; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; size = (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); retv = nfs_commit(vp, off, (int)size, bp->b_wcred, p); if (retv) break; } } if (retv == NFSERR_STALEWRITEVERF) nfs_clearcommit(vp->v_mount); /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit * succeeded. */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK); if (retv) { /* * Error, leave B_DELWRI intact */ vfs_unbusy_pages(bp); brelse(bp); } else { /* * Success, remove B_DELWRI ( bundirty() ). * * b_dirtyoff/b_dirtyend seem to be NFS * specific. We should probably move that * into bundirty(). XXX */ s = splbio(); vp->v_numoutput++; bp->b_flags |= B_ASYNC; bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_dirtyoff = bp->b_dirtyend = 0; splx(s); bufdone(bp); } } } /* * Start/do any write(s) that are required. 
*/ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { if (waitfor != MNT_WAIT || passone) continue; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "nfsfsync", slpflag, slptimeo); splx(s); if (error == 0) panic("nfs_fsync: inconsistent lock"); if (error == ENOLCK) goto loop; if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } goto loop; } if ((bp->b_flags & B_DELWRI) == 0) panic("nfs_fsync: not dirty"); if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) { BUF_UNLOCK(bp); continue; } bremfree(bp); if (passone || !commit) bp->b_flags |= B_ASYNC; else bp->b_flags |= B_ASYNC | B_WRITEINPROG; splx(s); BUF_WRITE(bp); goto loop; } splx(s); if (passone) { passone = 0; goto again; } if (waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); if (error) { if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } } if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) && commit) { goto loop; } } if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } done: if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); return (error); } /* * NFS advisory byte-level locks. * Currently unsupported. */ static int nfs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * The following kludge is to allow diskless support to work * until a real NFS lockd is implemented. Basically, just pretend * that this is a local lock. */ return (lf_advlock(ap, &(np->n_lockf), np->n_size)); } /* * Print out the contents of an nfsnode. 
*/
/*
 * Prints the nfsnode's cached fileid and fsid, adds fifo information when
 * the vnode is a VFIFO, and ends the line.  Always returns 0.
 */
static int
nfs_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct nfsnode *np = VTONFS(vp);

	printf("tag VT_NFS, fileid %ld fsid 0x%x",
		np->n_vattr.va_fileid, np->n_vattr.va_fsid);
	if (vp->v_type == VFIFO)
		fifo_printinfo(vp);
	printf("\n");
	return (0);
}

/*
- * Just call nfs_writebp() with the force argument set to 1.
- *
- * NOTE: B_DONE may or may not be set in a_bp on call.
- */
-static int
-nfs_bwrite(ap)
-	struct vop_bwrite_args /* {
-		struct vnode *a_bp;
-	} */ *ap;
-{
-	return (nfs_writebp(ap->a_bp, 1, curproc));
-}
-
-/*
- * This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
- * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set
- * B_CACHE if this is a VMIO buffer.
+ * This is the "real" nfs::bwrite(struct buf*).
+ * B_WRITEINPROG isn't set unless the force flag is one and it
+ * handles the B_NEEDCOMMIT flag.
+ * We set B_CACHE if this is a VMIO buffer.
 */
/*
 * Arguments:
 *	bp     - buffer to write; it must already be locked (panics otherwise)
 *	force  - non-zero sets B_WRITEINPROG before the strategy call
 *	procp  - not referenced in this function body (curproc is used for
 *		 the accounting below)
 *
 * Returns 0 for a B_INVAL buffer (which is simply released) and for an
 * asynchronous write; for a synchronous write returns the bufwait() result.
 */
int
nfs_writebp(bp, force, procp)
	register struct buf *bp;
	int force;
	struct proc *procp;
{
	int s;
	/* Snapshot taken before bundirty() and the strategy call change b_flags. */
	int oldflags = bp->b_flags;
#if 0
	int retv = 1;
	off_t off;
#endif

	if (BUF_REFCNT(bp) == 0)
		panic("bwrite: buffer is not locked???");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return(0);
	}

	bp->b_flags |= B_CACHE;

	/*
	 * Undirty the bp.  We will redirty it later if the I/O fails.
	 */

	s = splbio();
	bundirty(bp);
	bp->b_flags &= ~B_DONE;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_iocmd = BIO_WRITE;

	bp->b_vp->v_numoutput++;
	curproc->p_stats->p_ru.ru_oublock++;
	splx(s);

	/*
	 * Note: to avoid loopback deadlocks, we do not
	 * assign b_runningbufspace.
	 */
	vfs_busy_pages(bp, 1);

	if (force)
		bp->b_flags |= B_WRITEINPROG;
	BUF_KERNPROC(bp);
	BUF_STRATEGY(bp);

	/* Synchronous write: wait for completion and release the buffer. */
	if( (oldflags & B_ASYNC) == 0) {
		int rtval = bufwait(bp);

		if (oldflags & B_DELWRI) {
			s = splbio();
			reassignbuf(bp, bp->b_vp);
			splx(s);
		}

		brelse(bp);
		return (rtval);
	}

	return (0);
}

/*
 * nfs special file access vnode op.
* Essentially just get vattr and then imitate iaccess() since the device is * local to the client. */ static int nfsspec_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vattr *vap; register gid_t *gp; register struct ucred *cred = ap->a_cred; struct vnode *vp = ap->a_vp; mode_t mode = ap->a_mode; struct vattr vattr; register int i; int error; /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * If you're the super-user, * you always get access. */ if (cred->cr_uid == 0) return (0); vap = &vattr; error = VOP_GETATTR(vp, vap, cred, ap->a_p); if (error) return (error); /* * Access check is based on only one of owner, group, public. * If not owner, then check group. If not a member of the * group, then check public access. */ if (cred->cr_uid != vap->va_uid) { mode >>= 3; gp = cred->cr_groups; for (i = 0; i < cred->cr_ngroups; i++, gp++) if (vap->va_gid == *gp) goto found; mode >>= 3; found: ; } error = (vap->va_mode & mode) == mode ? 0 : EACCES; return (error); } /* * Read wrapper for special devices. */ static int nfsspec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set access flag. */ np->n_flag |= NACC; getnanotime(&np->n_atim); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for special devices. */ static int nfsspec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. 
*/ np->n_flag |= NUPD; getnanotime(&np->n_mtim); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for special devices. * * Update the times on the nfsnode then do device close. */ static int nfsspec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); struct vattr vattr; if (np->n_flag & (NACC | NUPD)) { np->n_flag |= NCHG; if (vp->v_usecount == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); } } return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifos. */ static int nfsfifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set access flag. */ np->n_flag |= NACC; getnanotime(&np->n_atim); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for fifos. */ static int nfsfifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. */ np->n_flag |= NUPD; getnanotime(&np->n_mtim); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for fifos. * * Update the times on the nfsnode then do fifo close. 
*/ static int nfsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); struct vattr vattr; struct timespec ts; if (np->n_flag & (NACC | NUPD)) { getnanotime(&ts); if (np->n_flag & NACC) np->n_atim = ts; if (np->n_flag & NUPD) np->n_mtim = ts; np->n_flag |= NCHG; if (vp->v_usecount == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); } } return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); } Index: head/sys/ntfs/ntfs_vnops.c =================================================================== --- head/sys/ntfs/ntfs_vnops.c (revision 75579) +++ head/sys/ntfs/ntfs_vnops.c (revision 75580) @@ -1,943 +1,942 @@ /* $NetBSD: ntfs_vnops.c,v 1.23 1999/10/31 19:45:27 jdolecek Exp $ */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__NetBSD__) #include #endif #include #include #include #if defined(__FreeBSD__) #include #endif #include #include /*#define NTFS_DEBUG 1*/ #include #include #include #if defined(__NetBSD__) #include #include #endif #include /* for pathconf(2) constants */ static int ntfs_read __P((struct vop_read_args *)); static int ntfs_write __P((struct vop_write_args *ap)); static int ntfs_getattr __P((struct vop_getattr_args *ap)); static int ntfs_inactive __P((struct vop_inactive_args *ap)); static int ntfs_print __P((struct vop_print_args *ap)); static int ntfs_reclaim __P((struct vop_reclaim_args *ap)); static int ntfs_strategy __P((struct vop_strategy_args *ap)); static int ntfs_access __P((struct vop_access_args *ap)); static int ntfs_open __P((struct vop_open_args *ap)); static int ntfs_close __P((struct vop_close_args *ap)); static int ntfs_readdir __P((struct vop_readdir_args 
*ap)); static int ntfs_lookup __P((struct vop_lookup_args *ap)); static int ntfs_bmap __P((struct vop_bmap_args *ap)); #if defined(__FreeBSD__) static int ntfs_getpages __P((struct vop_getpages_args *ap)); static int ntfs_putpages __P((struct vop_putpages_args *)); static int ntfs_fsync __P((struct vop_fsync_args *ap)); #else static int ntfs_bypass __P((struct vop_generic_args *ap)); #endif static int ntfs_pathconf __P((void *)); int ntfs_prtactive = 1; /* 1 => print out reclaim of active vnodes */ #if defined(__FreeBSD__) int ntfs_getpages(ap) struct vop_getpages_args *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } int ntfs_putpages(ap) struct vop_putpages_args *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } #endif /* * This is a noop, simply returning what one has been given. */ int ntfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { dprintf(("ntfs_bmap: vn: %p, blk: %d\n", ap->a_vp,(u_int32_t)ap->a_bn)); if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; #if !defined(__NetBSD__) if (ap->a_runb != NULL) *ap->a_runb = 0; #endif return (0); } static int ntfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; u_int64_t toread; int error; dprintf(("ntfs_read: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); dprintf(("ntfs_read: filesize: %d",(u_int32_t)fp->f_size)); /* don't allow reading after end of file */ if (uio->uio_offset > fp->f_size) toread = 0; else toread = min( 
uio->uio_resid, fp->f_size - uio->uio_offset ); dprintf((", toread: %d\n",(u_int32_t)toread)); if (toread == 0) return (0); error = ntfs_readattr(ntmp, ip, fp->f_attrtype, fp->f_attrname, uio->uio_offset, toread, NULL, uio); if (error) { printf("ntfs_read: ntfs_readattr failed: %d\n",error); return (error); } return (0); } #if !defined(__FreeBSD__) static int ntfs_bypass(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { int error = ENOTTY; dprintf(("ntfs_bypass: %s\n", ap->a_desc->vdesc_name)); return (error); } #endif static int ntfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); register struct vattr *vap = ap->a_vap; dprintf(("ntfs_getattr: %d, flags: %d\n",ip->i_number,ip->i_flag)); #if defined(__FreeBSD__) vap->va_fsid = dev2udev(ip->i_dev); #else /* NetBSD */ vap->va_fsid = ip->i_dev; #endif vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mp->ntm_mode; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_mp->ntm_uid; vap->va_gid = ip->i_mp->ntm_gid; vap->va_rdev = 0; /* XXX UNODEV ? */ vap->va_size = fp->f_size; vap->va_bytes = fp->f_allocated; vap->va_atime = ntfs_nttimetounix(fp->f_times.t_access); vap->va_mtime = ntfs_nttimetounix(fp->f_times.t_write); vap->va_ctime = ntfs_nttimetounix(fp->f_times.t_create); vap->va_flags = ip->i_flag; vap->va_gen = 0; vap->va_blocksize = ip->i_mp->ntm_spc * ip->i_mp->ntm_bps; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Last reference to an ntnode. If necessary, write or delete it. 
*/ int ntfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; #ifdef NTFS_DEBUG register struct ntnode *ip = VTONT(vp); #endif dprintf(("ntfs_inactive: vnode: %p, ntnode: %d\n", vp, ip->i_number)); if (ntfs_prtactive && vp->v_usecount != 0) vprint("ntfs_inactive: pushing active", vp); VOP__UNLOCK(vp, 0, ap->a_p); /* XXX since we don't support any filesystem changes * right now, nothing more needs to be done */ return (0); } /* * Reclaim an fnode/ntnode so that it can be used for other purposes. */ int ntfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); int error; dprintf(("ntfs_reclaim: vnode: %p, ntnode: %d\n", vp, ip->i_number)); if (ntfs_prtactive && vp->v_usecount != 0) vprint("ntfs_reclaim: pushing active", vp); if ((error = ntfs_ntget(ip)) != 0) return (error); /* Purge old data structures associated with the inode. */ cache_purge(vp); if (ip->i_devvp) { vrele(ip->i_devvp); ip->i_devvp = NULL; } ntfs_frele(fp); ntfs_ntput(ip); vp->v_data = NULL; return (0); } static int ntfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. 
*/ int ntfs_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = bp->b_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct ntfsmount *ntmp = ip->i_mp; int error; #ifdef __FreeBSD__ dprintf(("ntfs_strategy: offset: %d, blkno: %d, lblkno: %d\n", (u_int32_t)bp->b_offset,(u_int32_t)bp->b_blkno, (u_int32_t)bp->b_lblkno)); #else dprintf(("ntfs_strategy: blkno: %d, lblkno: %d\n", (u_int32_t)bp->b_blkno, (u_int32_t)bp->b_lblkno)); #endif dprintf(("strategy: bcount: %d flags: 0x%lx\n", (u_int32_t)bp->b_bcount,bp->b_flags)); if (bp->b_iocmd == BIO_READ) { u_int32_t toread; if (ntfs_cntob(bp->b_blkno) >= fp->f_size) { clrbuf(bp); error = 0; } else { toread = min(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: toread: %d, fsize: %d\n", toread,(u_int32_t)fp->f_size)); error = ntfs_readattr(ntmp, ip, fp->f_attrtype, fp->f_attrname, ntfs_cntob(bp->b_blkno), toread, bp->b_data, NULL); if (error) { printf("ntfs_strategy: ntfs_readattr failed\n"); bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } bzero(bp->b_data + toread, bp->b_bcount - toread); } } else { size_t tmp; u_int32_t towrite; if (ntfs_cntob(bp->b_blkno) + bp->b_bcount >= fp->f_size) { printf("ntfs_strategy: CAN'T EXTEND FILE\n"); bp->b_error = error = EFBIG; bp->b_ioflags |= BIO_ERROR; } else { towrite = min(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: towrite: %d, fsize: %d\n", towrite,(u_int32_t)fp->f_size)); error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype, fp->f_attrname, ntfs_cntob(bp->b_blkno),towrite, bp->b_data, &tmp, NULL); if (error) { printf("ntfs_strategy: ntfs_writeattr fail\n"); bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } } } bufdone(bp); return (error); } static int ntfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode 
*vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; u_int64_t towrite; size_t written; int error; dprintf(("ntfs_write: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); dprintf(("ntfs_write: filesize: %d",(u_int32_t)fp->f_size)); if (uio->uio_resid + uio->uio_offset > fp->f_size) { printf("ntfs_write: CAN'T WRITE BEYOND END OF FILE\n"); return (EFBIG); } towrite = min(uio->uio_resid, fp->f_size - uio->uio_offset); dprintf((", towrite: %d\n",(u_int32_t)towrite)); error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype, fp->f_attrname, uio->uio_offset, towrite, NULL, &written, uio); #ifdef NTFS_DEBUG if (error) printf("ntfs_write: ntfs_writeattr failed: %d\n", error); #endif return (error); } int ntfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct ntnode *ip = VTONT(vp); mode_t mode = ap->a_mode; #ifdef QUOTA int error; #endif dprintf(("ntfs_access: %d\n",ip->i_number)); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch ((int)vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if (error = getinoquota(ip)) return (error); #endif break; } } return (vaccess(vp->v_type, ip->i_mp->ntm_mode, ip->i_mp->ntm_uid, ip->i_mp->ntm_gid, ap->a_mode, ap->a_cred, NULL)); } /* * Open called. * * Nothing to do. 
*/ /* ARGSUSED */ static int ntfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_open: %d\n",ip->i_number); #endif /* * Files marked append-only must be opened for appending. */ return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int ntfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_close: %d\n",ip->i_number); #endif return (0); } int ntfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_ncookies; u_int **cookies; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; int i, error = 0; u_int32_t faked = 0, num; int ncookies = 0; struct dirent cde; off_t off; dprintf(("ntfs_readdir %d off: %d resid: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid)); off = uio->uio_offset; /* Simulate . in every dir except ROOT */ if( ip->i_number != NTFS_ROOTINO ) { struct dirent dot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 1, "." }; if( uio->uio_offset < sizeof(struct dirent) ) { dot.d_fileno = ip->i_number; error = uiomove((char *)&dot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } } /* Simulate .. in every dir including ROOT */ if( uio->uio_offset < 2 * sizeof(struct dirent) ) { struct dirent dotdot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 2, ".." }; error = uiomove((char *)&dotdot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } faked = (ip->i_number == NTFS_ROOTINO) ? 
1 : 2; num = uio->uio_offset / sizeof(struct dirent) - faked; while( uio->uio_resid >= sizeof(struct dirent) ) { struct attr_indexentry *iep; error = ntfs_ntreaddir(ntmp, fp, num, &iep); if(error) return (error); if( NULL == iep ) break; for(; !(iep->ie_flag & NTFS_IEFLAG_LAST) && (uio->uio_resid >= sizeof(struct dirent)); iep = NTFS_NEXTREC(iep, struct attr_indexentry *)) { if(!ntfs_isnamepermitted(ntmp,iep)) continue; for(i=0; iie_fnamelen; i++) { cde.d_name[i] = ntfs_u28(iep->ie_fname[i]); } cde.d_name[i] = '\0'; dprintf(("ntfs_readdir: elem: %d, fname:[%s] type: %d, flag: %d, ", num, cde.d_name, iep->ie_fnametype, iep->ie_flag)); cde.d_namlen = iep->ie_fnamelen; cde.d_fileno = iep->ie_number; cde.d_type = (iep->ie_fflag & NTFS_FFLAG_DIR) ? DT_DIR : DT_REG; cde.d_reclen = sizeof(struct dirent); dprintf(("%s\n", (cde.d_type == DT_DIR) ? "dir":"reg")); error = uiomove((char *)&cde, sizeof(struct dirent), uio); if(error) return (error); ncookies++; num++; } } dprintf(("ntfs_readdir: %d entries (%d bytes) read\n", ncookies,(u_int)(uio->uio_offset - off))); dprintf(("ntfs_readdir: off: %d resid: %d\n", (u_int32_t)uio->uio_offset,uio->uio_resid)); if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dp; #if defined(__FreeBSD__) u_long *cookies; u_long *cookiep; #else /* defined(__NetBSD__) */ off_t *cookies; off_t *cookiep; #endif ddprintf(("ntfs_readdir: %d cookies\n",ncookies)); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ntfs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) ((caddr_t)uio->uio_iov->iov_base - (uio->uio_offset - off)); #if defined(__FreeBSD__) MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); #else /* defined(__NetBSD__) */ MALLOC(cookies, off_t *, ncookies * sizeof(off_t), M_TEMP, M_WAITOK); #endif for (dp = dpStart, cookiep = cookies, i=0; i < ncookies; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen), i++) { off += dp->d_reclen; *cookiep++ = (u_int) 
off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } /* if (ap->a_eofflag) *ap->a_eofflag = VTONT(ap->a_vp)->i_size <= uio->uio_offset; */ return (error); } int ntfs_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct ntnode *dip = VTONT(dvp); struct ntfsmount *ntmp = dip->i_mp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int error; int lockparent = cnp->cn_flags & LOCKPARENT; #if NTFS_DEBUG int wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); #endif dprintf(("ntfs_lookup: \"%.*s\" (%ld bytes) in %d, lp: %d, wp: %d \n", (int)cnp->cn_namelen, cnp->cn_nameptr, cnp->cn_namelen, dip->i_number, lockparent, wantparent)); error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_proc); if(error) return (error); if ((cnp->cn_flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); #ifdef __NetBSD__ /* * We now have a segment name to search for, and a directory * to search. * * Before tediously performing a linear scan of the directory, * check the name cache to see if the directory/name pair * we are looking for is known already. */ if ((error = cache_lookup(ap->a_dvp, ap->a_vpp, cnp)) >= 0) return (error); #endif if(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { dprintf(("ntfs_lookup: faking . directory in %d\n", dip->i_number)); VREF(dvp); *ap->a_vpp = dvp; error = 0; } else if (cnp->cn_flags & ISDOTDOT) { struct ntvattr *vap; dprintf(("ntfs_lookup: faking .. 
directory in %d\n", dip->i_number)); error = ntfs_ntvattrget(ntmp, dip, NTFS_A_NAME, NULL, 0, &vap); if(error) return (error); VOP__UNLOCK(dvp,0,cnp->cn_proc); cnp->cn_flags |= PDIRUNLOCK; dprintf(("ntfs_lookup: parentdir: %d\n", vap->va_a_name->n_pnumber)); error = VFS_VGET(ntmp->ntm_mountp, vap->va_a_name->n_pnumber,ap->a_vpp); ntfs_ntvattrrele(vap); if (error) { if (VN_LOCK(dvp,LK_EXCLUSIVE|LK_RETRY,cnp->cn_proc)==0) cnp->cn_flags &= ~PDIRUNLOCK; return (error); } if (lockparent && (cnp->cn_flags & ISLASTCN)) { error = VN_LOCK(dvp, LK_EXCLUSIVE, cnp->cn_proc); if (error) { vput( *(ap->a_vpp) ); return (error); } cnp->cn_flags &= ~PDIRUNLOCK; } } else { error = ntfs_ntlookupfile(ntmp, dvp, cnp, ap->a_vpp); if (error) { dprintf(("ntfs_ntlookupfile: returned %d\n", error)); return (error); } dprintf(("ntfs_lookup: found ino: %d\n", VTONT(*ap->a_vpp)->i_number)); if(!lockparent || !(cnp->cn_flags & ISLASTCN)) VOP__UNLOCK(dvp, 0, cnp->cn_proc); } if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *ap->a_vpp, cnp); return (error); } #if defined(__FreeBSD__) /* * Flush the blocks of a file to disk. * * This function is worthless for vnodes that represent directories. Maybe we * could just do a sync if they try an fsync on a directory file. 
*/ static int ntfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { return (0); } #endif /* * Return POSIX pathconf information applicable to NTFS filesystem */ int ntfs_pathconf(v) void *v; { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = NTFS_MAXFILENAME; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); #if defined(__NetBSD__) case _PC_SYNC_IO: *ap->a_retval = 1; return (0); case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); #endif default: return (EINVAL); } /* NOTREACHED */ } /* * Global vfs data structures */ vop_t **ntfs_vnodeop_p; #if defined(__FreeBSD__) static struct vnodeopv_entry_desc ntfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *)vop_defaultop }, { &vop_getattr_desc, (vop_t *)ntfs_getattr }, { &vop_inactive_desc, (vop_t *)ntfs_inactive }, { &vop_reclaim_desc, (vop_t *)ntfs_reclaim }, { &vop_print_desc, (vop_t *)ntfs_print }, { &vop_pathconf_desc, ntfs_pathconf }, { &vop_islocked_desc, (vop_t *)vop_stdislocked }, { &vop_unlock_desc, (vop_t *)vop_stdunlock }, { &vop_lock_desc, (vop_t *)vop_stdlock }, { &vop_cachedlookup_desc, (vop_t *)ntfs_lookup }, { &vop_lookup_desc, (vop_t *)vfs_cache_lookup }, { &vop_access_desc, (vop_t *)ntfs_access }, { &vop_close_desc, (vop_t *)ntfs_close }, { &vop_open_desc, (vop_t *)ntfs_open }, { &vop_readdir_desc, (vop_t *)ntfs_readdir }, { &vop_fsync_desc, (vop_t *)ntfs_fsync }, { &vop_bmap_desc, (vop_t *)ntfs_bmap }, { &vop_getpages_desc, (vop_t *) ntfs_getpages }, { &vop_putpages_desc, (vop_t *) ntfs_putpages }, { &vop_strategy_desc, (vop_t *)ntfs_strategy }, - { &vop_bwrite_desc, (vop_t *)vop_stdbwrite }, { &vop_read_desc, (vop_t *)ntfs_read }, { 
&vop_write_desc, (vop_t *)ntfs_write }, { NULL, NULL } }; static struct vnodeopv_desc ntfs_vnodeop_opv_desc = { &ntfs_vnodeop_p, ntfs_vnodeop_entries }; VNODEOP_SET(ntfs_vnodeop_opv_desc); #else /* !FreeBSD */ struct vnodeopv_entry_desc ntfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) ntfs_bypass }, { &vop_lookup_desc, (vop_t *) ntfs_lookup }, /* lookup */ { &vop_create_desc, genfs_eopnotsupp }, /* create */ { &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */ { &vop_open_desc, (vop_t *) ntfs_open }, /* open */ { &vop_close_desc,(vop_t *) ntfs_close }, /* close */ { &vop_access_desc, (vop_t *) ntfs_access }, /* access */ { &vop_getattr_desc, (vop_t *) ntfs_getattr }, /* getattr */ { &vop_setattr_desc, genfs_eopnotsupp }, /* setattr */ { &vop_read_desc, (vop_t *) ntfs_read }, /* read */ { &vop_write_desc, (vop_t *) ntfs_write }, /* write */ { &vop_lease_desc, genfs_lease_check }, /* lease */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */ { &vop_poll_desc, genfs_poll }, /* poll */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_fsync_desc, genfs_fsync }, /* fsync */ { &vop_seek_desc, genfs_seek }, /* seek */ { &vop_remove_desc, genfs_eopnotsupp }, /* remove */ { &vop_link_desc, genfs_eopnotsupp }, /* link */ { &vop_rename_desc, genfs_eopnotsupp }, /* rename */ { &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */ { &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */ { &vop_symlink_desc, genfs_eopnotsupp }, /* symlink */ { &vop_readdir_desc, (vop_t *) ntfs_readdir }, /* readdir */ { &vop_readlink_desc, genfs_eopnotsupp }, /* readlink */ { &vop_abortop_desc, genfs_abortop }, /* abortop */ { &vop_inactive_desc, (vop_t *) ntfs_inactive }, /* inactive */ { &vop_reclaim_desc, (vop_t *) ntfs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, (vop_t *) ntfs_bmap }, /* bmap */ { &vop_strategy_desc, (vop_t *) ntfs_strategy }, /* strategy 
*/ { &vop_print_desc, (vop_t *) ntfs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, ntfs_pathconf }, /* pathconf */ { &vop_advlock_desc, genfs_nullop }, /* advlock */ { &vop_blkatoff_desc, genfs_eopnotsupp }, /* blkatoff */ { &vop_valloc_desc, genfs_eopnotsupp }, /* valloc */ { &vop_reallocblks_desc, genfs_eopnotsupp }, /* reallocblks */ { &vop_vfree_desc, genfs_eopnotsupp }, /* vfree */ { &vop_truncate_desc, genfs_eopnotsupp }, /* truncate */ { &vop_update_desc, genfs_eopnotsupp }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ { (struct vnodeop_desc *)NULL, (int (*) __P((void *)))NULL } }; struct vnodeopv_desc ntfs_vnodeop_opv_desc = { &ntfs_vnodeop_p, ntfs_vnodeop_entries }; #endif Index: head/sys/sys/buf.h =================================================================== --- head/sys/sys/buf.h (revision 75579) +++ head/sys/sys/buf.h (revision 75580) @@ -1,548 +1,561 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include struct bio; struct buf; struct mount; struct vnode; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. 
*/ extern struct bio_ops { void (*io_start) __P((struct buf *)); void (*io_complete) __P((struct buf *)); void (*io_deallocate) __P((struct buf *)); void (*io_movedeps) __P((struct buf *, struct buf *)); int (*io_countdeps) __P((struct buf *, int)); } bioops; +struct buf_ops { + char *bop_name; + int (*bop_write) __P((struct buf *)); +}; + +extern struct buf_ops buf_ops_bio; + /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. */ struct buf { /* XXX: b_io must be the first element of struct buf for now /phk */ struct bio b_io; /* "Builtin" I/O request. */ #define b_bcount b_io.bio_bcount #define b_blkno b_io.bio_blkno #define b_caller1 b_io.bio_caller1 #define b_data b_io.bio_data #define b_dev b_io.bio_dev #define b_driver1 b_io.bio_driver1 #define b_driver2 b_io.bio_driver2 #define b_error b_io.bio_error #define b_iocmd b_io.bio_cmd #define b_ioflags b_io.bio_flags #define b_pblkno b_io.bio_pblkno #define b_resid b_io.bio_resid + struct buf_ops *b_op; + unsigned b_magic; +#define B_MAGIC_BIO 0x10b10b10 +#define B_MAGIC_NFS 0x67238234 void (*b_iodone) __P((struct buf *)); off_t b_offset; /* Offset into file. */ LIST_ENTRY(buf) b_hash; /* Hash chain. */ TAILQ_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. 
*new* */ long b_flags; /* B_* flags. */ unsigned short b_qindex; /* buffer queue index */ unsigned char b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ long b_runningbufspace; /* when I/O is running, pipelining */ caddr_t b_kvabase; /* base kva for buffer */ int b_kvasize; /* size of kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ void *b_saveaddr; /* Original b_addr for physio. */ union pager_info { void *pg_spc; int pg_reqpage; } b_pager; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* List of filesystem dependencies. */ }; #define b_spc b_pager.pg_spc /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear BIO_ERROR and B_INVAL, * set BIO_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. 
* * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_CLUSTEROK This flag is typically set for B_DELWRI buffers * by filesystems that allow clustering when the buffer * is fully dirty and indicates that it may be clustered * with other adjacent dirty buffers. Note the clustering * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_UNUSED0 0x00000008 /* Old B_BAD */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_00000800 0x00000800 /* Available flag. */ #define B_SCANNED 0x00001000 /* VOP_FSYNC funcs mark written bufs */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_DIRTY 0x00200000 /* Needs writing later. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. 
*/ #define B_WANT 0x00800000 /* Used by vm_pager.c */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_08000000 0x08000000 /* Available flag. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_80000000 0x80000000 /* Available flag. */ #define PRINT_BUF_FLAGS "\20\40autochain\37cluster\36vmio\35ram\34ordered" \ "\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \ "\25read\24raw\23phys\22clusterok\21malloc\20nocache" \ "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \ "\10delwri\7call\6cache\4bad\3async\2needcommit\1age" /* * These flags are kept in b_xflags. */ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ #define BX_BKGRDWRITE 0x00000004 /* Do writes in background */ #define BX_BKGRDINPROG 0x00000008 /* Background write in progress */ #define BX_BKGRDWAIT 0x00000010 /* Background write waiting */ #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ #ifdef _KERNEL /* * Buffer locking */ extern struct mtx buftimelock; /* Interlock on setting prio and timo */ extern char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curproc */ #include /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. 
*/ static __inline int BUF_LOCK __P((struct buf *, int)); static __inline int BUF_LOCK(struct buf *bp, int locktype) { int s, ret; s = splbio(); mtx_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = buf_wmesg; bp->b_lock.lk_prio = PRIBIO + 4; bp->b_lock.lk_timo = 0; ret = lockmgr(&(bp)->b_lock, locktype, &buftimelock, curproc); splx(s); return ret; } /* * Get a lock sleeping with specified interruptably and timeout. */ static __inline int BUF_TIMELOCK __P((struct buf *, int, char *, int, int)); static __inline int BUF_TIMELOCK(struct buf *bp, int locktype, char *wmesg, int catch, int timo) { int s, ret; s = splbio(); mtx_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = wmesg; bp->b_lock.lk_prio = (PRIBIO + 4) | catch; bp->b_lock.lk_timo = timo; ret = lockmgr(&(bp)->b_lock, (locktype), &buftimelock, curproc); splx(s); return ret; } /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ static __inline void BUF_UNLOCK __P((struct buf *)); static __inline void BUF_UNLOCK(struct buf *bp) { int s; s = splbio(); lockmgr(&(bp)->b_lock, LK_RELEASE, NULL, curproc); splx(s); } /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ do { \ if (BUF_REFCNT(bp) > 0) \ panic("free locked buf"); \ lockdestroy(&(bp)->b_lock); \ } while (0) #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ static __inline void BUF_KERNPROC __P((struct buf *)); static __inline void BUF_KERNPROC(struct buf *bp) { struct proc *p = curproc; if (p != PCPU_GET(idleproc) && bp->b_lock.lk_lockholder == p->p_pid) p->p_locks--; bp->b_lock.lk_lockholder = LK_KERNPROC; } #endif /* * Find out the number of references to a lock. 
*/ static __inline int BUF_REFCNT __P((struct buf *)); static __inline int BUF_REFCNT(struct buf *bp) { int s, ret; s = splbio(); ret = lockcount(&(bp)->b_lock); splx(s); return ret; } #endif /* _KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. It is stored in the b_saveaddr * field of the buffer on which I/O is done. At I/O completion, cluster * callback uses the structure to parcel I/O's to individual buffers, and * then free's this structure. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ void *bs_saveaddr; /* Saved b_addr. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef _KERNEL static __inline void bufq_init __P((struct buf_queue_head *head)); static __inline void bufq_insert_tail __P((struct buf_queue_head *head, struct buf *bp)); static __inline void bufq_remove __P((struct buf_queue_head *head, struct buf *bp)); static __inline struct buf *bufq_first __P((struct buf_queue_head *head)); static __inline void bufq_init(struct buf_queue_head *head) { TAILQ_INIT(&head->queue); head->last_pblkno = 0; head->insert_point = NULL; head->switch_point = NULL; } static __inline void bufq_insert_tail(struct buf_queue_head *head, struct buf *bp) { if ((bp->b_ioflags & BIO_ORDERED) != 0) { head->insert_point = bp; head->switch_point = NULL; } TAILQ_INSERT_TAIL(&head->queue, bp, b_act); } static __inline void bufq_remove(struct buf_queue_head *head, struct buf *bp) { if (bp == head->switch_point) head->switch_point = TAILQ_NEXT(bp, b_act); if (bp == head->insert_point) { head->insert_point = TAILQ_PREV(bp, buf_queue, b_act); if (head->insert_point == NULL) head->last_pblkno = 0; } else if (bp == TAILQ_FIRST(&head->queue)) head->last_pblkno = bp->b_pblkno; TAILQ_REMOVE(&head->queue, bp, b_act); if 
(TAILQ_FIRST(&head->queue) == head->switch_point) head->switch_point = NULL; } static __inline struct buf * bufq_first(struct buf_queue_head *head) { return (TAILQ_FIRST(&head->queue)); } -#define BUF_WRITE(bp) VOP_BWRITE((bp)->b_vp, (bp)) +#define BUF_WRITE(bp) \ + (bp)->b_op->bop_write(bp) + #define BUF_STRATEGY(bp) VOP_STRATEGY((bp)->b_vp, (bp)) static __inline void buf_start(struct buf *bp) { if (bioops.io_start) (*bioops.io_start)(bp); } static __inline void buf_complete(struct buf *bp) { if (bioops.io_complete) (*bioops.io_complete)(bp); } static __inline void buf_deallocate(struct buf *bp) { if (bioops.io_deallocate) (*bioops.io_deallocate)(bp); BUF_LOCKFREE(bp); } static __inline void buf_movedeps(struct buf *bp, struct buf *bp2) { if (bioops.io_movedeps) (*bioops.io_movedeps)(bp, bp2); } static __inline int buf_countdeps(struct buf *bp, int i) { if (bioops.io_countdeps) return ((*bioops.io_countdeps)(bp, i)); else return (0); } #endif /* _KERNEL */ /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 6 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ #define QUEUE_CLEAN 2 /* non-B_DELWRI buffers */ #define QUEUE_DIRTY 3 /* B_DELWRI buffers */ #define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ #define QUEUE_EMPTY 5 /* empty buffer headers */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #define B_METAONLY 0x04 /* Return indirect block buffer. */ #define B_NOWAIT 0x08 /* do not sleep to await lock */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ extern int runningbufspace; extern int buf_maxio; /* nominal maximum I/O for buffer */ extern struct buf *buf; /* The buffer headers. 
*/ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern TAILQ_HEAD(swqueue, buf) bswlist; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; struct uio; caddr_t bufhashinit __P((caddr_t)); void bufinit __P((void)); void bwillwrite __P((void)); int buf_dirty_count_severe __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void bdirty __P((struct buf *)); void bundirty __P((struct buf *)); int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); struct buf * getpbuf __P((int *)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); int bufwait __P((struct buf *)); void bufdone __P((struct buf *)); void bufdonebio __P((struct bio *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **)); int cluster_wbuild __P((struct vnode *, long, daddr_t, int)); void cluster_write __P((struct buf *, u_quad_t, int)); void vfs_bio_set_validclean __P((struct buf *, int base, int size)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages __P((struct buf *)); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); void relpbuf __P((struct buf 
*, int *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct vnode *, struct buf *)); void pbrelvp __P((struct buf *)); int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); void pbreassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((int *)); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */ Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 75579) +++ head/sys/sys/vnode.h (revision 75580) @@ -1,660 +1,658 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; /* * Vnode tag types. * These are for the benefit of external programs only (e.g., pstat) * and should NEVER be inspected by the kernel. */ enum vtagtype { VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_PC, VT_LFS, VT_LOFS, VT_FDESC, VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS, VT_AFS, VT_ISOFS, VT_UNION, VT_MSDOSFS, VT_DEVFS, VT_TFS, VT_VFS, VT_CODA, VT_NTFS, VT_HPFS, VT_NWFS, VT_PSEUDOFS, VT_SMBFS }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ TAILQ_HEAD(buflists, buf); typedef int vop_t __P((void *)); struct namecache; /* * Reading or writing any of these items requires holding the appropriate lock. * v_freelist is locked by the global vnode_free_list mutex. * v_mntvnodes is locked by the global mntvnodes mutex. * v_flag, v_usecount, v_holdcount and v_writecount are * locked by the v_interlock mutex. * v_pollinfo is locked by the lock contained inside it. 
*/ struct vnode { u_long v_flag; /* vnode flags (see below) */ int v_usecount; /* reference count of users */ int v_writecount; /* reference count of writers */ int v_holdcnt; /* page & buffer references */ u_long v_id; /* capability identifier */ struct mount *v_mount; /* ptr to vfs we are in */ vop_t **v_op; /* vnode operations vector */ TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */ LIST_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */ struct buflists v_cleanblkhd; /* clean blocklist head */ struct buflists v_dirtyblkhd; /* dirty blocklist head */ LIST_ENTRY(vnode) v_synclist; /* vnodes with dirty buffers */ long v_numoutput; /* num of writes in progress */ enum vtype v_type; /* vnode type */ union { struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */ struct socket *vu_socket; /* unix ipc (VSOCK) */ struct { struct specinfo *vu_specinfo; /* device (VCHR, VBLK) */ SLIST_ENTRY(vnode) vu_specnext; } vu_spec; struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ } v_un; struct nqlease *v_lease; /* Soft reference to lease */ daddr_t v_lastw; /* last write (write cluster) */ daddr_t v_cstart; /* start block of cluster */ daddr_t v_lasta; /* last allocation */ int v_clen; /* length of current cluster */ struct vm_object *v_object; /* Place to store VM object */ struct mtx v_interlock; /* lock on usecount and flag */ struct lock v_lock; /* used if fs don't have one */ struct lock *v_vnlock; /* pointer to vnode lock */ enum vtagtype v_tag; /* type of underlying data */ void *v_data; /* private data for fs */ LIST_HEAD(, namecache) v_cache_src; /* Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* Cache entries to us */ struct vnode *v_dd; /* .. vnode */ u_long v_ddid; /* .. 
capability identifier */ struct { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ } v_pollinfo; struct proc *v_vxproc; /* proc owning VXLOCK */ #ifdef DEBUG_LOCKS const char *filename; /* Source file doing locking */ int line; /* Line number doing locking */ #endif }; #define v_mountedhere v_un.vu_mountedhere #define v_socket v_un.vu_socket #define v_rdev v_un.vu_spec.vu_specinfo #define v_specnext v_un.vu_spec.vu_specnext #define v_fifoinfo v_un.vu_fifoinfo #define VN_POLLEVENT(vp, events) \ do { \ if ((vp)->v_pollinfo.vpi_events & (events)) \ vn_pollevent((vp), (events)); \ } while (0) /* * Vnode flags. */ #define VROOT 0x00001 /* root of its file system */ #define VTEXT 0x00002 /* vnode is a pure text prototype */ #define VSYSTEM 0x00004 /* vnode being used by kernel */ #define VISTTY 0x00008 /* vnode represents a tty */ #define VXLOCK 0x00100 /* vnode is locked to change underlying type */ #define VXWANT 0x00200 /* process is waiting for vnode */ #define VBWAIT 0x00400 /* waiting for output to complete */ /* open for business 0x00800 */ /* open for business 0x01000 */ #define VOBJBUF 0x02000 /* Allocate buffers in VM object */ #define VCOPYONWRITE 0x04000 /* vnode is doing copy-on-write */ #define VAGE 0x08000 /* Insert vnode at head of free list */ #define VOLOCK 0x10000 /* vnode is locked waiting for an object */ #define VOWANT 0x20000 /* a process is waiting for VOLOCK */ #define VDOOMED 0x40000 /* This vnode is being recycled */ #define VFREE 0x80000 /* This vnode is on the freelist */ /* open for business 0x100000 */ #define VONWORKLST 0x200000 /* On syncer work-list */ #define VMOUNT 0x400000 /* Mount in progress */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). 
*/ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ udev_t va_fsid; /* file system id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ udev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) */ #define IO_UNIT 0x01 /* do I/O as atomic unit */ #define IO_APPEND 0x02 /* append write to end */ #define IO_SYNC 0x04 /* do I/O synchronously */ #define IO_NODELOCKED 0x08 /* underlying node already locked */ #define IO_NDELAY 0x10 /* FNDELAY flag set in file table */ #define IO_VMIO 0x20 /* data already in VMIO space */ #define IO_INVAL 0x40 /* invalidate after I/O */ #define IO_ASYNC 0x80 /* bawrite rather then bdwrite */ /* * Modes. Some values same as Ixxx entries from inode.h for now. 
*/ #define VADMIN 010000 /* permission to administer vnode */ #define VSUID 004000 /* set user id on execution */ #define VSGID 002000 /* set group id on execution */ #define VSVTX 001000 /* save swapped text even after use */ #define VREAD 000400 /* read, write, execute permissions */ #define VWRITE 000200 #define VEXEC 000100 /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) #define VNODEOP_SET(f) \ C_SYSINIT(f##init, SI_SUB_VFS, SI_ORDER_SECOND, vfs_add_vnodeops, &f); \ C_SYSUNINIT(f##uninit, SI_SUB_VFS, SI_ORDER_SECOND, vfs_rm_vnodeops, &f); /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. 
"/") vnode */ extern int desiredvnodes; /* number of vnodes desired */ extern time_t syncdelay; /* max time to delay syncing data */ extern time_t filedelay; /* time to delay syncing files */ extern time_t dirdelay; /* time to delay syncing directories */ extern time_t metadelay; /* time to delay syncing metadata */ extern struct vm_zone *namei_zone; extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ extern int vfs_ioopt; /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ extern void (*lease_updatetime) __P((int deltat)); #define VSHOULDFREE(vp) \ (!((vp)->v_flag & (VFREE|VDOOMED)) && \ !(vp)->v_holdcnt && !(vp)->v_usecount && \ (!(vp)->v_object || \ !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count))) #define VSHOULDBUSY(vp) \ (((vp)->v_flag & VFREE) && \ ((vp)->v_holdcnt || (vp)->v_usecount)) #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. 
*/ struct vnodeop_desc { int vdesc_offset; /* offset in vector--first for speed */ char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_proc_offset; /* proc location, if any */ int vdesc_componentname_offset; /* if any */ /* * Finally, we've got a list of private data (about each operation) * for each transport layer. (Support to manage this list is not * yet part of BSD.) */ caddr_t *vdesc_transports; }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; /* * Interlock for scanning list of vnodes attached to a mountpoint */ extern struct mtx mntvnode_mtx; /* * This macro is very helpful in defining those offsets in the vdesc struct. * * This is stolen from X11R4. I ignored all the fancy stuff for * Crays, so if you decide to port this to such a serious machine, * you might want to consult Intrinsic.h's XtOffset{,Of,To}. */ #define VOPARG_OFFSET(p_type,field) \ ((int) (((char *) (&(((p_type)NULL)->field))) - ((char *) NULL))) #define VOPARG_OFFSETOF(s_type,field) \ VOPARG_OFFSET(s_type*,field) #define VOPARG_OFFSETTO(S_TYPE,S_OFFSET,STRUCT_P) \ ((S_TYPE)(((char*)(STRUCT_P))+(S_OFFSET))) /* * This structure is used to configure the new vnodeops vector. 
*/ struct vnodeopv_entry_desc { struct vnodeop_desc *opve_op; /* which operation this is */ vop_t *opve_impl; /* code implementing this operation */ }; struct vnodeopv_desc { /* ptr to the ptr to the vector where op should go */ vop_t ***opv_desc_vector_p; struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ }; /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; #ifdef DEBUG_VFS_LOCKS /* * Macros to aid in tracing VFS locking problems. Not totally * reliable since if the process sleeps between changing the lock * state and checking it with the assert, some other process could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. I find that 'cvs co src' * is a pretty good test. */ /* * [dfr] Kludge until I get around to fixing all the vfs locking. */ #define IS_LOCKING_VFS(vp) ((vp)->v_tag == VT_UFS \ || (vp)->v_tag == VT_MFS \ || (vp)->v_tag == VT_NFS \ || (vp)->v_tag == VT_LFS \ || (vp)->v_tag == VT_ISOFS \ || (vp)->v_tag == VT_MSDOSFS \ || (vp)->v_tag == VT_DEVFS) #define ASSERT_VOP_LOCKED(vp, str) \ do { \ struct vnode *_vp = (vp); \ \ if (_vp && IS_LOCKING_VFS(_vp) && !VOP_ISLOCKED(_vp, NULL)) \ panic("%s: %p is not locked but should be", str, _vp); \ } while (0) #define ASSERT_VOP_UNLOCKED(vp, str) \ do { \ struct vnode *_vp = (vp); \ int lockstate; \ \ if (_vp && IS_LOCKING_VFS(_vp)) { \ lockstate = VOP_ISLOCKED(_vp, curproc); \ if (lockstate == LK_EXCLUSIVE) \ panic("%s: %p is locked but should not be", \ str, _vp); \ } \ } while (0) #define ASSERT_VOP_ELOCKED(vp, str) \ do { \ struct vnode *_vp = (vp); \ \ if (_vp && IS_LOCKING_VFS(_vp) && \ VOP_ISLOCKED(_vp, curproc) != LK_EXCLUSIVE) \ panic("%s: %p is not exclusive locked but should be", \ str, _vp); \ } while (0) #define ASSERT_VOP_ELOCKED_OTHER(vp, str) \ do { \ struct vnode *_vp = (vp); \ 
\ if (_vp && IS_LOCKING_VFS(_vp) && \ VOP_ISLOCKED(_vp, curproc) != LK_EXCLOTHER) \ panic("%s: %p is not exclusive locked by another proc", \ str, _vp); \ } while (0) #define ASSERT_VOP_SLOCKED(vp, str) \ do { \ struct vnode *_vp = (vp); \ \ if (_vp && IS_LOCKING_VFS(_vp) && \ VOP_ISLOCKED(_vp, NULL) != LK_SHARED) \ panic("%s: %p is not locked shared but should be", \ str, _vp); \ } while (0) #else #define ASSERT_VOP_LOCKED(vp, str) #define ASSERT_VOP_UNLOCKED(vp, str) #endif /* * VOCALL calls an op given an ops vector. We break it out because BSD's * vclean changes the ops vector and then wants to call ops with the old * vector. */ #define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) /* * This call works for vnodes in the kernel. */ #define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) #define VDESC(OP) (& __CONCAT(OP,_desc)) #define VOFFSET(OP) (VDESC(OP)->vdesc_offset) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ #include "vnode_if.h" /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vnode; -struct vop_bwrite_args; extern int (*lease_check_hook) __P((struct vop_lease_args *)); struct vnode *addaliasu __P((struct vnode *vp, udev_t nvp_rdev)); int bdevvp __P((dev_t dev, struct vnode **vpp)); /* cache_* may belong in namei.h. 
*/ void cache_enter __P((struct vnode *dvp, struct vnode *vp, struct componentname *cnp)); int cache_lookup __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)); void cache_purge __P((struct vnode *vp)); void cache_purgevfs __P((struct mount *mp)); void cvtstat __P((struct stat *st, struct ostat *ost)); void cvtnstat __P((struct stat *sb, struct nstat *nsb)); int getnewvnode __P((enum vtagtype tag, struct mount *mp, vop_t **vops, struct vnode **vpp)); int lease_check __P((struct vop_lease_args *ap)); int spec_vnoperate __P((struct vop_generic_args *)); int speedup_syncer __P((void)); int textvp_fullpath __P((struct proc *p, char **retbuf, char **retfreebuf)); int vaccess __P((enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, mode_t acc_mode, struct ucred *cred, int *privused)); int vaccess_acl_posix1e __P((enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused)); void vattr_null __P((struct vattr *vap)); int vcount __P((struct vnode *vp)); void vdrop __P((struct vnode *)); int vfinddev __P((dev_t dev, enum vtype type, struct vnode **vpp)); void vfs_add_vnodeops __P((const void *)); void vfs_rm_vnodeops __P((const void *)); int vflush __P((struct mount *mp, struct vnode *skipvp, int flags)); int vget __P((struct vnode *vp, int lockflag, struct proc *p)); void vgone __P((struct vnode *vp)); void vgonel __P((struct vnode *vp, struct proc *p)); void vhold __P((struct vnode *)); int vinvalbuf __P((struct vnode *vp, int save, struct ucred *cred, struct proc *p, int slpflag, int slptimeo)); int vtruncbuf __P((struct vnode *vp, struct ucred *cred, struct proc *p, off_t length, int blksize)); void vprint __P((char *label, struct vnode *vp)); int vrecycle __P((struct vnode *vp, struct mtx *inter_lkp, struct proc *p)); int vn_close __P((struct vnode *vp, int flags, struct ucred *cred, struct proc *p)); void vn_finished_write __P((struct mount *mp)); int vn_isdisk __P((struct vnode *vp, int 
*errp)); int vn_lock __P((struct vnode *vp, int flags, struct proc *p)); #ifdef DEBUG_LOCKS int debug_vn_lock __P((struct vnode *vp, int flags, struct proc *p, const char *filename, int line)); #define vn_lock(vp,flags,p) debug_vn_lock(vp,flags,p,__FILE__,__LINE__) #endif int vn_open __P((struct nameidata *ndp, int *flagp, int cmode)); void vn_pollevent __P((struct vnode *vp, int events)); void vn_pollgone __P((struct vnode *vp)); int vn_pollrecord __P((struct vnode *vp, struct proc *p, int events)); int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *cred, int *aresid, struct proc *p)); int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); int vn_start_write __P((struct vnode *vp, struct mount **mpp, int flags)); dev_t vn_todev __P((struct vnode *vp)); int vn_write_suspend_wait __P((struct vnode *vp, struct mount *mp, int flags)); int vn_writechk __P((struct vnode *vp)); int vn_extattr_get __P((struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct proc *p)); int vn_extattr_set __P((struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct proc *p)); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct proc *p); int vfs_cache_lookup __P((struct vop_lookup_args *ap)); int vfs_object_create __P((struct vnode *vp, struct proc *p, struct ucred *cred)); void vfs_timestamp __P((struct timespec *)); void vfs_write_resume __P((struct mount *mp)); void vfs_write_suspend __P((struct mount *mp)); -int vop_stdbwrite __P((struct vop_bwrite_args *ap)); int vop_stdgetwritemount __P((struct vop_getwritemount_args *)); int vop_stdinactive __P((struct vop_inactive_args *)); int vop_stdislocked __P((struct vop_islocked_args *)); int vop_stdlock __P((struct vop_lock_args *)); int vop_stdunlock __P((struct vop_unlock_args *)); int vop_noislocked __P((struct 
vop_islocked_args *)); int vop_nolock __P((struct vop_lock_args *)); int vop_nopoll __P((struct vop_poll_args *)); int vop_nounlock __P((struct vop_unlock_args *)); int vop_stdpathconf __P((struct vop_pathconf_args *)); int vop_stdpoll __P((struct vop_poll_args *)); int vop_revoke __P((struct vop_revoke_args *)); int vop_sharedlock __P((struct vop_lock_args *)); int vop_eopnotsupp __P((struct vop_generic_args *ap)); int vop_ebadf __P((struct vop_generic_args *ap)); int vop_einval __P((struct vop_generic_args *ap)); int vop_enotty __P((struct vop_generic_args *ap)); int vop_defaultop __P((struct vop_generic_args *ap)); int vop_null __P((struct vop_generic_args *ap)); int vop_panic __P((struct vop_generic_args *ap)); int vop_stdcreatevobject __P((struct vop_createvobject_args *ap)); int vop_stddestroyvobject __P((struct vop_destroyvobject_args *ap)); int vop_stdgetvobject __P((struct vop_getvobject_args *ap)); void vfree __P((struct vnode *)); void vput __P((struct vnode *vp)); void vrele __P((struct vnode *vp)); void vref __P((struct vnode *vp)); void vbusy __P((struct vnode *vp)); extern vop_t **default_vnodeop_p; extern vop_t **spec_vnodeop_p; extern vop_t **dead_vnodeop_p; #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/mfs/mfs_vnops.c =================================================================== --- head/sys/ufs/mfs/mfs_vnops.c (revision 75579) +++ head/sys/ufs/mfs/mfs_vnops.c (revision 75580) @@ -1,441 +1,440 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*
 *	@(#)mfs_vnops.c	8.11 (Berkeley) 5/22/95
 * $FreeBSD$
 */

#include "opt_ufs.h"

/*
 * NOTE(review): the header names of the #include lines below are missing
 * from this copy of the patch; restore them from the repository.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Forward declarations of the mfs vnode operations implemented in this file. */
static int	mfs_badop __P((struct vop_generic_args *));
static int	mfs_bmap __P((struct vop_bmap_args *));
static int	mfs_close __P((struct vop_close_args *));
static int	mfs_fsync __P((struct vop_fsync_args *));
static int	mfs_freeblks __P((struct vop_freeblks_args *));
static int	mfs_inactive __P((struct vop_inactive_args *)); /* XXX */
static int	mfs_open __P((struct vop_open_args *));
static int	mfs_reclaim __P((struct vop_reclaim_args *)); /* XXX */
static int	mfs_print __P((struct vop_print_args *)); /* XXX */
static int	mfs_strategy __P((struct vop_strategy_args *)); /* XXX */
static int	mfs_getpages __P((struct vop_getpages_args *)); /* XXX */

/*
 * mfs vnode operations.
 *
 * The table maps each vnode operation descriptor to its mfs handler.
 * Operations without an explicit entry go to mfs_badop through the
 * vop_default_desc slot.
 *
 * NOTE(review): the entry prefixed with '-' is a patch removal marker
 * (the vop_bwrite entry is dropped by this revision), not C code.
 */
vop_t **mfs_vnodeop_p;
static struct vnodeopv_entry_desc mfs_vnodeop_entries[] = {
	{ &vop_default_desc,		(vop_t *) mfs_badop },
	{ &vop_bmap_desc,		(vop_t *) mfs_bmap },
-	{ &vop_bwrite_desc,		(vop_t *) vop_defaultop },
	{ &vop_close_desc,		(vop_t *) mfs_close },
	{ &vop_createvobject_desc,	(vop_t *) vop_stdcreatevobject },
	{ &vop_destroyvobject_desc,	(vop_t *) vop_stddestroyvobject },
	{ &vop_freeblks_desc,		(vop_t *) mfs_freeblks },
	{ &vop_fsync_desc,		(vop_t *) mfs_fsync },
#ifdef UFS_EXTATTR
	{ &vop_getextattr_desc,		(vop_t *) ufs_vop_getextattr },
#endif
	{ &vop_getpages_desc,		(vop_t *) mfs_getpages },
	{ &vop_getvobject_desc,		(vop_t *) vop_stdgetvobject },
	{ &vop_inactive_desc,		(vop_t *) mfs_inactive },
	{ &vop_ioctl_desc,		(vop_t *) vop_enotty },
	{ &vop_islocked_desc,		(vop_t *) vop_defaultop },
	{ &vop_lock_desc,		(vop_t *) vop_defaultop },
	{ &vop_open_desc,		(vop_t *) mfs_open },
	{ &vop_print_desc,		(vop_t *) mfs_print },
	{ &vop_reclaim_desc,		(vop_t *) mfs_reclaim },
#ifdef UFS_EXTATTR
	{ &vop_setextattr_desc,		(vop_t *) ufs_vop_setextattr },
#endif
	{ &vop_strategy_desc,		(vop_t *) mfs_strategy },
	{ &vop_unlock_desc,		(vop_t *) vop_defaultop },
	{ &vop_getwritemount_desc,	(vop_t *) vop_stdgetwritemount },
	{ NULL, NULL }
};
static struct vnodeopv_desc mfs_vnodeop_opv_desc =
	{ &mfs_vnodeop_p, mfs_vnodeop_entries };

VNODEOP_SET(mfs_vnodeop_opv_desc);

/*
 * Vnode Operations.
 *
 * Open called to allow memory filesystem to initialize and
 * validate before actual IO. Record our process identifier
 * so we can tell when we are doing I/O to ourself.
 *
 * As written, the routine only checks the vnode type: it panics when
 * the vnode is not a character device (VCHR) and otherwise returns 0.
 */
/* ARGSUSED */
static int
mfs_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int  a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{

	if (ap->a_vp->v_type != VCHR)
		panic("mfs_open not VCHR");
	return (0);
}

/*
 * mfs_fsync() - hand the request unchanged to the vop_fsync entry of
 * spec_vnodeop_p and return its result.
 */
static int
mfs_fsync(ap)
	struct vop_fsync_args *ap;
{

	return (VOCALL(spec_vnodeop_p, VOFFSET(vop_fsync), ap));
}

/*
 * mfs_freeblks() - hook to allow us to free physical memory.
 *
 *	We implement the BIO_DELETE strategy.  We can't just madvise()
 *	here because we have to do it in the correct order vs other bio
 *	requests, so we queue it.
 *
 *	Note: geteblk() sets B_INVAL.  We leave it set to guarantee buffer
 *	throw-away on brelse()? XXX
 *
 *	Panics if vfinddev() fails for the vnode's v_rdev or if the vnode
 *	it returns has a zero use count.  Always returns 0.
 */
static int
mfs_freeblks(ap)
	struct vop_freeblks_args /* {
		struct vnode *a_vp;
		daddr_t a_addr;
		daddr_t a_length;
	} */ *ap;
{
	struct buf *bp;
	struct vnode *vp;

	if (!vfinddev(ap->a_vp->v_rdev, VCHR, &vp) || vp->v_usecount == 0)
		panic("mfs_freeblks: bad dev");

	/* Build an asynchronous BIO_DELETE request covering the range. */
	bp = geteblk(ap->a_length);
	bp->b_flags |= B_ASYNC;
	bp->b_iocmd = BIO_DELETE;
	bp->b_dev = ap->a_vp->v_rdev;
	bp->b_blkno = ap->a_addr;
	bp->b_offset = dbtob(ap->a_addr);
	bp->b_bcount = ap->a_length;
	BUF_KERNPROC(bp);
	/* Issue it through the strategy routine of the vnode found above. */
	VOP_STRATEGY(vp, bp);
	return(0);
}

/*
 * Pass I/O requests to the memory filesystem process.
*/ static int mfs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct bio *a_bp; } */ *ap; { register struct buf *bp = (struct buf *)ap->a_bp; register struct mfsnode *mfsp; struct vnode *vp; struct proc *p = curproc; /* XXX */ int s; if (!vfinddev(bp->b_dev, VCHR, &vp) || vp->v_usecount == 0) panic("mfs_strategy: bad dev"); mfsp = VTOMFS(vp); /* * splbio required for queueing/dequeueing, in case of forwarded * BPs from bio interrupts (?). It may not be necessary. */ s = splbio(); if (mfsp->mfs_pid == 0) { /* * mini-root. Note: BIO_DELETE not supported at the moment, * I'm not sure what kind of dataspace b_data is in. */ caddr_t base; base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); if (bp->b_iocmd == BIO_DELETE) ; if (bp->b_iocmd == BIO_READ) bcopy(base, bp->b_data, bp->b_bcount); else bcopy(bp->b_data, base, bp->b_bcount); bufdone(bp); } else if (mfsp->mfs_pid == p->p_pid) { /* * VOP to self */ splx(s); mfs_doio(bp, mfsp); s = splbio(); } else { /* * VOP from some other process, queue to MFS process and * wake it up. */ bufq_insert_tail(&mfsp->buf_queue, bp); wakeup((caddr_t)vp); } splx(s); return (0); } /* * Memory file system I/O. * * Trivial on the HP since buffer has already been mapping into KVA space. * * Read and Write are handled with a simple copyin and copyout. * * We also partially support VOP_FREEBLKS() via BIO_DELETE. We can't implement * completely -- for example, on fragments or inode metadata, but we can * implement it for page-aligned requests. */ void mfs_doio(bp, mfsp) register struct buf *bp; struct mfsnode *mfsp; { caddr_t base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); if (bp->b_iocmd == BIO_DELETE) { /* * Implement BIO_DELETE, which allows the filesystem to tell * a block device when blocks are no longer needed (like when * a file is deleted). We use the hook to MADV_FREE the VM. * This makes an MFS filesystem work as well or better then * a sun-style swap-mounted filesystem. 
*/ int bytes = bp->b_bcount; if ((vm_offset_t)base & PAGE_MASK) { int n = PAGE_SIZE - ((vm_offset_t)base & PAGE_MASK); bytes -= n; base += n; } if (bytes > 0) { struct madvise_args uap; bytes &= ~PAGE_MASK; if (bytes != 0) { bzero(&uap, sizeof(uap)); uap.addr = base; uap.len = bytes; uap.behav = MADV_FREE; madvise(curproc, &uap); } } bp->b_error = 0; } else if (bp->b_iocmd == BIO_READ) { /* * Read data from our 'memory' disk */ bp->b_error = copyin(base, bp->b_data, bp->b_bcount); } else { /* * Write data to our 'memory' disk */ bp->b_error = copyout(bp->b_data, base, bp->b_bcount); } if (bp->b_error) bp->b_ioflags |= BIO_ERROR; bufdone(bp); } /* * This is a noop, simply returning what one has been given. */ static int mfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; ufs_daddr_t a_bn; struct vnode **a_vpp; ufs_daddr_t *a_bnp; int *a_runp; } */ *ap; { if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; return (0); } /* * Memory filesystem close routine */ /* ARGSUSED */ static int mfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct mfsnode *mfsp = VTOMFS(vp); register struct buf *bp; int error; /* * Finish any pending I/O requests. */ while ((bp = bufq_first(&mfsp->buf_queue)) != NULL) { bufq_remove(&mfsp->buf_queue, bp); mfs_doio(bp, mfsp); wakeup((caddr_t)bp); } /* * On last close of a memory filesystem * we must invalidate any in core blocks, so that * we can, free up its vnode. */ if ((error = vinvalbuf(vp, 1, ap->a_cred, ap->a_p, 0, 0)) != 0) return (error); /* * There should be no way to have any more uses of this * vnode, so if we find any other uses, it is a panic. 
*/ if (vp->v_usecount > 1) printf("mfs_close: ref count %d > 1\n", vp->v_usecount); if (vp->v_usecount > 1 || (bufq_first(&mfsp->buf_queue) != NULL)) panic("mfs_close"); /* * Send a request to the filesystem server to exit. */ mfsp->mfs_active = 0; wakeup((caddr_t)vp); return (0); } /* * Memory filesystem inactive routine */ /* ARGSUSED */ static int mfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct mfsnode *mfsp = VTOMFS(vp); if (bufq_first(&mfsp->buf_queue) != NULL) panic("mfs_inactive: not inactive (next buffer %p)", bufq_first(&mfsp->buf_queue)); VOP_UNLOCK(vp, 0, ap->a_p); return (0); } /* * Reclaim a memory filesystem devvp so that it can be reused. */ static int mfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; FREE(vp->v_data, M_MFSNODE); vp->v_data = NULL; return (0); } /* * Print out the contents of an mfsnode. */ static int mfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct mfsnode *mfsp = VTOMFS(ap->a_vp); printf("tag VT_MFS, pid %ld, base %p, size %ld\n", (long)mfsp->mfs_pid, (void *)mfsp->mfs_baseoff, mfsp->mfs_size); return (0); } /* * Block device bad operation */ static int mfs_badop(struct vop_generic_args *ap) { int i; printf("mfs_badop[%s]\n", ap->a_desc->vdesc_name); i = vop_defaultop(ap); printf("mfs_badop[%s] = %d\n", ap->a_desc->vdesc_name,i); return (i); } static int mfs_getpages(ap) struct vop_getpages_args *ap; { return (VOCALL(spec_vnodeop_p, VOFFSET(vop_getpages), ap)); } Index: head/sys/vm/vm_pager.c =================================================================== --- head/sys/vm/vm_pager.c (revision 75579) +++ head/sys/vm/vm_pager.c (revision 75580) @@ -1,485 +1,487 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Paging space routine stubs. Emulates a matchmaker-like interface * for builtin pagers. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_VMPGDATA, "VM pgdata", "XXX: VM pager private data"); extern struct pagerops defaultpagerops; extern struct pagerops swappagerops; extern struct pagerops vnodepagerops; extern struct pagerops devicepagerops; extern struct pagerops physpagerops; int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ static int dead_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static vm_object_t dead_pager_alloc __P((void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t)); static void dead_pager_putpages __P((vm_object_t, vm_page_t *, int, int, int *)); static boolean_t dead_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *)); static void dead_pager_dealloc __P((vm_object_t)); static int dead_pager_getpages(obj, ma, count, req) vm_object_t obj; vm_page_t *ma; int count; int req; { return VM_PAGER_FAIL; } static vm_object_t dead_pager_alloc(handle, size, prot, off) void *handle; vm_ooffset_t size; vm_prot_t prot; vm_ooffset_t off; { return NULL; } static void dead_pager_putpages(object, m, count, flags, rtvals) vm_object_t object; vm_page_t *m; int count; int flags; int *rtvals; { int i; for (i = 0; i < count; i++) { rtvals[i] = VM_PAGER_AGAIN; } } static int dead_pager_haspage(object, pindex, prev, next) vm_object_t object; vm_pindex_t pindex; int *prev; int *next; { if (prev) *prev = 0; if (next) *next = 0; return FALSE; } static void dead_pager_dealloc(object) vm_object_t object; { return; } static struct pagerops deadpagerops = { NULL, dead_pager_alloc, dead_pager_dealloc, dead_pager_getpages, dead_pager_putpages, dead_pager_haspage, NULL }; struct pagerops *pagertab[] = { &defaultpagerops, /* OBJT_DEFAULT */ &swappagerops, /* OBJT_SWAP */ &vnodepagerops, /* OBJT_VNODE */ &devicepagerops, /* OBJT_DEVICE */ &physpagerops, /* OBJT_PHYS */ &deadpagerops /* OBJT_DEAD */ }; int npagers = sizeof(pagertab) / 
sizeof(pagertab[0]); /* * Kernel address space for mapping pages. * Used by pagers where KVAs are needed for IO. * * XXX needs to be large enough to support the number of pending async * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size * (MAXPHYS == 64k) if you want to get the most efficiency. */ #define PAGER_MAP_SIZE (8 * 1024 * 1024) int pager_map_size = PAGER_MAP_SIZE; vm_map_t pager_map; static int bswneeded; static vm_offset_t swapbkva; /* swap buffers kva */ struct mtx pbuf_mtx; void vm_pager_init() { struct pagerops **pgops; /* * Initialize known pagers */ for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if (pgops && ((*pgops)->pgo_init != NULL)) (*(*pgops)->pgo_init) (); } void vm_pager_bufferinit() { struct buf *bp; int i; mtx_init(&pbuf_mtx, "pbuf mutex", MTX_DEF); bp = swbuf; /* * Now set up swap and physical I/O buffer headers. */ for (i = 0; i < nswbuf; i++, bp++) { TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); BUF_LOCKINIT(bp); LIST_INIT(&bp->b_dep); bp->b_rcred = bp->b_wcred = NOCRED; bp->b_xflags = 0; } cluster_pbuf_freecnt = nswbuf / 2; swapbkva = kmem_alloc_pageable(pager_map, nswbuf * MAXPHYS); if (!swapbkva) panic("Not enough pager_map VM space for physical buffers"); } /* * Allocate an instance of a pager of the given type. * Size, protection and offset parameters are passed in for pagers that * need to perform page-level validation (e.g. the device pager). */ vm_object_t vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off) { struct pagerops *ops; ops = pagertab[type]; if (ops) return ((*ops->pgo_alloc) (handle, size, prot, off)); return (NULL); } void vm_pager_deallocate(object) vm_object_t object; { (*pagertab[object->type]->pgo_dealloc) (object); } /* * vm_pager_strategy: * * called with no specific spl * Execute strategy routine directly to pager. 
*/
/*
 * If the pager for object->type has no pgo_strategy hook, the bio is
 * completed with BIO_ERROR set and bio_error = ENXIO.
 */
void
vm_pager_strategy(vm_object_t object, struct bio *bp)
{
	if (pagertab[object->type]->pgo_strategy) {
		(*pagertab[object->type]->pgo_strategy)(object, bp);
	} else {
		bp->bio_flags |= BIO_ERROR;
		bp->bio_error = ENXIO;
		biodone(bp);
	}
}

/*
 * vm_pager_get_pages() - inline, see vm/vm_pager.h
 * vm_pager_put_pages() - inline, see vm/vm_pager.h
 * vm_pager_has_page() - inline, see vm/vm_pager.h
 * vm_pager_page_inserted() - inline, see vm/vm_pager.h
 * vm_pager_page_removed() - inline, see vm/vm_pager.h
 */

#if 0
/*
 *	vm_pager_sync:
 *
 *	Called by pageout daemon before going back to sleep.
 *	Gives pagers a chance to clean up any completed async paging
 *	operations.
 *
 *	NOTE(review): the loop tests the iterator `pgops`, which is never
 *	NULL, rather than the table entry `*pgops`; fix this if the code
 *	is ever re-enabled.
 */
void
vm_pager_sync()
{
	struct pagerops **pgops;

	for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++)
		if (pgops && ((*pgops)->pgo_sync != NULL))
			(*(*pgops)->pgo_sync) ();
}

#endif

/*
 * Allocate one page of KVA from pager_map, enter a kernel mapping of
 * page m at that address and return the address.
 */
vm_offset_t
vm_pager_map_page(m)
	vm_page_t m;
{
	vm_offset_t kva;

	kva = kmem_alloc_wait(pager_map, PAGE_SIZE);
	pmap_kenter(kva, VM_PAGE_TO_PHYS(m));
	return (kva);
}

/*
 * Undo vm_pager_map_page(): remove the kernel mapping at kva and give
 * the page of KVA back to pager_map.
 */
void
vm_pager_unmap_page(kva)
	vm_offset_t kva;
{
	pmap_kremove(kva);
	kmem_free_wakeup(pager_map, kva, PAGE_SIZE);
}

/*
 * Return the first object on pg_list whose handle equals `handle',
 * or NULL if there is none.
 */
vm_object_t
vm_pager_object_lookup(pg_list, handle)
	register struct pagerlst *pg_list;
	void *handle;
{
	register vm_object_t object;

	TAILQ_FOREACH(object, pg_list, pager_object_list)
		if (object->handle == handle)
			return (object);
	return (NULL);
}

/*
 * initialize a physical buffer
 *
 * Resets the header fields, points b_data/b_kvabase at this buffer's
 * MAXPHYS-sized slot inside swapbkva (slot index is bp - swbuf) and
 * takes the buffer lock exclusively.
 *
 * NOTE(review): the two lines prefixed with '+' are patch addition
 * markers from this revision, not C unary operators.
 */

static void
initpbuf(struct buf *bp)
{
	bp->b_rcred = NOCRED;
	bp->b_wcred = NOCRED;
	bp->b_qindex = QUEUE_NONE;
	bp->b_data = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva;
	bp->b_kvabase = bp->b_data;
	bp->b_kvasize = MAXPHYS;
	bp->b_xflags = 0;
	bp->b_flags = 0;
	bp->b_ioflags = 0;
	bp->b_iodone = NULL;
	bp->b_error = 0;
+	bp->b_magic = B_MAGIC_BIO;
+	bp->b_op = &buf_ops_bio;
	BUF_LOCK(bp, LK_EXCLUSIVE);
}

/*
 * allocate a physical buffer
 *
 *	There are a limited number (nswbuf) of physical buffers.
We need * to make sure that no single subsystem is able to hog all of them, * so each subsystem implements a counter which is typically initialized * to 1/2 nswbuf. getpbuf() decrements this counter in allocation and * increments it on release, and blocks if the counter hits zero. A * subsystem may initialize the counter to -1 to disable the feature, * but it must still be sure to match up all uses of getpbuf() with * relpbuf() using the same variable. * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. XXX */ struct buf * getpbuf(pfreecnt) int *pfreecnt; { int s; struct buf *bp; s = splvm(); mtx_lock(&pbuf_mtx); for (;;) { if (pfreecnt) { while (*pfreecnt == 0) { msleep(pfreecnt, &pbuf_mtx, PVM, "wswbuf0", 0); } } /* get a bp from the swap buffer header pool */ if ((bp = TAILQ_FIRST(&bswlist)) != NULL) break; bswneeded = 1; msleep(&bswneeded, &pbuf_mtx, PVM, "wswbuf1", 0); /* loop in case someone else grabbed one */ } TAILQ_REMOVE(&bswlist, bp, b_freelist); if (pfreecnt) --*pfreecnt; mtx_unlock(&pbuf_mtx); splx(s); initpbuf(bp); return bp; } /* * allocate a physical buffer, if one is available. * * Note that there is no NULL hack here - all subsystems using this * call understand how to use pfreecnt. */ struct buf * trypbuf(pfreecnt) int *pfreecnt; { int s; struct buf *bp; s = splvm(); mtx_lock(&pbuf_mtx); if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { mtx_unlock(&pbuf_mtx); splx(s); return NULL; } TAILQ_REMOVE(&bswlist, bp, b_freelist); --*pfreecnt; mtx_unlock(&pbuf_mtx); splx(s); initpbuf(bp); return bp; } /* * release a physical buffer * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. 
XXX */ void relpbuf(bp, pfreecnt) struct buf *bp; int *pfreecnt; { int s; s = splvm(); mtx_lock(&pbuf_mtx); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (bp->b_vp) pbrelvp(bp); BUF_UNLOCK(bp); TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); if (bswneeded) { bswneeded = 0; wakeup(&bswneeded); } if (pfreecnt) { if (++*pfreecnt == 1) wakeup(pfreecnt); } mtx_unlock(&pbuf_mtx); splx(s); }